author     vvvv <vvvv@ydb.tech>  2023-07-31 18:21:04 +0300
committer  vvvv <vvvv@ydb.tech>  2023-07-31 18:21:04 +0300
commit     dec41c40e51aa407edef81a3c566a5a15780fc49 (patch)
tree       4f197b596b32f35eca368121f0dff913419da9af /library/cpp
parent     3ca8b54c96e09eb2b65be7f09675623438d559c7 (diff)
download   ydb-dec41c40e51aa407edef81a3c566a5a15780fc49.tar.gz
YQL-16239 Move purecalc to public
Diffstat (limited to 'library/cpp')
-rw-r--r--  library/cpp/CMakeLists.darwin-x86_64.txt | 9
-rw-r--r--  library/cpp/CMakeLists.linux-aarch64.txt | 9
-rw-r--r--  library/cpp/CMakeLists.linux-x86_64.txt | 9
-rw-r--r--  library/cpp/CMakeLists.windows-x86_64.txt | 9
-rw-r--r--  library/cpp/containers/CMakeLists.txt | 1
-rw-r--r--  library/cpp/containers/str_hash/CMakeLists.darwin-x86_64.txt | 19
-rw-r--r--  library/cpp/containers/str_hash/CMakeLists.linux-aarch64.txt | 20
-rw-r--r--  library/cpp/containers/str_hash/CMakeLists.linux-x86_64.txt | 20
-rw-r--r--  library/cpp/containers/str_hash/CMakeLists.txt | 17
-rw-r--r--  library/cpp/containers/str_hash/CMakeLists.windows-x86_64.txt | 19
-rw-r--r--  library/cpp/containers/str_hash/str_hash.cpp | 60
-rw-r--r--  library/cpp/containers/str_hash/str_hash.h | 181
-rw-r--r--  library/cpp/containers/str_hash/ya.make | 12
-rw-r--r--  library/cpp/deprecated/CMakeLists.txt | 4
-rw-r--r--  library/cpp/deprecated/autoarray/CMakeLists.darwin-x86_64.txt | 17
-rw-r--r--  library/cpp/deprecated/autoarray/CMakeLists.linux-aarch64.txt | 18
-rw-r--r--  library/cpp/deprecated/autoarray/CMakeLists.linux-x86_64.txt | 18
-rw-r--r--  library/cpp/deprecated/autoarray/CMakeLists.txt | 17
-rw-r--r--  library/cpp/deprecated/autoarray/CMakeLists.windows-x86_64.txt | 17
-rw-r--r--  library/cpp/deprecated/autoarray/README.md | 3
-rw-r--r--  library/cpp/deprecated/autoarray/autoarray.cpp | 1
-rw-r--r--  library/cpp/deprecated/autoarray/autoarray.h | 264
-rw-r--r--  library/cpp/deprecated/autoarray/ya.make | 7
-rw-r--r--  library/cpp/deprecated/datafile/CMakeLists.darwin-x86_64.txt | 19
-rw-r--r--  library/cpp/deprecated/datafile/CMakeLists.linux-aarch64.txt | 20
-rw-r--r--  library/cpp/deprecated/datafile/CMakeLists.linux-x86_64.txt | 20
-rw-r--r--  library/cpp/deprecated/datafile/CMakeLists.txt | 17
-rw-r--r--  library/cpp/deprecated/datafile/CMakeLists.windows-x86_64.txt | 19
-rw-r--r--  library/cpp/deprecated/datafile/README.md | 3
-rw-r--r--  library/cpp/deprecated/datafile/datafile.cpp | 42
-rw-r--r--  library/cpp/deprecated/datafile/datafile.h | 88
-rw-r--r--  library/cpp/deprecated/datafile/loadmode.cpp | 1
-rw-r--r--  library/cpp/deprecated/datafile/loadmode.h | 20
-rw-r--r--  library/cpp/deprecated/datafile/ya.make | 12
-rw-r--r--  library/cpp/deprecated/fgood/CMakeLists.darwin-x86_64.txt | 18
-rw-r--r--  library/cpp/deprecated/fgood/CMakeLists.linux-aarch64.txt | 19
-rw-r--r--  library/cpp/deprecated/fgood/CMakeLists.linux-x86_64.txt | 19
-rw-r--r--  library/cpp/deprecated/fgood/CMakeLists.txt | 17
-rw-r--r--  library/cpp/deprecated/fgood/CMakeLists.windows-x86_64.txt | 18
-rw-r--r--  library/cpp/deprecated/fgood/README.md | 15
-rw-r--r--  library/cpp/deprecated/fgood/ffb.cpp | 407
-rw-r--r--  library/cpp/deprecated/fgood/ffb.h | 264
-rw-r--r--  library/cpp/deprecated/fgood/fgood.cpp | 70
-rw-r--r--  library/cpp/deprecated/fgood/fgood.h | 328
-rw-r--r--  library/cpp/deprecated/fgood/fput.h | 79
-rw-r--r--  library/cpp/deprecated/fgood/ya.make | 8
-rw-r--r--  library/cpp/deprecated/mapped_file/CMakeLists.darwin-x86_64.txt | 17
-rw-r--r--  library/cpp/deprecated/mapped_file/CMakeLists.linux-aarch64.txt | 18
-rw-r--r--  library/cpp/deprecated/mapped_file/CMakeLists.linux-x86_64.txt | 18
-rw-r--r--  library/cpp/deprecated/mapped_file/CMakeLists.txt | 17
-rw-r--r--  library/cpp/deprecated/mapped_file/CMakeLists.windows-x86_64.txt | 17
-rw-r--r--  library/cpp/deprecated/mapped_file/mapped_file.cpp | 64
-rw-r--r--  library/cpp/deprecated/mapped_file/ya.make | 7
-rw-r--r--  library/cpp/geo/CMakeLists.darwin-x86_64.txt | 24
-rw-r--r--  library/cpp/geo/CMakeLists.linux-aarch64.txt | 25
-rw-r--r--  library/cpp/geo/CMakeLists.linux-x86_64.txt | 25
-rw-r--r--  library/cpp/geo/CMakeLists.txt | 17
-rw-r--r--  library/cpp/geo/CMakeLists.windows-x86_64.txt | 24
-rw-r--r--  library/cpp/geo/bbox.cpp | 1
-rw-r--r--  library/cpp/geo/bbox.h | 59
-rw-r--r--  library/cpp/geo/geo.cpp | 1
-rw-r--r--  library/cpp/geo/geo.h | 8
-rw-r--r--  library/cpp/geo/load_save_helper.cpp | 49
-rw-r--r--  library/cpp/geo/load_save_helper.h | 23
-rw-r--r--  library/cpp/geo/point.cpp | 146
-rw-r--r--  library/cpp/geo/point.h | 198
-rw-r--r--  library/cpp/geo/polygon.cpp | 28
-rw-r--r--  library/cpp/geo/polygon.h | 90
-rw-r--r--  library/cpp/geo/size.cpp | 31
-rw-r--r--  library/cpp/geo/size.h | 93
-rw-r--r--  library/cpp/geo/style/ya.make | 8
-rw-r--r--  library/cpp/geo/ut/load_save_helper_ut.cpp | 90
-rw-r--r--  library/cpp/geo/ut/point_ut.cpp | 171
-rw-r--r--  library/cpp/geo/ut/polygon_ut.cpp | 34
-rw-r--r--  library/cpp/geo/ut/size_ut.cpp | 29
-rw-r--r--  library/cpp/geo/ut/util_ut.cpp | 36
-rw-r--r--  library/cpp/geo/ut/window_ut.cpp | 547
-rw-r--r--  library/cpp/geo/ut/ya.make | 12
-rw-r--r--  library/cpp/geo/util.cpp | 34
-rw-r--r--  library/cpp/geo/util.h | 107
-rw-r--r--  library/cpp/geo/window.cpp | 297
-rw-r--r--  library/cpp/geo/window.h | 264
-rw-r--r--  library/cpp/geo/ya.make | 19
-rw-r--r--  library/cpp/geobase/CMakeLists.darwin-x86_64.txt | 30
-rw-r--r--  library/cpp/geobase/CMakeLists.linux-aarch64.txt | 31
-rw-r--r--  library/cpp/geobase/CMakeLists.linux-x86_64.txt | 31
-rw-r--r--  library/cpp/geobase/CMakeLists.txt | 17
-rw-r--r--  library/cpp/geobase/CMakeLists.windows-x86_64.txt | 30
-rw-r--r--  library/cpp/geobase/geobase.cpp | 3
-rw-r--r--  library/cpp/geobase/lookup.hpp | 44
-rw-r--r--  library/cpp/geobase/service_getter.hpp | 7
-rw-r--r--  library/cpp/geobase/timezone_getter.hpp | 9
-rw-r--r--  library/cpp/geobase/ya.make | 13
-rw-r--r--  library/cpp/geohash/CMakeLists.darwin-x86_64.txt | 32
-rw-r--r--  library/cpp/geohash/CMakeLists.linux-aarch64.txt | 33
-rw-r--r--  library/cpp/geohash/CMakeLists.linux-x86_64.txt | 33
-rw-r--r--  library/cpp/geohash/CMakeLists.txt | 17
-rw-r--r--  library/cpp/geohash/CMakeLists.windows-x86_64.txt | 32
-rw-r--r--  library/cpp/geohash/direction.h | 14
-rw-r--r--  library/cpp/geohash/geohash.cpp | 413
-rw-r--r--  library/cpp/geohash/geohash.h | 123
-rw-r--r--  library/cpp/geohash/ya.make | 13
-rw-r--r--  library/cpp/ipreg/CMakeLists.darwin-x86_64.txt | 53
-rw-r--r--  library/cpp/ipreg/CMakeLists.linux-aarch64.txt | 54
-rw-r--r--  library/cpp/ipreg/CMakeLists.linux-x86_64.txt | 54
-rw-r--r--  library/cpp/ipreg/CMakeLists.txt | 17
-rw-r--r--  library/cpp/ipreg/CMakeLists.windows-x86_64.txt | 53
-rw-r--r--  library/cpp/ipreg/address.cpp | 365
-rw-r--r--  library/cpp/ipreg/address.h | 137
-rw-r--r--  library/cpp/ipreg/checker.cpp | 47
-rw-r--r--  library/cpp/ipreg/checker.h | 37
-rw-r--r--  library/cpp/ipreg/merge.cpp | 69
-rw-r--r--  library/cpp/ipreg/merge.h | 11
-rw-r--r--  library/cpp/ipreg/range.cpp | 198
-rw-r--r--  library/cpp/ipreg/range.h | 50
-rw-r--r--  library/cpp/ipreg/reader.cpp | 82
-rw-r--r--  library/cpp/ipreg/reader.h | 57
-rw-r--r--  library/cpp/ipreg/sources.cpp | 100
-rw-r--r--  library/cpp/ipreg/sources.h | 53
-rw-r--r--  library/cpp/ipreg/split.cpp | 54
-rw-r--r--  library/cpp/ipreg/split.h | 13
-rw-r--r--  library/cpp/ipreg/stopwatch.cpp | 53
-rw-r--r--  library/cpp/ipreg/stopwatch.h | 25
-rw-r--r--  library/cpp/ipreg/util_helpers.cpp | 705
-rw-r--r--  library/cpp/ipreg/util_helpers.h | 65
-rw-r--r--  library/cpp/ipreg/writer.cpp | 91
-rw-r--r--  library/cpp/ipreg/writer.h | 62
-rw-r--r--  library/cpp/ipreg/ya.make | 26
-rw-r--r--  library/cpp/langmask/CMakeLists.txt | 9
-rw-r--r--  library/cpp/langmask/proto/CMakeLists.darwin-x86_64.txt | 43
-rw-r--r--  library/cpp/langmask/proto/CMakeLists.linux-aarch64.txt | 44
-rw-r--r--  library/cpp/langmask/proto/CMakeLists.linux-x86_64.txt | 44
-rw-r--r--  library/cpp/langmask/proto/CMakeLists.txt | 17
-rw-r--r--  library/cpp/langmask/proto/CMakeLists.windows-x86_64.txt | 43
-rw-r--r--  library/cpp/langmask/proto/langmask.proto | 6
-rw-r--r--  library/cpp/langmask/proto/ya.make | 11
-rw-r--r--  library/cpp/microbdb/CMakeLists.darwin-x86_64.txt | 56
-rw-r--r--  library/cpp/microbdb/CMakeLists.linux-aarch64.txt | 57
-rw-r--r--  library/cpp/microbdb/CMakeLists.linux-x86_64.txt | 57
-rw-r--r--  library/cpp/microbdb/CMakeLists.txt | 17
-rw-r--r--  library/cpp/microbdb/CMakeLists.windows-x86_64.txt | 56
-rw-r--r--  library/cpp/microbdb/align.h | 17
-rw-r--r--  library/cpp/microbdb/compressed.h | 520
-rw-r--r--  library/cpp/microbdb/extinfo.h | 127
-rw-r--r--  library/cpp/microbdb/file.cpp | 220
-rw-r--r--  library/cpp/microbdb/file.h | 225
-rw-r--r--  library/cpp/microbdb/hashes.h | 250
-rw-r--r--  library/cpp/microbdb/header.cpp | 91
-rw-r--r--  library/cpp/microbdb/header.h | 159
-rw-r--r--  library/cpp/microbdb/heap.h | 143
-rw-r--r--  library/cpp/microbdb/input.h | 1027
-rw-r--r--  library/cpp/microbdb/microbdb.cpp | 1
-rw-r--r--  library/cpp/microbdb/microbdb.h | 54
-rw-r--r--  library/cpp/microbdb/noextinfo.proto | 4
-rw-r--r--  library/cpp/microbdb/output.h | 1049
-rw-r--r--  library/cpp/microbdb/powersorter.h | 667
-rw-r--r--  library/cpp/microbdb/reader.h | 354
-rw-r--r--  library/cpp/microbdb/safeopen.h | 792
-rw-r--r--  library/cpp/microbdb/sorter.h | 677
-rw-r--r--  library/cpp/microbdb/sorterdef.h | 19
-rw-r--r--  library/cpp/microbdb/utility.h | 75
-rw-r--r--  library/cpp/microbdb/wrappers.h | 637
-rw-r--r--  library/cpp/microbdb/ya.make | 36
-rw-r--r--  library/cpp/on_disk/CMakeLists.txt | 1
-rw-r--r--  library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt | 18
-rw-r--r--  library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt | 19
-rw-r--r--  library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt | 19
-rw-r--r--  library/cpp/on_disk/st_hash/CMakeLists.txt | 17
-rw-r--r--  library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt | 18
-rw-r--r--  library/cpp/on_disk/st_hash/fake.cpp | 4
-rw-r--r--  library/cpp/on_disk/st_hash/save_stl.h | 84
-rw-r--r--  library/cpp/on_disk/st_hash/static_hash.h | 420
-rw-r--r--  library/cpp/on_disk/st_hash/static_hash_map.h | 59
-rw-r--r--  library/cpp/on_disk/st_hash/sthash_iterators.h | 334
-rw-r--r--  library/cpp/on_disk/st_hash/ya.make | 15
-rw-r--r--  library/cpp/regex/CMakeLists.darwin-x86_64.txt | 1
-rw-r--r--  library/cpp/regex/CMakeLists.linux-aarch64.txt | 1
-rw-r--r--  library/cpp/regex/CMakeLists.linux-x86_64.txt | 1
-rw-r--r--  library/cpp/regex/CMakeLists.windows-x86_64.txt | 1
-rw-r--r--  library/cpp/regex/glob/CMakeLists.darwin-x86_64.txt | 19
-rw-r--r--  library/cpp/regex/glob/CMakeLists.linux-aarch64.txt | 20
-rw-r--r--  library/cpp/regex/glob/CMakeLists.linux-x86_64.txt | 20
-rw-r--r--  library/cpp/regex/glob/CMakeLists.txt | 17
-rw-r--r--  library/cpp/regex/glob/CMakeLists.windows-x86_64.txt | 19
-rw-r--r--  library/cpp/regex/glob/glob.cpp | 921
-rw-r--r--  library/cpp/regex/glob/glob_compat.h | 73
-rw-r--r--  library/cpp/regex/glob/glob_iterator.cpp | 1
-rw-r--r--  library/cpp/regex/glob/glob_iterator.h | 36
-rw-r--r--  library/cpp/regex/glob/ya.make | 12
-rw-r--r--  library/cpp/reverse_geocoder/CMakeLists.txt | 11
-rw-r--r--  library/cpp/reverse_geocoder/core/CMakeLists.darwin-x86_64.txt | 35
-rw-r--r--  library/cpp/reverse_geocoder/core/CMakeLists.linux-aarch64.txt | 36
-rw-r--r--  library/cpp/reverse_geocoder/core/CMakeLists.linux-x86_64.txt | 36
-rw-r--r--  library/cpp/reverse_geocoder/core/CMakeLists.txt | 17
-rw-r--r--  library/cpp/reverse_geocoder/core/CMakeLists.windows-x86_64.txt | 35
-rw-r--r--  library/cpp/reverse_geocoder/core/area_box.cpp | 9
-rw-r--r--  library/cpp/reverse_geocoder/core/area_box.h | 34
-rw-r--r--  library/cpp/reverse_geocoder/core/bbox.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/bbox.h | 66
-rw-r--r--  library/cpp/reverse_geocoder/core/common.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/common.h | 24
-rw-r--r--  library/cpp/reverse_geocoder/core/edge.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/edge.h | 101
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/debug.cpp | 74
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/debug.h | 16
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/def.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/def.h | 35
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/geo_data.h | 24
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/map.cpp | 203
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/map.h | 89
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/proxy.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/geo_data/proxy.h | 68
-rw-r--r--  library/cpp/reverse_geocoder/core/kv.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/kv.h | 13
-rw-r--r--  library/cpp/reverse_geocoder/core/location.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/location.h | 21
-rw-r--r--  library/cpp/reverse_geocoder/core/part.cpp | 29
-rw-r--r--  library/cpp/reverse_geocoder/core/part.h | 26
-rw-r--r--  library/cpp/reverse_geocoder/core/point.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/point.h | 52
-rw-r--r--  library/cpp/reverse_geocoder/core/polygon.cpp | 91
-rw-r--r--  library/cpp/reverse_geocoder/core/polygon.h | 73
-rw-r--r--  library/cpp/reverse_geocoder/core/region.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/core/region.h | 37
-rw-r--r--  library/cpp/reverse_geocoder/core/reverse_geocoder.cpp | 182
-rw-r--r--  library/cpp/reverse_geocoder/core/reverse_geocoder.h | 73
-rw-r--r--  library/cpp/reverse_geocoder/core/ya.make | 28
-rw-r--r--  library/cpp/reverse_geocoder/library/CMakeLists.darwin-x86_64.txt | 21
-rw-r--r--  library/cpp/reverse_geocoder/library/CMakeLists.linux-aarch64.txt | 22
-rw-r--r--  library/cpp/reverse_geocoder/library/CMakeLists.linux-x86_64.txt | 22
-rw-r--r--  library/cpp/reverse_geocoder/library/CMakeLists.txt | 17
-rw-r--r--  library/cpp/reverse_geocoder/library/CMakeLists.windows-x86_64.txt | 21
-rw-r--r--  library/cpp/reverse_geocoder/library/block_allocator.cpp | 40
-rw-r--r--  library/cpp/reverse_geocoder/library/block_allocator.h | 64
-rw-r--r--  library/cpp/reverse_geocoder/library/fs.cpp | 18
-rw-r--r--  library/cpp/reverse_geocoder/library/fs.h | 19
-rw-r--r--  library/cpp/reverse_geocoder/library/log.cpp | 111
-rw-r--r--  library/cpp/reverse_geocoder/library/log.h | 65
-rw-r--r--  library/cpp/reverse_geocoder/library/memory.h | 23
-rw-r--r--  library/cpp/reverse_geocoder/library/pool_allocator.cpp | 17
-rw-r--r--  library/cpp/reverse_geocoder/library/pool_allocator.h | 42
-rw-r--r--  library/cpp/reverse_geocoder/library/system.h | 3
-rw-r--r--  library/cpp/reverse_geocoder/library/unaligned_iter.cpp | 1
-rw-r--r--  library/cpp/reverse_geocoder/library/unaligned_iter.h | 64
-rw-r--r--  library/cpp/reverse_geocoder/library/ya.make | 11
-rw-r--r--  library/cpp/reverse_geocoder/proto/CMakeLists.darwin-x86_64.txt | 56
-rw-r--r--  library/cpp/reverse_geocoder/proto/CMakeLists.linux-aarch64.txt | 57
-rw-r--r--  library/cpp/reverse_geocoder/proto/CMakeLists.linux-x86_64.txt | 57
-rw-r--r--  library/cpp/reverse_geocoder/proto/CMakeLists.txt | 17
-rw-r--r--  library/cpp/reverse_geocoder/proto/CMakeLists.windows-x86_64.txt | 56
-rw-r--r--  library/cpp/reverse_geocoder/proto/geo_data.proto | 42
-rw-r--r--  library/cpp/reverse_geocoder/proto/region.proto | 32
-rw-r--r--  library/cpp/reverse_geocoder/proto/ya.make | 10
-rw-r--r--  library/cpp/robots_txt/CMakeLists.darwin-x86_64.txt | 26
-rw-r--r--  library/cpp/robots_txt/CMakeLists.linux-aarch64.txt | 27
-rw-r--r--  library/cpp/robots_txt/CMakeLists.linux-x86_64.txt | 27
-rw-r--r--  library/cpp/robots_txt/CMakeLists.txt | 17
-rw-r--r--  library/cpp/robots_txt/CMakeLists.windows-x86_64.txt | 26
-rw-r--r--  library/cpp/robots_txt/constants.h | 9
-rw-r--r--  library/cpp/robots_txt/prefix_tree.cpp | 172
-rw-r--r--  library/cpp/robots_txt/prefix_tree.h | 47
-rw-r--r--  library/cpp/robots_txt/prefix_tree_rules_handler.cpp | 706
-rw-r--r--  library/cpp/robots_txt/robots_txt.h | 605
-rw-r--r--  library/cpp/robots_txt/robots_txt_parser.cpp | 116
-rw-r--r--  library/cpp/robots_txt/robots_txt_parser.h | 38
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg.h | 3
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/CMakeLists.darwin-x86_64.txt | 20
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-aarch64.txt | 21
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-x86_64.txt | 21
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/CMakeLists.txt | 17
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/CMakeLists.windows-x86_64.txt | 20
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp | 2
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/bot_id_set.h | 132
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp | 2
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h | 11
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/user_agents.cpp | 2
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/user_agents.h | 303
-rw-r--r--  library/cpp/robots_txt/robotstxtcfg/ya.make | 13
-rw-r--r--  library/cpp/robots_txt/rules_handler.cpp | 514
-rw-r--r--  library/cpp/robots_txt/ya.make | 18
-rw-r--r--  library/cpp/yconf/CMakeLists.darwin-x86_64.txt | 19
-rw-r--r--  library/cpp/yconf/CMakeLists.linux-aarch64.txt | 20
-rw-r--r--  library/cpp/yconf/CMakeLists.linux-x86_64.txt | 20
-rw-r--r--  library/cpp/yconf/CMakeLists.txt | 17
-rw-r--r--  library/cpp/yconf/CMakeLists.windows-x86_64.txt | 19
286 files changed, 23598 insertions, 0 deletions
diff --git a/library/cpp/CMakeLists.darwin-x86_64.txt b/library/cpp/CMakeLists.darwin-x86_64.txt
index 772027a342..5497fd21be 100644
--- a/library/cpp/CMakeLists.darwin-x86_64.txt
+++ b/library/cpp/CMakeLists.darwin-x86_64.txt
@@ -36,6 +36,9 @@ add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
+add_subdirectory(geo)
+add_subdirectory(geobase)
+add_subdirectory(geohash)
add_subdirectory(getopt)
add_subdirectory(grpc)
add_subdirectory(histogram)
@@ -44,9 +47,11 @@ add_subdirectory(http)
add_subdirectory(hyperloglog)
add_subdirectory(int128)
add_subdirectory(ipmath)
+add_subdirectory(ipreg)
add_subdirectory(ipv6_address)
add_subdirectory(iterator)
add_subdirectory(json)
+add_subdirectory(langmask)
add_subdirectory(lcs)
add_subdirectory(lfalloc)
add_subdirectory(linear_regression)
@@ -55,6 +60,7 @@ add_subdirectory(lua)
add_subdirectory(lwtrace)
add_subdirectory(malloc)
add_subdirectory(messagebus)
+add_subdirectory(microbdb)
add_subdirectory(mime)
add_subdirectory(monlib)
add_subdirectory(on_disk)
@@ -68,6 +74,8 @@ add_subdirectory(random_provider)
add_subdirectory(regex)
add_subdirectory(resource)
add_subdirectory(retry)
+add_subdirectory(reverse_geocoder)
+add_subdirectory(robots_txt)
add_subdirectory(sanitizer)
add_subdirectory(scheme)
add_subdirectory(sighandler)
@@ -90,6 +98,7 @@ add_subdirectory(unified_agent_client)
add_subdirectory(uri)
add_subdirectory(xml)
add_subdirectory(yaml)
+add_subdirectory(yconf)
add_subdirectory(yson)
add_subdirectory(yson_pull)
add_subdirectory(yt)
diff --git a/library/cpp/CMakeLists.linux-aarch64.txt b/library/cpp/CMakeLists.linux-aarch64.txt
index cd50b0e3a4..5e93629802 100644
--- a/library/cpp/CMakeLists.linux-aarch64.txt
+++ b/library/cpp/CMakeLists.linux-aarch64.txt
@@ -35,6 +35,9 @@ add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
+add_subdirectory(geo)
+add_subdirectory(geobase)
+add_subdirectory(geohash)
add_subdirectory(getopt)
add_subdirectory(grpc)
add_subdirectory(histogram)
@@ -43,9 +46,11 @@ add_subdirectory(http)
add_subdirectory(hyperloglog)
add_subdirectory(int128)
add_subdirectory(ipmath)
+add_subdirectory(ipreg)
add_subdirectory(ipv6_address)
add_subdirectory(iterator)
add_subdirectory(json)
+add_subdirectory(langmask)
add_subdirectory(lcs)
add_subdirectory(lfalloc)
add_subdirectory(linear_regression)
@@ -54,6 +59,7 @@ add_subdirectory(lua)
add_subdirectory(lwtrace)
add_subdirectory(malloc)
add_subdirectory(messagebus)
+add_subdirectory(microbdb)
add_subdirectory(mime)
add_subdirectory(monlib)
add_subdirectory(on_disk)
@@ -67,6 +73,8 @@ add_subdirectory(random_provider)
add_subdirectory(regex)
add_subdirectory(resource)
add_subdirectory(retry)
+add_subdirectory(reverse_geocoder)
+add_subdirectory(robots_txt)
add_subdirectory(sanitizer)
add_subdirectory(scheme)
add_subdirectory(sighandler)
@@ -89,6 +97,7 @@ add_subdirectory(unified_agent_client)
add_subdirectory(uri)
add_subdirectory(xml)
add_subdirectory(yaml)
+add_subdirectory(yconf)
add_subdirectory(yson)
add_subdirectory(yson_pull)
add_subdirectory(yt)
diff --git a/library/cpp/CMakeLists.linux-x86_64.txt b/library/cpp/CMakeLists.linux-x86_64.txt
index 772027a342..5497fd21be 100644
--- a/library/cpp/CMakeLists.linux-x86_64.txt
+++ b/library/cpp/CMakeLists.linux-x86_64.txt
@@ -36,6 +36,9 @@ add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
+add_subdirectory(geo)
+add_subdirectory(geobase)
+add_subdirectory(geohash)
add_subdirectory(getopt)
add_subdirectory(grpc)
add_subdirectory(histogram)
@@ -44,9 +47,11 @@ add_subdirectory(http)
add_subdirectory(hyperloglog)
add_subdirectory(int128)
add_subdirectory(ipmath)
+add_subdirectory(ipreg)
add_subdirectory(ipv6_address)
add_subdirectory(iterator)
add_subdirectory(json)
+add_subdirectory(langmask)
add_subdirectory(lcs)
add_subdirectory(lfalloc)
add_subdirectory(linear_regression)
@@ -55,6 +60,7 @@ add_subdirectory(lua)
add_subdirectory(lwtrace)
add_subdirectory(malloc)
add_subdirectory(messagebus)
+add_subdirectory(microbdb)
add_subdirectory(mime)
add_subdirectory(monlib)
add_subdirectory(on_disk)
@@ -68,6 +74,8 @@ add_subdirectory(random_provider)
add_subdirectory(regex)
add_subdirectory(resource)
add_subdirectory(retry)
+add_subdirectory(reverse_geocoder)
+add_subdirectory(robots_txt)
add_subdirectory(sanitizer)
add_subdirectory(scheme)
add_subdirectory(sighandler)
@@ -90,6 +98,7 @@ add_subdirectory(unified_agent_client)
add_subdirectory(uri)
add_subdirectory(xml)
add_subdirectory(yaml)
+add_subdirectory(yconf)
add_subdirectory(yson)
add_subdirectory(yson_pull)
add_subdirectory(yt)
diff --git a/library/cpp/CMakeLists.windows-x86_64.txt b/library/cpp/CMakeLists.windows-x86_64.txt
index 772027a342..5497fd21be 100644
--- a/library/cpp/CMakeLists.windows-x86_64.txt
+++ b/library/cpp/CMakeLists.windows-x86_64.txt
@@ -36,6 +36,9 @@ add_subdirectory(disjoint_sets)
add_subdirectory(dns)
add_subdirectory(enumbitset)
add_subdirectory(execprofile)
+add_subdirectory(geo)
+add_subdirectory(geobase)
+add_subdirectory(geohash)
add_subdirectory(getopt)
add_subdirectory(grpc)
add_subdirectory(histogram)
@@ -44,9 +47,11 @@ add_subdirectory(http)
add_subdirectory(hyperloglog)
add_subdirectory(int128)
add_subdirectory(ipmath)
+add_subdirectory(ipreg)
add_subdirectory(ipv6_address)
add_subdirectory(iterator)
add_subdirectory(json)
+add_subdirectory(langmask)
add_subdirectory(lcs)
add_subdirectory(lfalloc)
add_subdirectory(linear_regression)
@@ -55,6 +60,7 @@ add_subdirectory(lua)
add_subdirectory(lwtrace)
add_subdirectory(malloc)
add_subdirectory(messagebus)
+add_subdirectory(microbdb)
add_subdirectory(mime)
add_subdirectory(monlib)
add_subdirectory(on_disk)
@@ -68,6 +74,8 @@ add_subdirectory(random_provider)
add_subdirectory(regex)
add_subdirectory(resource)
add_subdirectory(retry)
+add_subdirectory(reverse_geocoder)
+add_subdirectory(robots_txt)
add_subdirectory(sanitizer)
add_subdirectory(scheme)
add_subdirectory(sighandler)
@@ -90,6 +98,7 @@ add_subdirectory(unified_agent_client)
add_subdirectory(uri)
add_subdirectory(xml)
add_subdirectory(yaml)
+add_subdirectory(yconf)
add_subdirectory(yson)
add_subdirectory(yson_pull)
add_subdirectory(yt)
diff --git a/library/cpp/containers/CMakeLists.txt b/library/cpp/containers/CMakeLists.txt
index 43fcbe8346..40f5013867 100644
--- a/library/cpp/containers/CMakeLists.txt
+++ b/library/cpp/containers/CMakeLists.txt
@@ -20,5 +20,6 @@ add_subdirectory(ring_buffer)
add_subdirectory(sorted_vector)
add_subdirectory(stack_array)
add_subdirectory(stack_vector)
+add_subdirectory(str_hash)
add_subdirectory(str_map)
add_subdirectory(top_keeper)
diff --git a/library/cpp/containers/str_hash/CMakeLists.darwin-x86_64.txt b/library/cpp/containers/str_hash/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..627814f0ed
--- /dev/null
+++ b/library/cpp/containers/str_hash/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-containers-str_hash)
+target_link_libraries(cpp-containers-str_hash PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ cpp-containers-str_map
+)
+target_sources(cpp-containers-str_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp
+)
diff --git a/library/cpp/containers/str_hash/CMakeLists.linux-aarch64.txt b/library/cpp/containers/str_hash/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..cd723cbea2
--- /dev/null
+++ b/library/cpp/containers/str_hash/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-containers-str_hash)
+target_link_libraries(cpp-containers-str_hash PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ cpp-containers-str_map
+)
+target_sources(cpp-containers-str_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp
+)
diff --git a/library/cpp/containers/str_hash/CMakeLists.linux-x86_64.txt b/library/cpp/containers/str_hash/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..cd723cbea2
--- /dev/null
+++ b/library/cpp/containers/str_hash/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-containers-str_hash)
+target_link_libraries(cpp-containers-str_hash PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ cpp-containers-str_map
+)
+target_sources(cpp-containers-str_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp
+)
diff --git a/library/cpp/containers/str_hash/CMakeLists.txt b/library/cpp/containers/str_hash/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/containers/str_hash/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/containers/str_hash/CMakeLists.windows-x86_64.txt b/library/cpp/containers/str_hash/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..627814f0ed
--- /dev/null
+++ b/library/cpp/containers/str_hash/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-containers-str_hash)
+target_link_libraries(cpp-containers-str_hash PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ cpp-containers-str_map
+)
+target_sources(cpp-containers-str_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/containers/str_hash/str_hash.cpp
+)
diff --git a/library/cpp/containers/str_hash/str_hash.cpp b/library/cpp/containers/str_hash/str_hash.cpp
new file mode 100644
index 0000000000..1298638533
--- /dev/null
+++ b/library/cpp/containers/str_hash/str_hash.cpp
@@ -0,0 +1,60 @@
+#include "str_hash.h"
+
+#include <library/cpp/charset/ci_string.h>
+#include <util/stream/output.h>
+#include <util/stream/input.h>
+
+HashSet::HashSet(const char** array, size_type size) {
+ Resize(size);
+ while (*array && **array)
+ AddPermanent(*array++);
+}
+
+void HashSet::Read(IInputStream* input) {
+ TString s;
+
+ while (input->ReadLine(s)) {
+ AddUniq(TCiString(s).c_str());
+ }
+}
+
+void HashSet::Write(IOutputStream* output) const {
+ for (const auto& it : *this) {
+ *output << it.first << "\n";
+ }
+}
+
+#ifdef TEST_STRHASH
+#include <ctime>
+#include <fstream>
+#include <cstdio>
+#include <cstdlib>
+
+using namespace std;
+
+int main(int argc, char* argv[]) {
+ if (argc < 2) {
+ printf("usage: stoplist <stop-words file ...\n");
+ exit(EXIT_FAILURE); // FreeBSD: EX_USAGE
+ }
+ Hash hash;
+ hash.Read(cin);
+ for (--argc, ++argv; argc > 0; --argc, ++argv) {
+ ifstream input(argv[0]);
+ if (!input.good()) {
+ perror(argv[0]);
+ continue;
+ }
+ TCiString s;
+ while (input >> s) {
+ if (!hash.Has(s))
+ cout << s << "\n";
+ else
+ cout << "[[" << s << "]]"
+ << "\n";
+ }
+ }
+ return EXIT_SUCCESS; // EX_OK
+}
+
+#endif
diff --git a/library/cpp/containers/str_hash/str_hash.h b/library/cpp/containers/str_hash/str_hash.h
new file mode 100644
index 0000000000..25f960dbb5
--- /dev/null
+++ b/library/cpp/containers/str_hash/str_hash.h
@@ -0,0 +1,181 @@
+#pragma once
+
+#include <library/cpp/containers/str_map/str_map.h>
+#include <library/cpp/charset/ci_string.h>
+#include <util/system/yassert.h>
+#include <util/memory/tempbuf.h>
+
+#include <memory>
+
+class IInputStream;
+class IOutputStream;
+
+template <class T, class Alloc = std::allocator<const char*>>
+class Hash;
+
+struct yvoid {
+ yvoid() = default;
+};
+
+template <typename T, class Alloc>
+class Hash: public string_hash<T, ci_hash, ci_equal_to, Alloc> {
+ using ci_string_hash = string_hash<T, ci_hash, ci_equal_to, Alloc>;
+
+protected:
+ using ci_string_hash::pool;
+
+public:
+ using size_type = typename ci_string_hash::size_type;
+ using const_iterator = typename ci_string_hash::const_iterator;
+ using iterator = typename ci_string_hash::iterator;
+ using value_type = typename ci_string_hash::value_type;
+ using ci_string_hash::begin;
+ using ci_string_hash::end;
+ using ci_string_hash::find;
+ using ci_string_hash::size;
+
+ Hash()
+ : ci_string_hash()
+ {
+ }
+ explicit Hash(size_type theSize)
+ : ci_string_hash(theSize, theSize * AVERAGEWORD_BUF)
+ {
+ }
+ Hash(const char** strings, size_type size = 0, T* = 0); // must end with NULL or "\0"
+ virtual ~Hash();
+ bool Has(const char* s, size_t len, T* pp = nullptr) const;
+ bool Has(const char* s, T* pp = nullptr) const {
+ const_iterator it;
+ if ((it = find(s)) == end())
+ return false;
+ else if (pp)
+ *pp = (*it).second;
+ return true;
+ }
+ void Add(const char* s, T data) {
+ // In fact this is the same insert_unique as in AddUniq:
+ // a _FAST_ (unchecked) version of insert() is impossible in 'hash_map'.
+
+ // You would have to use 'hash_mmap' to get that kind of effect,
+ // and even then there are checks inside that keep equal keys
+ // close to each other (see insert_equal()).
+ this->insert_copy(s, data);
+ }
+ bool AddUniq(const char* s, T data) {
+ return this->insert_copy(s, data).second;
+ }
+ // new function to get rid of allocations completely! -- e.g. in constructors
+ void AddPermanent(const char* s, T data) {
+ this->insert(value_type(s, data));
+ }
+ T Detach(const char* s) {
+ iterator it = find(s);
+ if (it == end())
+ return T();
+ T data = (*it).second;
+ this->erase(it);
+ return data;
+ }
+ size_type NumEntries() const {
+ return size();
+ }
+ bool ForEach(bool (*func)(const char* key, T data, void* cookie), void* cookie = nullptr);
+ void Resize(size_type theSize) {
+ this->reserve(theSize);
+ // no pool resizing here.
+ }
+ virtual void Clear();
+ char* Pool() {
+ if (pool.Size() < 2 || pool.End()[-2] != '\0')
+ pool.Append("\0", 1);
+ return pool.Begin();
+ }
+};
+
+template <class T, class Alloc>
+Hash<T, Alloc>::Hash(const char** array, size_type theSize, T* data) {
+ // must end with NULL or "\0"
+ Y_ASSERT(data != nullptr);
+ Resize(theSize);
+ while (*array && **array)
+ AddPermanent(*array++, *data++);
+}
+
+template <class T, class Alloc>
+bool Hash<T, Alloc>::Has(const char* s, size_t len, T* pp) const {
+ TTempArray<char> buf(len + 1);
+ char* const allocated = buf.Data();
+ memcpy(allocated, s, len);
+ allocated[len] = '\x00';
+ return Has(allocated, pp);
+}
+
+template <class T, class Alloc>
+Hash<T, Alloc>::~Hash() {
+ Clear();
+}
+
+template <class T, class Alloc>
+void Hash<T, Alloc>::Clear() {
+ ci_string_hash::clear_hash(); // to make the key pool empty
+}
+
+template <class T, class Alloc>
+bool Hash<T, Alloc>::ForEach(bool (*func)(const char* key, T data, void* cookie), void* cookie) {
+ for (const_iterator it = begin(); it != end(); ++it)
+ if (!func((*it).first, (*it).second, cookie))
+ return false;
+ return true;
+}
+
+class HashSet: public Hash<yvoid> {
+public:
+ HashSet(const char** array, size_type size = 0);
+ HashSet()
+ : Hash<yvoid>()
+ {
+ }
+ void Read(IInputStream* input);
+ void Write(IOutputStream* output) const;
+ void Add(const char* s) {
+ // In fact this is the same insert_unique as in AddUniq:
+ // a _FAST_ (unchecked) version of insert() is impossible in 'hash_map'.
+
+ // You would have to use 'hash_mmap' to get that kind of effect,
+ // and even then there are checks inside that keep equal keys
+ // close to each other (see insert_equal()).
+ insert_copy(s, yvoid());
+ }
+ bool AddUniq(const char* s) {
+ return insert_copy(s, yvoid()).second;
+ }
+ // new function to get rid of allocations completely! -- e.g. in constructors
+ void AddPermanent(const char* s) {
+ insert(value_type(s, yvoid()));
+ }
+};
+
+template <class T, class HashFcn = THash<T>, class EqualKey = TEqualTo<T>, class Alloc = std::allocator<T>>
+class TStaticHash: private THashMap<T, T, HashFcn, EqualKey> {
+private:
+ using TBase = THashMap<T, T, HashFcn, EqualKey>;
+
+public:
+ TStaticHash(T arr[][2], size_t size) {
+ TBase::reserve(size);
+ while (size) {
+ TBase::insert(typename TBase::value_type(arr[0][0], arr[0][1]));
+ arr++;
+ size--;
+ }
+ }
+ T operator[](const T& key) const { // !!! it is not an lvalue, nor has it ever been
+ typename TBase::const_iterator it = TBase::find(key);
+ if (it == TBase::end())
+ return nullptr;
+ return it->second;
+ }
+};
+
+using TStHash = TStaticHash<const char*, ci_hash, ci_equal_to>;
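For orientation, a minimal usage sketch of the containers declared above (member names are taken from str_hash.h; the surrounding example program is hypothetical and not part of this commit):

    #include <library/cpp/containers/str_hash/str_hash.h>

    int main() {
        // HashSet: a set of C-string keys, case-insensitive per the ci_hash/ci_equal_to traits.
        HashSet stopWords;
        stopWords.AddUniq("the");
        stopWords.AddUniq("and");

        // Hash<T>: maps C-string keys to values of type T; keys are copied via insert_copy.
        Hash<int> wordCount;
        wordCount.AddUniq("foo", 42);

        int value = 0;
        if (wordCount.Has("FOO", &value)) { // case-insensitive lookup fills 'value'
            // value == 42 here
        }
        return stopWords.Has("THE") ? 0 : 1;
    }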
diff --git a/library/cpp/containers/str_hash/ya.make b/library/cpp/containers/str_hash/ya.make
new file mode 100644
index 0000000000..f7e24316b9
--- /dev/null
+++ b/library/cpp/containers/str_hash/ya.make
@@ -0,0 +1,12 @@
+LIBRARY()
+
+PEERDIR(
+ library/cpp/charset
+ library/cpp/containers/str_map
+)
+
+SRCS(
+ str_hash.cpp
+)
+
+END()
diff --git a/library/cpp/deprecated/CMakeLists.txt b/library/cpp/deprecated/CMakeLists.txt
index ad818e3662..765ea6aad7 100644
--- a/library/cpp/deprecated/CMakeLists.txt
+++ b/library/cpp/deprecated/CMakeLists.txt
@@ -8,6 +8,10 @@
add_subdirectory(accessors)
add_subdirectory(atomic)
+add_subdirectory(autoarray)
+add_subdirectory(datafile)
add_subdirectory(enum_codegen)
+add_subdirectory(fgood)
add_subdirectory(kmp)
+add_subdirectory(mapped_file)
add_subdirectory(split)
diff --git a/library/cpp/deprecated/autoarray/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/autoarray/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..f2a246218c
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-autoarray)
+target_link_libraries(cpp-deprecated-autoarray PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-autoarray PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp
+)
diff --git a/library/cpp/deprecated/autoarray/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/autoarray/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..2411a48cd3
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-autoarray)
+target_link_libraries(cpp-deprecated-autoarray PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-autoarray PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp
+)
diff --git a/library/cpp/deprecated/autoarray/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/autoarray/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..2411a48cd3
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-autoarray)
+target_link_libraries(cpp-deprecated-autoarray PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-autoarray PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp
+)
diff --git a/library/cpp/deprecated/autoarray/CMakeLists.txt b/library/cpp/deprecated/autoarray/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/deprecated/autoarray/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/autoarray/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..f2a246218c
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-autoarray)
+target_link_libraries(cpp-deprecated-autoarray PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-autoarray PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/autoarray/autoarray.cpp
+)
diff --git a/library/cpp/deprecated/autoarray/README.md b/library/cpp/deprecated/autoarray/README.md
new file mode 100644
index 0000000000..1d83147cee
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/README.md
@@ -0,0 +1,3 @@
+Pre-C++11 vector-like container.
+
+Just use std::vector. If you need to fill your vector with custom-constructed data, use reserve+emplace_back (but make sure that your elements are movable).
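The README's advice can be illustrated with a short sketch (TWidget is a hypothetical element type, not part of this change): reserve() plus emplace_back() reproduces autoarray's single-allocation, custom-constructed fill using a plain std::vector.

    #include <cstddef>
    #include <vector>

    struct TWidget {
        explicit TWidget(std::size_t index)
            : Index(index)
        {
        }
        TWidget(TWidget&&) = default; // elements must be movable, as the README warns
        std::size_t Index;
    };

    std::vector<TWidget> MakeWidgets(std::size_t count) {
        std::vector<TWidget> result;
        result.reserve(count);      // one allocation up front, like autoarray
        for (std::size_t i = 0; i < count; ++i) {
            result.emplace_back(i); // construct in place; no default constructor required
        }
        return result;
    }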
diff --git a/library/cpp/deprecated/autoarray/autoarray.cpp b/library/cpp/deprecated/autoarray/autoarray.cpp
new file mode 100644
index 0000000000..15167f27f6
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/autoarray.cpp
@@ -0,0 +1 @@
+#include "autoarray.h"
diff --git a/library/cpp/deprecated/autoarray/autoarray.h b/library/cpp/deprecated/autoarray/autoarray.h
new file mode 100644
index 0000000000..2aa12c5916
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/autoarray.h
@@ -0,0 +1,264 @@
+#pragma once
+
+#include <util/system/compat.h>
+#include <util/system/yassert.h>
+#include <util/system/defaults.h>
+#include <util/system/sys_alloc.h>
+
+#include <util/generic/typetraits.h>
+#include <utility>
+
+#include <new>
+#include <util/generic/noncopyable.h>
+
+struct autoarray_getindex {
+ autoarray_getindex() = default;
+};
+
+struct aarr_b0 {
+ aarr_b0() = default;
+};
+
+struct aarr_nofill {
+ aarr_nofill() = default;
+};
+
+template <typename T>
+struct ynd_type_traits {
+ enum {
+ empty_destructor = TTypeTraits<T>::IsPod,
+ };
+};
+
+template <class T>
+class autoarray : TNonCopyable {
+protected:
+ T* arr;
+ size_t _size;
+
+private:
+ void AllocBuf(size_t siz) {
+ arr = nullptr;
+ _size = 0;
+ if (siz) {
+ arr = (T*)y_allocate(sizeof(T) * siz);
+ _size = siz;
+ }
+ }
+
+public:
+ using value_type = T;
+ using iterator = T*;
+ using const_iterator = const T*;
+
+ autoarray()
+ : arr(nullptr)
+ , _size(0)
+ {
+ }
+ autoarray(size_t siz) {
+ AllocBuf(siz);
+ T* curr = arr;
+ try {
+ for (T* end = arr + _size; curr != end; ++curr)
+ new (curr) T();
+ } catch (...) {
+ for (--curr; curr >= arr; --curr)
+ curr->~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+ template <class A>
+ explicit autoarray(size_t siz, A& fill) {
+ AllocBuf(siz);
+ T* curr = arr;
+ try {
+ for (T* end = arr + _size; curr != end; ++curr)
+ new (curr) T(fill);
+ } catch (...) {
+ for (--curr; curr >= arr; --curr)
+ curr->~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+ explicit autoarray(size_t siz, autoarray_getindex) {
+ AllocBuf(siz);
+ size_t nCurrent = 0;
+ try {
+ for (nCurrent = 0; nCurrent < _size; ++nCurrent)
+ new (&arr[nCurrent]) T(nCurrent);
+ } catch (...) {
+ for (size_t n = 0; n < nCurrent; ++n)
+ arr[n].~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+ explicit autoarray(size_t siz, aarr_b0) {
+ AllocBuf(siz);
+ memset(arr, 0, _size * sizeof(T));
+ }
+ explicit autoarray(size_t siz, aarr_nofill) {
+ AllocBuf(siz);
+ }
+ template <class A>
+ explicit autoarray(const A* fill, size_t siz) {
+ AllocBuf(siz);
+ size_t nCurrent = 0;
+ try {
+ for (nCurrent = 0; nCurrent < _size; ++nCurrent)
+ new (&arr[nCurrent]) T(fill[nCurrent]);
+ } catch (...) {
+ for (size_t n = 0; n < nCurrent; ++n)
+ arr[n].~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+ template <class A, class B>
+ explicit autoarray(const A* fill, const B* cfill, size_t siz) {
+ AllocBuf(siz);
+ size_t nCurrent = 0;
+ try {
+ for (nCurrent = 0; nCurrent < _size; ++nCurrent)
+ new (&arr[nCurrent]) T(fill[nCurrent], cfill);
+ } catch (...) {
+ for (size_t n = 0; n < nCurrent; ++n)
+ arr[n].~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+ template <class A>
+ explicit autoarray(const A* fill, size_t initsiz, size_t fullsiz) {
+ AllocBuf(fullsiz);
+ size_t nCurrent = 0;
+ try {
+ for (nCurrent = 0; nCurrent < ((initsiz < _size) ? initsiz : _size); ++nCurrent)
+ new (&arr[nCurrent]) T(fill[nCurrent]);
+ for (; nCurrent < _size; ++nCurrent)
+ new (&arr[nCurrent]) T();
+ } catch (...) {
+ for (size_t n = 0; n < nCurrent; ++n)
+ arr[n].~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+ template <class A>
+ explicit autoarray(const A* fill, size_t initsiz, size_t fullsiz, const T& dummy) {
+ AllocBuf(fullsiz);
+ size_t nCurrent = 0;
+ try {
+ for (nCurrent = 0; nCurrent < ((initsiz < _size) ? initsiz : _size); ++nCurrent)
+ new (&arr[nCurrent]) T(fill[nCurrent]);
+ for (; nCurrent < _size; ++nCurrent)
+ new (&arr[nCurrent]) T(dummy);
+ } catch (...) {
+ for (size_t n = 0; n < nCurrent; ++n)
+ arr[n].~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+
+ template <class... R>
+ explicit autoarray(size_t siz, R&&... fill) {
+ AllocBuf(siz);
+ T* curr = arr;
+ try {
+ for (T* end = arr + _size; curr != end; ++curr)
+ new (curr) T(std::forward<R>(fill)...);
+ } catch (...) {
+ for (--curr; curr >= arr; --curr)
+ curr->~T();
+ y_deallocate(arr);
+ throw;
+ }
+ }
+ ~autoarray() {
+ if (_size) {
+ if (!ynd_type_traits<T>::empty_destructor)
+ for (T *curr = arr, *end = arr + _size; curr != end; ++curr)
+ curr->~T();
+ y_deallocate(arr);
+ }
+ }
+ T& operator[](size_t pos) {
+ Y_ASSERT(pos < _size);
+ return arr[pos];
+ }
+ const T& operator[](size_t pos) const {
+ Y_ASSERT(pos < _size);
+ return arr[pos];
+ }
+ size_t size() const {
+ return _size;
+ }
+ void swap(autoarray& with) {
+ T* tmp_arr = arr;
+ size_t tmp_size = _size;
+ arr = with.arr;
+ _size = with._size;
+ with.arr = tmp_arr;
+ with._size = tmp_size;
+ }
+ void resize(size_t siz) {
+ autoarray<T> tmp(arr, _size, siz);
+ swap(tmp);
+ }
+ void resize(size_t siz, const T& dummy) {
+ autoarray<T> tmp(arr, _size, siz, dummy);
+ swap(tmp);
+ }
+ T* rawpointer() {
+ return arr;
+ }
+ const T* operator~() const {
+ return arr;
+ }
+ T* begin() {
+ return arr;
+ }
+ T* end() {
+ return arr + _size;
+ }
+ T& back() {
+ Y_ASSERT(_size);
+ return arr[_size - 1];
+ }
+ bool empty() const {
+ return !_size;
+ }
+ bool operator!() const {
+ return !_size;
+ }
+ size_t operator+() const {
+ return _size;
+ }
+ const T* begin() const {
+ return arr;
+ }
+ const T* end() const {
+ return arr + _size;
+ }
+ const T& back() const {
+ Y_ASSERT(_size);
+ return arr[_size - 1];
+ }
+ //operator T*() { return arr; }
+};
+
+template <class T>
+inline bool operator==(const autoarray<T>& a, const autoarray<T>& b) {
+ size_t count = a.size();
+ if (count != b.size())
+ return false;
+ for (size_t i = 0; i < count; ++i) {
+ if (a[i] != b[i])
+ return false;
+ }
+ return true;
+}
diff --git a/library/cpp/deprecated/autoarray/ya.make b/library/cpp/deprecated/autoarray/ya.make
new file mode 100644
index 0000000000..4b055f8c29
--- /dev/null
+++ b/library/cpp/deprecated/autoarray/ya.make
@@ -0,0 +1,7 @@
+LIBRARY()
+
+SRCS(
+ autoarray.cpp
+)
+
+END()
diff --git a/library/cpp/deprecated/datafile/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/datafile/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..3f88f788da
--- /dev/null
+++ b/library/cpp/deprecated/datafile/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-datafile)
+target_link_libraries(cpp-deprecated-datafile PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-deprecated-datafile PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp
+)
diff --git a/library/cpp/deprecated/datafile/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/datafile/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..43da9ae45a
--- /dev/null
+++ b/library/cpp/deprecated/datafile/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-datafile)
+target_link_libraries(cpp-deprecated-datafile PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-deprecated-datafile PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp
+)
diff --git a/library/cpp/deprecated/datafile/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/datafile/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..43da9ae45a
--- /dev/null
+++ b/library/cpp/deprecated/datafile/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-datafile)
+target_link_libraries(cpp-deprecated-datafile PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-deprecated-datafile PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp
+)
diff --git a/library/cpp/deprecated/datafile/CMakeLists.txt b/library/cpp/deprecated/datafile/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/deprecated/datafile/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/deprecated/datafile/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/datafile/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..3f88f788da
--- /dev/null
+++ b/library/cpp/deprecated/datafile/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-datafile)
+target_link_libraries(cpp-deprecated-datafile PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-deprecated-datafile PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/datafile.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/datafile/loadmode.cpp
+)
diff --git a/library/cpp/deprecated/datafile/README.md b/library/cpp/deprecated/datafile/README.md
new file mode 100644
index 0000000000..7f8547108e
--- /dev/null
+++ b/library/cpp/deprecated/datafile/README.md
@@ -0,0 +1,3 @@
+A wrapper on top of some user-defined custom file format.
+
+Just write your own if you need it. It's going to be way easier than figuring out how to use this one.
diff --git a/library/cpp/deprecated/datafile/datafile.cpp b/library/cpp/deprecated/datafile/datafile.cpp
new file mode 100644
index 0000000000..ff93f11c6b
--- /dev/null
+++ b/library/cpp/deprecated/datafile/datafile.cpp
@@ -0,0 +1,42 @@
+#include "datafile.h"
+
+void TDataFileBase::DoLoad(const char* fname, int loadMode) {
+ Destroy();
+ TFile f(fname, RdOnly);
+ DoLoad(f, loadMode, nullptr, 0);
+}
+
+void TDataFileBase::DoLoad(TFile& f, int loadMode, void* hdrPtr, size_t hdrSize) {
+ if (hdrPtr) {
+ if (loadMode & DLM_EXACT_SIZE && f.GetLength() != (i64)Length)
+ throw yexception() << f.GetName() << " size does not match its header value";
+ } else {
+ Length = f.GetLength();
+ hdrSize = 0;
+ }
+ if ((loadMode & DLM_LD_TYPE_MASK) == DLM_READ) {
+ MemData = TVector<char>(Length);
+ memcpy(MemData.begin(), hdrPtr, hdrSize);
+ f.Load(MemData.begin() + hdrSize, Length - hdrSize);
+ Start = MemData.begin();
+ } else {
+ FileData.init(f);
+ if (FileData.getSize() < Length)
+ throw yexception() << f.GetName() << " is smaller than what its header value says";
+ if ((loadMode & DLM_LD_TYPE_MASK) == DLM_MMAP_PRC)
+ FileData.precharge();
+ Start = (const char*)FileData.getData();
+ }
+}
+
+void TDataFileBase::Destroy() {
+ TVector<char>().swap(MemData);
+ FileData.term();
+ Start = nullptr;
+ Length = 0;
+}
+
+void TDataFileBase::Precharge() const {
+ if (Length && Start == (char*)FileData.getData())
+ FileData.precharge();
+}
diff --git a/library/cpp/deprecated/datafile/datafile.h b/library/cpp/deprecated/datafile/datafile.h
new file mode 100644
index 0000000000..a438baceca
--- /dev/null
+++ b/library/cpp/deprecated/datafile/datafile.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include "loadmode.h"
+
+#include <library/cpp/deprecated/mapped_file/mapped_file.h>
+
+#include <util/generic/vector.h>
+#include <util/system/file.h>
+#include <util/system/filemap.h>
+
+/** Simple helper that allows a file to be either mapped or read into malloc'ed memory.
+ This behaviour is controlled by the EDataLoadMode enum defined in loadmode.h.
+ Unlike TBlob, it provides a Precharge() function and a simple file-size-based integrity check.
+
+ To use this code, inherit your class from TDataFile<TFileHeader>.
+ TFileHeader must be a POD-type structure with the byte layout of the file header.
+ The file must start with that header.
+ TFileHeader must have a FileSize() member function that determines the expected file size, i.e.
+ the length of data that needs to be read from the beginning of the file.
+ */
+
+class TDataFileBase {
+protected:
+ TVector<char> MemData;
+ TMappedFile FileData;
+
+ const char* Start;
+ size_t Length;
+
+ TDataFileBase()
+ : Start(nullptr)
+ , Length(0)
+ {
+ }
+
+ void DoLoad(TFile& f, int loadMode, void* hdrPtr, size_t hdrSize);
+ void DoLoad(const char* fname, int loadMode); // just whole file
+ void Destroy();
+ void swap(TDataFileBase& with) {
+ MemData.swap(with.MemData);
+ FileData.swap(with.FileData);
+ DoSwap(Start, with.Start);
+ DoSwap(Length, with.Length);
+ }
+
+public:
+ void Precharge() const;
+};
+
+template <class TFileHeader>
+class TDataFile: public TDataFileBase {
+protected:
+ void Load(const char* fname, EDataLoadMode loadMode) {
+ Destroy();
+ TFile f(fname, RdOnly | Seq);
+ TFileHeader hdr;
+ f.Load(&hdr, sizeof(hdr));
+ Length = hdr.FileSize();
+ DoLoad(f, (int)loadMode, &hdr, sizeof(hdr));
+ }
+ const TFileHeader& Hdr() const {
+ return *(TFileHeader*)Start;
+ }
+};
+
+// Use: class TFoo: public TDataFileEx<Foo> {...};
+// Additional requirement: TFileHeader must have a Validate(fname) function that throws an exception.
+// The TUser class itself must have an Init(fname) function.
+// Adds a Load() function to your class (TUser).
+template <class TUser, class TFileHeader>
+class TDataFileEx: public TDataFile<TFileHeader> {
+private:
+ using TBase = TDataFile<TFileHeader>;
+ TUser& User() const {
+ return *(TUser*)this;
+ }
+
+public:
+ TDataFileEx(const char* fname, EDataLoadMode loadMode = DLM_DEFAULT) {
+ if (fname)
+ Load(fname, loadMode);
+ }
+ void Load(const char* fname, EDataLoadMode loadMode = DLM_DEFAULT) {
+ TBase::Load(fname, loadMode);
+ TBase::Hdr().Validate(fname);
+ User().Init(fname);
+ }
+};
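A minimal usage sketch (not part of the commit): THdr and TMyData are hypothetical names, and the header layout is assumed purely for illustration. It only relies on the members shown above (protected Load(), Start, Hdr()).

    // Hypothetical fixed-layout header; FileSize() is the member TDataFile requires.
    struct THdr {
        ui32 Magic;
        ui32 TotalSize;                       // full file size, including this header
        size_t FileSize() const { return TotalSize; }
        void Validate(const char*) const {}   // only needed for TDataFileEx; throw here on bad Magic
    };

    class TMyData: public TDataFile<THdr> {
    public:
        explicit TMyData(const char* fname) {
            Load(fname, DLM_DEFAULT);         // mmap with precharge + exact-size check
        }
        const char* Payload() const {
            return Start + sizeof(THdr);      // data that follows the header
        }
    };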
diff --git a/library/cpp/deprecated/datafile/loadmode.cpp b/library/cpp/deprecated/datafile/loadmode.cpp
new file mode 100644
index 0000000000..a857830326
--- /dev/null
+++ b/library/cpp/deprecated/datafile/loadmode.cpp
@@ -0,0 +1 @@
+#include "loadmode.h"
diff --git a/library/cpp/deprecated/datafile/loadmode.h b/library/cpp/deprecated/datafile/loadmode.h
new file mode 100644
index 0000000000..f04054dd64
--- /dev/null
+++ b/library/cpp/deprecated/datafile/loadmode.h
@@ -0,0 +1,20 @@
+#pragma once
+
+// It is recommended to support all reasonable value combinations via this enum,
+// so that the Load() function argument can be of type EDataLoadMode rather than plain int
+
+enum EDataLoadMode {
+ DLM_READ = 0,
+ DLM_MMAP_PRC = 1, // precharge
+ DLM_MMAP = 2, // w/o precharge
+ DLM_MMAP_AUTO_PRC = 3, // precharge automatically (same as DLM_MMAP unless specifically supported)
+ DLM_LD_TYPE_MASK = 15,
+    DLM_EXACT_SIZE = 16, // fail unless the input file size exactly matches the header value
+
+ DLM_READ_ESZ = DLM_READ | DLM_EXACT_SIZE,
+ DLM_MMAP_PRC_ESZ = DLM_MMAP_PRC | DLM_EXACT_SIZE,
+ DLM_MMAP_ESZ = DLM_MMAP | DLM_EXACT_SIZE,
+ DLM_MMAP_APRC_ESZ = DLM_MMAP_AUTO_PRC | DLM_EXACT_SIZE,
+
+ DLM_DEFAULT = DLM_MMAP_PRC_ESZ,
+};
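For reference, a small sketch (not part of the commit) of how DoLoad() in datafile.cpp interprets a combined mode value such as DLM_MMAP_ESZ:

    int loadMode = DLM_MMAP_ESZ;                                            // == DLM_MMAP | DLM_EXACT_SIZE
    bool readIntoMemory = (loadMode & DLM_LD_TYPE_MASK) == DLM_READ;        // false: the mmap path is taken
    bool precharge      = (loadMode & DLM_LD_TYPE_MASK) == DLM_MMAP_PRC;    // false: no precharge
    bool exactSizeCheck = (loadMode & DLM_EXACT_SIZE) != 0;                 // true: size must match the header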
diff --git a/library/cpp/deprecated/datafile/ya.make b/library/cpp/deprecated/datafile/ya.make
new file mode 100644
index 0000000000..1ad4fe9bc7
--- /dev/null
+++ b/library/cpp/deprecated/datafile/ya.make
@@ -0,0 +1,12 @@
+LIBRARY()
+
+SRCS(
+ datafile.cpp
+ loadmode.cpp
+)
+
+PEERDIR(
+ library/cpp/deprecated/mapped_file
+)
+
+END()
diff --git a/library/cpp/deprecated/fgood/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/fgood/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..a82750e559
--- /dev/null
+++ b/library/cpp/deprecated/fgood/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-fgood)
+target_link_libraries(cpp-deprecated-fgood PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-fgood PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp
+)
diff --git a/library/cpp/deprecated/fgood/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/fgood/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..52e29348fd
--- /dev/null
+++ b/library/cpp/deprecated/fgood/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-fgood)
+target_link_libraries(cpp-deprecated-fgood PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-fgood PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp
+)
diff --git a/library/cpp/deprecated/fgood/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/fgood/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..52e29348fd
--- /dev/null
+++ b/library/cpp/deprecated/fgood/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-fgood)
+target_link_libraries(cpp-deprecated-fgood PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-fgood PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp
+)
diff --git a/library/cpp/deprecated/fgood/CMakeLists.txt b/library/cpp/deprecated/fgood/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/deprecated/fgood/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/deprecated/fgood/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/fgood/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..a82750e559
--- /dev/null
+++ b/library/cpp/deprecated/fgood/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-fgood)
+target_link_libraries(cpp-deprecated-fgood PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-fgood PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/ffb.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/fgood/fgood.cpp
+)
diff --git a/library/cpp/deprecated/fgood/README.md b/library/cpp/deprecated/fgood/README.md
new file mode 100644
index 0000000000..4f66289657
--- /dev/null
+++ b/library/cpp/deprecated/fgood/README.md
@@ -0,0 +1,15 @@
+Some ancient wrappers on top of FILE*, and some string manipulation functions.
+
+Alternatives are as follows.
+
+For TFILEPtr. Use TIFStream or TOFStream if you need IO. For some rare use cases a TFileMap might also do.
+
+For fput/fget/getline. Use streams API.
+
+For struct ffb and struct prnstr. Just don't use them. Even if you can figure out what they do.
+
+For sf family of functions and TLineSplitter. Just use Split* from util/string/split.h
+
+For TSFReader. Use TMapTsvFile.
+
+For read_or_die family of functions. Use streams API.
diff --git a/library/cpp/deprecated/fgood/ffb.cpp b/library/cpp/deprecated/fgood/ffb.cpp
new file mode 100644
index 0000000000..aa9da861a6
--- /dev/null
+++ b/library/cpp/deprecated/fgood/ffb.cpp
@@ -0,0 +1,407 @@
+#include "ffb.h"
+
+#include <util/string/util.h> // str_spn
+#include <util/system/compat.h>
+#include <util/generic/yexception.h>
+
+#include <cstdio>
+#include <algorithm>
+
+#include <ctype.h>
+
+#ifdef _win_
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+ffb::ffb(FILE* file)
+ : TFILEPtr(file)
+{
+ if (file && !isatty(fileno(file)) && BUFSIZ < 512 * 1024)
+ setvbuf(file, nullptr, _IOFBF, 512 * 1024);
+}
+
+void ffb::operator=(FILE* f) {
+ TFILEPtr::operator=(f);
+ if (f && !isatty(fileno(f)) && BUFSIZ < 512 * 1024)
+ setvbuf(f, nullptr, _IOFBF, 512 * 1024);
+}
+
+void ffb::open(const char* name, const char* mode) {
+ TFILEPtr::open(name, mode);
+ if (!isatty(fileno(*this)) && BUFSIZ < 512 * 1024)
+ setvbuf(*this, nullptr, _IOFBF, 512 * 1024);
+}
+
+int sf(char** fb, char* buf) { //don't want to call sf(fb, buf, 32)
+ if (!(*buf && *buf != 10)) {
+ *fb = nullptr;
+ return 0;
+ }
+ int n = 1;
+ fb[0] = buf;
+ while (*buf && *buf != 10 && n < 31) {
+ if (*buf == '\t') {
+ *buf++ = 0;
+ fb[n++] = buf;
+ continue;
+ }
+ buf++;
+ }
+ if (*buf == 10 && buf[-1] == 13)
+ buf[-1] = 0;
+ *buf = 0;
+ fb[n] = nullptr;
+ return n;
+}
+
+int sf(char** fb, char* buf, size_t fb_sz) {
+ if (!(*buf && *buf != 10)) {
+ *fb = nullptr;
+ return 0;
+ }
+ fb_sz--;
+ int n = 1;
+ fb[0] = buf;
+ while (*buf && *buf != 10 && n < (int)fb_sz) {
+ if (*buf == '\t') {
+ *buf++ = 0;
+ fb[n++] = buf;
+ continue;
+ }
+ buf++;
+ }
+ if (*buf == 10 && buf[-1] == 13)
+ buf[-1] = 0;
+ *buf = 0;
+ fb[n] = nullptr;
+ return n;
+}
+
+inline int sf_blank(char** fb, char* buf, size_t fb_sz) {
+ while (isspace((ui8)*buf))
+ buf++;
+ if (!*buf) {
+ *fb = nullptr;
+ return 0;
+ }
+ fb_sz--;
+ int n = 1;
+ fb[0] = buf;
+ while (*buf && *buf != 10 && n < (int)fb_sz) {
+ if (isspace((ui8)*buf)) {
+ *buf++ = 0;
+ while (isspace((ui8)*buf))
+ buf++;
+ if (*buf)
+ fb[n++] = buf;
+ continue;
+ }
+ buf++;
+ }
+ if (*buf == 10 && buf[-1] == 13)
+ buf[-1] = 0;
+ *buf = 0;
+ fb[n] = nullptr;
+ return n;
+}
+
+int sf(char fs, char** fb, char* buf, size_t fb_sz) {
+ if (fs == ' ')
+ return sf_blank(fb, buf, fb_sz);
+ while (*buf == fs)
+ buf++;
+ if (!(*buf && *buf != 10)) {
+ *fb = nullptr;
+ return 0;
+ }
+ fb_sz--;
+ int n = 1;
+ fb[0] = buf;
+ while (*buf && *buf != 10 && n < (int)fb_sz) {
+ if (*buf == fs) {
+ *buf++ = 0;
+ while (*buf == fs)
+ buf++;
+ fb[n++] = buf;
+ continue;
+ }
+ buf++;
+ }
+ if (*buf == 10 && buf[-1] == 13)
+ buf[-1] = 0;
+ *buf = 0;
+ fb[n] = nullptr;
+ return n;
+}
+
+int sf(const char* fs, char** fb, char* buf, size_t fb_sz) {
+ if (!(*buf && *buf != 10)) {
+ *fb = nullptr;
+ return 0;
+ }
+ int fs_len = strlen(fs);
+ fb_sz--;
+ int n = 1;
+ fb[0] = buf;
+ while (*buf && *buf != 10 && n < (int)fb_sz) {
+ if (*buf == *fs && !strncmp(buf + 1, fs + 1, fs_len - 1)) {
+ *buf = 0;
+ buf += fs_len;
+ fb[n++] = buf;
+ continue;
+ }
+ buf++;
+ }
+ if (*buf == 10 && buf[-1] == 13)
+ buf[-1] = 0;
+ *buf = 0;
+ fb[n] = nullptr;
+ return n;
+}
+
+inline bool is_end(const char* p) {
+ return !p || !p[0];
+}
+
+int sf(const char* seps, char* buf, char** fb, size_t fb_sz) {
+ if (fb_sz < 1 || is_end(buf)) {
+ *fb = nullptr;
+ return 0;
+ }
+ str_spn sseps(seps);
+ fb[0] = nullptr;
+ int n = 0;
+    // skip leading delimiters
+ buf = sseps.cbrk(buf);
+ if (is_end(buf))
+ return 0;
+ // store fields
+ while (n < (int)fb_sz) {
+ fb[n++] = buf;
+        // find delimiters
+ buf = sseps.brk(buf + 1);
+ if (is_end(buf))
+ break;
+ *buf = 0;
+ // skip delimiters
+ buf = sseps.cbrk(buf + 1);
+ if (is_end(buf))
+ break;
+ }
+ fb[n] = nullptr;
+ return n;
+}
+
+void TLineSplitter::operator()(char* p, TVector<char*>& fields) const {
+ if (!p || !*p)
+ return;
+ char* q = p;
+ while (1) {
+ p = Sep.brk(p);
+ if (q && (p - q || !SkipEmpty()))
+ fields.push_back(q);
+ q = nullptr;
+ if (!*p)
+ break;
+ if (SepStrLen == 1 || (SepStrLen > 1 && !strncmp(p + 1, SepStr + 1, SepStrLen - 1))) {
+ *p = 0;
+ p += SepStrLen;
+ q = p;
+ } else
+ p++;
+ }
+}
+
+void TLineSplitter::operator()(const char* p, TVector<std::pair<const char*, size_t>>& fields) const {
+ if (!p || !*p)
+ return;
+ const char* q = p;
+ while (1) {
+ p = Sep.brk(p);
+ if (q && (p - q || !SkipEmpty()))
+ fields.push_back(std::make_pair(q, p - q));
+ q = nullptr;
+ if (!*p)
+ break;
+ if (SepStrLen == 1 || (SepStrLen > 1 && !strncmp(p + 1, SepStr + 1, SepStrLen - 1))) {
+ p += SepStrLen;
+ q = p;
+ } else
+ p++;
+ }
+}
+
+TSFReader::TSFReader(const char* fname, char sep, i32 nfrq) // if sep == ' ', isspace-style splitting is imitated (for compat)
+ : Split(str_spn(sep == ' ' ? "\t\n\v\f\r " : TString(1, sep).data()), sep == ' ')
+ , OpenPipe(false)
+{
+ Open(fname, nfrq);
+}
+
+TSFReader::TSFReader(const char* fname, const char* sep, i32 nfrq)
+ : Split(sep, false)
+ , OpenPipe(false)
+{
+ Open(fname, nfrq);
+}
+
+TSFReader::TSFReader(const char* fname, const TLineSplitter& spl, i32 nfrq)
+ : Split(spl)
+ , OpenPipe(false)
+{
+ Open(fname, nfrq);
+}
+
+void TSFReader::Open(const char* fname, i32 nfrq, size_t vbuf_size) {
+ FieldsRequired = nfrq;
+ NF = NR = 0;
+
+ if (IsOpen())
+ File.close();
+
+ if (!fname)
+ return;
+
+ if (!strcmp(fname, "/dev/stdin")) {
+ File.assign(stdin, "/dev/stdin");
+ } else {
+ if (OpenPipe)
+ File.popen(fname, "r");
+ else
+ File.open(fname, "r");
+ }
+ OpenPipe = false;
+ if (!isatty(fileno(File)))
+ setvbuf(File, nullptr, _IOFBF, vbuf_size);
+}
+
+void TSFReader::Popen(const char* pname, i32 nfrq, size_t vbuf_size) {
+ OpenPipe = true;
+ Open(pname, nfrq, vbuf_size);
+}
+
+bool TSFReader::NextLine(segmented_string_pool* pool) {
+ size_t line_len = 0;
+
+#ifdef __FreeBSD__
+ char* ptr = fgetln(File, &line_len);
+ if (!ptr)
+ return false;
+ if (!line_len || ptr[line_len - 1] != '\n') { // last line w/o newline
+ Buf.AssignNoAlias(ptr, line_len);
+ ptr = Buf.begin();
+ } else {
+ // can safely replace newline with \0
+ ptr[line_len - 1] = 0;
+ --line_len;
+ }
+#else
+ if (!getline(File, Buf))
+ return false;
+ char* ptr = Buf.begin();
+ line_len = Buf.size();
+#endif
+ if (line_len && ptr[line_len - 1] == '\r')
+ ptr[line_len - 1] = 0;
+
+ if (pool) {
+ char* nptr = pool->append(ptr);
+ Y_ASSERT(!strcmp(ptr, nptr));
+ ptr = nptr;
+ }
+
+ ++NR;
+ Fields.clear();
+ Split(ptr, Fields);
+ NF = Fields.size();
+
+ if (FieldsRequired != -1 && FieldsRequired != (int)NF)
+ ythrow yexception() << File.name() << " line " << NR << ": " << NF << " fields, expected " << FieldsRequired;
+
+ return true;
+}
+
+int prnstr::f(const char* c, ...) {
+ va_list params;
+ int n = asize - pos, k;
+ va_start(params, c);
+ while ((k = vsnprintf(buf + pos, n, c, params)) >= n) {
+ n += asize, asize *= 2;
+ while (k + pos >= n)
+ n += asize, asize *= 2;
+ char* t = new char[asize];
+ memcpy(t, buf, pos);
+ delete[] buf;
+ buf = t;
+ va_end(params);
+ va_start(params, c);
+ }
+ pos += k;
+ va_end(params);
+ return k;
+}
+int prnstr::s(const char* c, size_t k) {
+ if (!c)
+ return 0;
+ size_t n = asize - pos;
+ if (k >= n) {
+ n += asize, asize *= 2;
+ while (k + pos >= n)
+ n += asize, asize *= 2;
+ char* t = new char[asize];
+ memcpy(t, buf, pos);
+ delete[] buf;
+ buf = t;
+ }
+ memcpy(buf + pos, c, k);
+ pos += k;
+ buf[pos] = 0;
+ return k;
+}
+void prnstr::clear() {
+ pos = 0;
+ if (asize > 32768) {
+ asize = 32768;
+ delete[] buf;
+ buf = new char[asize];
+ }
+}
+
+void prnstr::swap(prnstr& w) {
+ std::swap(buf, w.buf);
+ std::swap(pos, w.pos);
+ std::swap(asize, w.asize);
+}
+
+FILE* read_or_die(const char* fname) {
+ FILE* f = fopen(fname, "rb");
+ if (!f)
+ err(1, "%s", fname);
+ return f;
+}
+FILE* write_or_die(const char* fname) {
+ FILE* f = fopen(fname, "wb");
+ if (!f)
+ err(1, "%s", fname);
+ return f;
+}
+FILE* fopen_or_die(const char* fname, const char* mode) {
+ FILE* f = fopen(fname, mode);
+ if (!f)
+ err(1, "%s (mode '%s')", fname, mode);
+ return f;
+}
+
+FILE* fopen_chk(const char* fname, const char* mode) {
+ FILE* f = fopen(fname, mode);
+ if (!f)
+ ythrow yexception() << fname << " (mode '" << mode << "'): " << LastSystemErrorText();
+ return f;
+}
+
+void fclose_chk(FILE* f, const char* fname) {
+ if (fclose(f))
+ ythrow yexception() << "file " << fname << ": " << LastSystemErrorText();
+}
diff --git a/library/cpp/deprecated/fgood/ffb.h b/library/cpp/deprecated/fgood/ffb.h
new file mode 100644
index 0000000000..ca229eb65a
--- /dev/null
+++ b/library/cpp/deprecated/fgood/ffb.h
@@ -0,0 +1,264 @@
+#pragma once
+
+#include "fgood.h"
+
+#include <util/string/util.h> // str_spn
+#include <util/string/split.h> // str_spn
+#include <util/memory/segmented_string_pool.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/noncopyable.h>
+
+#include <utility>
+
+#include <cstdarg>
+#include <cstring>
+
+struct ffb: public TFILEPtr {
+ ffb() {
+ }
+ ffb(FILE* file);
+ ffb(const char* name, const char* mode) {
+ open(name, mode);
+ }
+ void operator=(FILE* f); // take ownership
+ void open(const char* name, const char* mode);
+ int f(const char* c, ...) {
+ va_list args;
+ va_start(args, c);
+ return vfprintf(*this, c, args);
+ }
+ void s(const char* c) {
+ fsput(c, strlen(c));
+ }
+ void b(const void* cc, int n) {
+ fsput((const char*)cc, n);
+ }
+ void B(const void* cc, int N) {
+ fsput((const char*)cc, N);
+ }
+ void c(char c) {
+ fputc(c);
+ }
+ void cbe(wchar16 c) { // big endian utf-16
+ fputc(char(c >> 8)); //Hi8
+ fputc(char(c & 255)); //Lo8
+ }
+ void sbe(const wchar16* c) {
+ for (; *c; c++)
+ cbe(*c);
+ }
+ void fclose() {
+ close();
+ }
+};
+
+// split fields of a tab-delimited line of text
+// here and below the actual size of fb must be fb_sz + 1 so that fb[fb_sz] can be set to zero
+int sf(char** fb, char* buf, size_t fb_sz);
+int sf(char** fb, char* buf /* fb_sz == 32 */);
+
+// split fields of a char-delimited line of text
+// Warning: delim = ' ' imitates awk: initial separators are skipped,
+// repeated separators are treated as one, and all whitespace characters are treated as separators.
+int sf(char fs, char** fb, char* buf, size_t fb_sz = 32);
+
+// split fields of a string-delimited line of text (fs is NOT a regexp)
+// (usually fs is "@@")
+int sf(const char* fs, char** fb, char* buf, size_t fb_sz = 32);
+
+// split fields of a char-delimited line of text; the set of separator chars is given
+// Warning: repeated separators are treated as one, initial separators are skipped,
+// newlines are NOT ignored.
+int sf(const char* seps, char* buf, char** fb, size_t fb_sz = 32);
+
+inline char* chomp(char* buf) {
+ char* c = buf + strlen(buf);
+ if (c > buf && c[-1] == '\n') {
+ *--c = 0;
+#ifdef _win32_
+ if (c > buf && c[-1] == '\r')
+ *--c = 0;
+#endif
+ }
+ return buf;
+}
+
+inline char* chomp_cr(char* buf) {
+ char* c = buf + strlen(buf);
+ if (c > buf && c[-1] == '\n')
+ *--c = 0;
+ if (c > buf && c[-1] == '\r')
+ *--c = 0;
+ return buf;
+}
+
+class TLineSplitter {
+protected:
+ enum { // Default: Split string by SepStr
+ SplitByAnySep = 1, // Split string by Sep
+ NoEmptyFields = 2 // Skip all empty fields between separators
+ };
+
+private:
+ ui32 Flags;
+ const str_spn Sep; // collection of separators
+ const char* SepStr; // pointer exact string to separate by
+ size_t SepStrLen; // length of separator string
+
+public:
+ TLineSplitter(const char* sep, bool noEmpty)
+ : Flags(noEmpty ? NoEmptyFields : 0)
+ , Sep(TString(sep, 1).data())
+ , SepStr(sep)
+ , SepStrLen(strlen(sep))
+ {
+ }
+ TLineSplitter(const str_spn& sep, bool noEmpty = false)
+ : Flags(SplitByAnySep | (noEmpty ? NoEmptyFields : 0))
+ , Sep(sep)
+ , SepStr(nullptr)
+ , SepStrLen(1)
+ {
+ }
+ bool AnySep() const {
+ return Flags & SplitByAnySep;
+ }
+ bool SkipEmpty() const {
+ return Flags & NoEmptyFields;
+ }
+ /// Separates string onto tokens
+ /// Expecting a zero-terminated string
+ /// By default returns empty fields between sequential separators
+ void operator()(char* p, TVector<char*>& fields) const;
+ /// Same, but for const string - fills vector of pairs (pointer, length)
+ void operator()(const char* p, TVector<std::pair<const char*, size_t>>& fields) const;
+};
+
+/**
+ * Use library/cpp/map_text_file/map_tsv_file.h instead.
+ */
+class TSFReader {
+    TString Buf; // buffer for a last line without '\n' and for non-FreeBSD platforms
+ TLineSplitter Split;
+ TVector<char*> Fields;
+ size_t NF; // Fields.size()
+ size_t NR;
+
+ TFILEPtr File;
+
+ bool OpenPipe; // internal flag that turns open() to popen()
+
+    i32 FieldsRequired; // if != -1 and the line's field count differs, an exception is thrown
+
+public:
+ // char separator
+    // Warning: delim = ' ' imitates awk: initial separators are skipped,
+    // and all whitespace characters are treated as separators.
+    TSFReader(const char* fname = nullptr, char sep = '\t', i32 nf_required = -1);
+    // exact string separator
+    TSFReader(const char* fname, const char* sep, i32 nf_required = -1);
+    // fully customizable
+    TSFReader(const char* fname, const TLineSplitter& spl, i32 nf_required = -1);
+
+    void Open(const char* fname, i32 nf_required = -1, size_t vbufsize = 1u << 21); // use "/dev/stdin" for stdin
+    void Popen(const char* pname, i32 nf_required = -1, size_t vbufsize = 1u << 21);
+
+ bool NextLine(segmented_string_pool* pool = nullptr);
+
+ bool IsOpen() const {
+ return (FILE*)File != nullptr;
+ }
+ bool IsEof() const {
+ return feof(File);
+ }
+ void Close() {
+ File.close();
+ }
+ void Rewind() {
+ File.seek(0, SEEK_SET);
+ }
+ void Seek(i64 offset, int mode = SEEK_SET) {
+ File.seek(offset, mode);
+ }
+ i64 Tell() const {
+ return ftell(File);
+ }
+ char*& operator[](size_t ind) {
+ //if (ind >= NF)
+ // throw yexception("Can't return reference to unexisting field %" PRISZT, ind);
+ return Fields[ind];
+ }
+ const char* operator[](size_t ind) const {
+ if (ind >= NF)
+ return nullptr;
+ return Fields[ind];
+ }
+ operator int() const { // note: empty input line makes 0 fields
+ return (int)NF;
+ }
+ const char* Name() const {
+ return File.name().data();
+ }
+ size_t Line() const {
+ return NR;
+ }
+ const TVector<char*>& GetFields() const {
+ return Fields;
+ }
+};
+
+struct prnstr {
+ char* buf;
+ int pos;
+ int asize;
+ prnstr()
+ : pos(0)
+ {
+ asize = 32;
+ buf = new char[asize];
+ }
+ explicit prnstr(int asz)
+ : pos(0)
+ {
+ asize = asz;
+ buf = new char[asize];
+ }
+ int f(const char* c, ...);
+ int s(const char* c1, const char* c2);
+ int s(const char* c1, const char* c2, const char* c3);
+ int s(const char* c, size_t len);
+ //int s(const char *c);
+ int s(const char* c) {
+ return c ? s(c, strlen(c)) : 0;
+ }
+ int s(const TString& c);
+ int s_htmesc(const char* c, bool enc_utf = false);
+ int s_htmesc_w(const char* c);
+ int c(char c);
+ int cu(wchar32 c); //for utf-8
+ void restart() {
+ *buf = 0;
+ pos = 0;
+ }
+ const char* operator~() const {
+ return buf;
+ }
+ int operator+() const {
+ return pos;
+ }
+ ~prnstr() {
+ delete[] buf;
+ }
+ void clear();
+ void swap(prnstr& w);
+};
+
+// functions that terminate program upon failure
+FILE* read_or_die(const char* fname);
+FILE* write_or_die(const char* fname);
+FILE* fopen_or_die(const char* fname, const char* mode);
+
+// functions that throw upon failure
+FILE* fopen_chk(const char* fname, const char* mode);
+void fclose_chk(FILE* f, const char* fname_dbg);
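A usage sketch, not part of the commit; the file name and field values are illustrative only. It shows the in-place sf() splitting and line-by-line reading with TSFReader, exactly as declared above.

    // In-place splitting of a tab-delimited line (the buffer is modified).
    char line[] = "foo\tbar\tbaz\n";
    char* fields[32 + 1];                   // fb_sz + 1 slots; fields[n] is set to nullptr
    int n = sf(fields, line);               // n == 3: "foo", "bar", "baz"

    // Line-by-line reading of a TSV file; throws if a line does not have exactly 3 fields.
    TSFReader reader("data.tsv", '\t', 3);
    while (reader.NextLine()) {
        const char* key = reader[0];        // field access by index
        // ...
    }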
diff --git a/library/cpp/deprecated/fgood/fgood.cpp b/library/cpp/deprecated/fgood/fgood.cpp
new file mode 100644
index 0000000000..5d4725bfae
--- /dev/null
+++ b/library/cpp/deprecated/fgood/fgood.cpp
@@ -0,0 +1,70 @@
+#include "fgood.h"
+
+#include <util/generic/cast.h>
+#include <util/string/cast.h>
+#include <util/system/fstat.h>
+
+#ifdef _win32_
+#include <io.h>
+#endif
+
+i64 TFILEPtr::length() const {
+#ifdef _win32_
+ FHANDLE fd = (FHANDLE)_get_osfhandle(fileno(m_file));
+#else
+ FHANDLE fd = fileno(m_file);
+#endif
+ i64 rv = GetFileLength(fd);
+ if (rv < 0)
+ ythrow yexception() << "TFILEPtr::length() " << Name.data() << ": " << LastSystemErrorText();
+ return rv;
+}
+
+FILE* OpenFILEOrFail(const TString& name, const char* mode) {
+ FILE* res = ::fopen(name.data(), mode);
+ if (!res) {
+ ythrow yexception() << "can't open \'" << name << "\' with mode \'" << mode << "\': " << LastSystemErrorText();
+ }
+ return res;
+}
+
+void TFILECloser::Destroy(FILE* file) {
+ ::fclose(file);
+}
+
+#ifdef _freebsd_ // fgetln
+#define getline getline_alt_4test
+#endif // _freebsd_
+
+bool getline(TFILEPtr& f, TString& s) {
+ char buf[4096];
+ char* buf_ptr;
+ if (s.capacity() > sizeof(buf)) {
+ s.resize(s.capacity());
+ if ((buf_ptr = fgets(s.begin(), IntegerCast<int>(s.capacity()), f)) == nullptr)
+ return false;
+ } else {
+ if ((buf_ptr = fgets(buf, sizeof(buf), f)) == nullptr)
+ return false;
+ }
+ size_t buf_len = strlen(buf_ptr);
+ bool line_complete = buf_len && buf_ptr[buf_len - 1] == '\n';
+ if (line_complete)
+ buf_len--;
+ if (buf_ptr == s.begin())
+ s.resize(buf_len);
+ else
+ s.AssignNoAlias(buf, buf_len);
+ if (line_complete)
+ return true;
+ while (fgets(buf, sizeof(buf), f)) {
+ size_t buf_len2 = strlen(buf);
+ if (buf_len2 && buf[buf_len2 - 1] == '\n') {
+ buf[buf_len2 - 1] = 0;
+ s.append(buf, buf_len2 - 1);
+ return true;
+ }
+ s.append(buf, buf_len2);
+ }
+ return true;
+}
diff --git a/library/cpp/deprecated/fgood/fgood.h b/library/cpp/deprecated/fgood/fgood.h
new file mode 100644
index 0000000000..0aaf910c0f
--- /dev/null
+++ b/library/cpp/deprecated/fgood/fgood.h
@@ -0,0 +1,328 @@
+#pragma once
+
+#include <util/system/yassert.h>
+#include <util/system/defaults.h>
+#include <util/generic/string.h>
+#include <util/generic/yexception.h>
+#include <util/generic/ptr.h>
+
+#include "fput.h"
+
+#include <cstdio>
+
+#include <fcntl.h>
+
+#ifdef _unix_
+extern "C" int __ungetc(int, FILE*);
+#endif
+
+#if (!defined(__FreeBSD__) && !defined(__linux__) && !defined(_darwin_) && !defined(_cygwin_)) || defined(_bionic_)
+#define feof_unlocked(_stream) feof(_stream)
+#define ferror_unlocked(_stream) ferror(_stream)
+#endif
+
+#ifndef _unix_
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#define getc_unlocked(_stream) (--(_stream)->_cnt >= 0 ? 0xff & *(_stream)->_ptr++ : _filbuf(_stream))
+#define putc_unlocked(_c, _stream) (--(_stream)->_cnt >= 0 ? 0xff & (*(_stream)->_ptr++ = (char)(_c)) : _flsbuf((_c), (_stream)))
+#else
+#define getc_unlocked(_stream) getc(_stream)
+#define putc_unlocked(_c, _stream) putc(_c, _stream)
+#endif
+#endif
+
+inline bool fgood(FILE* f) {
+ return !feof_unlocked(f) && !ferror_unlocked(f);
+}
+
+#ifdef _win32_
+// These functions will work only with static MSVC runtime linkage. For dynamic linkage,
+// fseeki64.c and ftelli64.c from CRT sources should be included in the project
+extern "C" int __cdecl _fseeki64(FILE*, __int64, int);
+extern "C" __int64 __cdecl _ftelli64(FILE*);
+
+inline i64 ftello(FILE* stream) {
+ return _ftelli64(stream);
+}
+
+inline int fseeko(FILE* stream, i64 offset, int origin) {
+ return _fseeki64(stream, offset, origin);
+}
+#endif
+
+class TFILEPtr {
+private:
+ enum { SHOULD_CLOSE = 1,
+ IS_PIPE = 2 };
+ FILE* m_file;
+ int m_Flags;
+ TString Name;
+
+public:
+ TFILEPtr() noexcept {
+ m_file = nullptr;
+ m_Flags = 0;
+ }
+ TFILEPtr(const TString& name, const char* mode) {
+ m_file = nullptr;
+ m_Flags = 0;
+ open(name, mode);
+ }
+ TFILEPtr(const TFILEPtr& src) noexcept {
+ m_file = src.m_file;
+ m_Flags = 0;
+ }
+ TFILEPtr& operator=(const TFILEPtr& src) {
+ if (src.m_file != m_file) {
+ close();
+ m_file = src.m_file;
+ m_Flags = 0;
+ }
+ return *this;
+ }
+ explicit TFILEPtr(FILE* f) noexcept { // take ownership
+ m_file = f;
+ m_Flags = SHOULD_CLOSE;
+ }
+ TFILEPtr& operator=(FILE* f) { // take ownership
+ if (f != m_file) {
+ close();
+ m_file = f;
+ m_Flags = SHOULD_CLOSE;
+ }
+ return *this;
+ }
+ const TString& name() const {
+ return Name;
+ }
+ operator FILE*() const noexcept {
+ return m_file;
+ }
+ FILE* operator->() const noexcept {
+ return m_file;
+ }
+ bool operator!() const noexcept {
+ return m_file == nullptr;
+ }
+ bool operator!=(FILE* f) const noexcept {
+ return m_file != f;
+ }
+ bool operator==(FILE* f) const noexcept {
+ return m_file == f;
+ }
+ ~TFILEPtr() {
+ close();
+ }
+ void Y_PRINTF_FORMAT(2, 3) check(const char* message, ...) const {
+ if (Y_UNLIKELY(!fgood(m_file))) {
+ va_list args;
+ va_start(args, message);
+ char buf[512];
+ vsnprintf(buf, 512, message, args);
+ // XXX: errno is undefined here
+ ythrow yexception() << buf << ": " << LastSystemErrorText() << ", " << Name.data() << " at offset " << (i64)ftell();
+ }
+ }
+ TFILEPtr& assign(FILE* f, const char* name = nullptr) { // take ownership and have a name
+ *this = f;
+ if (name)
+ Name = name;
+ return *this;
+ }
+ void open(const TString& name, const char* mode) {
+ Y_ASSERT(!name.empty());
+ Y_ASSERT(m_file == nullptr);
+ m_file = ::fopen(name.data(), mode);
+ if (!m_file)
+ ythrow yexception() << "can't open \'" << name << "\' with mode \'" << mode << "\': " << LastSystemErrorText();
+ m_Flags = SHOULD_CLOSE;
+ Name = name;
+ }
+ void popen(const TString& command, const char* mode) {
+ Y_ASSERT(!command.empty());
+ Y_ASSERT(m_file == nullptr);
+ m_file = ::popen(command.data(), mode);
+ if (!m_file)
+ ythrow yexception() << "can't execute \'" << command << "\' with mode \'" << mode << "\': " << LastSystemErrorText();
+ m_Flags = IS_PIPE | SHOULD_CLOSE;
+ Name = command;
+ }
+ void close() {
+ if (m_file != nullptr && (m_Flags & SHOULD_CLOSE)) {
+ if ((m_Flags & IS_PIPE) ? ::pclose(m_file) : ::fclose(m_file)) {
+ m_file = nullptr;
+ m_Flags = 0;
+ if (!UncaughtException())
+ ythrow yexception() << "can't close file " << Name.data() << ": " << LastSystemErrorText();
+ }
+ }
+ m_file = nullptr;
+ m_Flags = 0;
+ Name.clear();
+ }
+ size_t write(const void* buffer, size_t size, size_t count) const {
+ Y_ASSERT(m_file != nullptr);
+ size_t r = ::fwrite(buffer, size, count, m_file);
+ check("can't write %lu bytes", (unsigned long)size * count);
+ return r;
+ }
+ size_t read(void* buffer, size_t size, size_t count) const {
+ Y_ASSERT(m_file != nullptr);
+ size_t r = ::fread(buffer, size, count, m_file);
+ if (ferror_unlocked(m_file))
+ ythrow yexception() << "can't read " << (unsigned long)size * count << " bytes: " << LastSystemErrorText() << ", " << Name.data() << " at offset " << (i64)ftell();
+ return r;
+ }
+ char* fgets(char* buffer, int size) const {
+ Y_ASSERT(m_file != nullptr);
+ char* r = ::fgets(buffer, size, m_file);
+ if (ferror_unlocked(m_file))
+ ythrow yexception() << "can't read string of maximum size " << size << ": " << LastSystemErrorText() << ", " << Name.data() << " at offset " << (i64)ftell();
+ return r;
+ }
+ void Y_PRINTF_FORMAT(2, 3) fprintf(const char* format, ...) {
+ Y_ASSERT(m_file != nullptr);
+ va_list args;
+ va_start(args, format);
+ vfprintf(m_file, format, args);
+ check("can't write");
+ }
+ void seek(i64 offset, int origin) const {
+ Y_ASSERT(m_file != nullptr);
+#if defined(_unix_) || defined(_win32_)
+ if (fseeko(m_file, offset, origin) != 0)
+#else
+ Y_ASSERT(offset == (i64)(i32)offset);
+ if (::fseek(m_file, (long)offset, origin) != 0)
+#endif
+ ythrow yexception() << "can't seek " << Name.data() << " by " << offset << ": " << LastSystemErrorText();
+ }
+    i64 length() const; // uses various system headers -> implemented in fgood.cpp
+
+ void setDirect() const {
+#if !defined(_win_) && !defined(_darwin_)
+ if (!m_file)
+ ythrow yexception() << "file not open";
+ if (fcntl(fileno(m_file), F_SETFL, O_DIRECT) == -1)
+ ythrow yexception() << "Cannot set O_DIRECT flag";
+#endif
+ }
+
+ // for convenience
+
+ i64 ftell() const noexcept {
+#if defined(_unix_) || defined(_win32_)
+ return ftello(m_file);
+#else
+ return ftell(m_file);
+#endif
+ }
+ bool eof() const noexcept {
+ Y_ASSERT(m_file != nullptr);
+ return feof_unlocked(m_file) != 0;
+ }
+ int fputc(int c) {
+ Y_ASSERT(m_file != nullptr);
+ return putc_unlocked(c, m_file);
+ }
+ size_t fputs(const char* buffer) const {
+ return write(buffer, strlen(buffer), 1);
+ }
+ int fgetc() {
+ Y_ASSERT(m_file != nullptr);
+ return getc_unlocked(m_file);
+ }
+ int ungetc(int c) {
+ Y_ASSERT(m_file != nullptr);
+ return ::ungetc(c, m_file);
+ }
+ template <class T>
+ size_t fput(const T& a) {
+ Y_ASSERT(m_file != nullptr);
+ return ::fput(m_file, a);
+ }
+ template <class T>
+ size_t fget(T& a) {
+ Y_ASSERT(m_file != nullptr);
+ return ::fget(m_file, a);
+ }
+ size_t fsput(const char* s, size_t l) {
+ Y_ASSERT(m_file != nullptr);
+ return ::fsput(m_file, s, l);
+ }
+ size_t fsget(char* s, size_t l) {
+ Y_ASSERT(m_file != nullptr);
+ return ::fsget(m_file, s, l);
+ }
+
+ void fflush() {
+ ::fflush(m_file);
+ }
+
+ /* This block contains some TFile/TStream - compatible names */
+ size_t Read(void* bufferIn, size_t numBytes) {
+ size_t r = fsget((char*)bufferIn, numBytes);
+ if (Y_UNLIKELY(ferror_unlocked(m_file)))
+ ythrow yexception() << "can't read " << numBytes << " bytes: " << LastSystemErrorText() << ", " << Name << " at offset " << (i64)ftell();
+ return r;
+ }
+ void Write(const void* buffer, size_t numBytes) {
+ write(buffer, 1, numBytes);
+ }
+ i64 Seek(i64 offset, int origin /*SeekDir*/) {
+ seek(offset, origin);
+ return ftell();
+ }
+ i64 GetPosition() const noexcept {
+ return ftell();
+ }
+ i64 GetLength() const noexcept {
+ return length();
+ }
+ bool ReadLine(TString& st);
+
+ /* Similar to TAutoPtr::Release - return pointer and forget about it. */
+ FILE* Release() noexcept {
+ FILE* result = m_file;
+ m_file = nullptr;
+ m_Flags = 0;
+ Name.clear();
+ return result;
+ }
+};
+
+inline void fclose(TFILEPtr& F) {
+ F.close();
+}
+
+inline void fseek(const TFILEPtr& F, i64 offset, int whence) {
+ F.seek(offset, whence);
+}
+
+#ifdef _freebsd_ // fgetln
+inline bool getline(TFILEPtr& f, TString& s) {
+ size_t len;
+ char* buf = fgetln(f, &len);
+ if (!buf)
+ return false;
+ if (len && buf[len - 1] == '\n')
+ len--;
+ s.AssignNoAlias(buf, len);
+ return true;
+}
+#else
+bool getline(TFILEPtr& f, TString& s);
+#endif //_freebsd_
+
+inline bool TFILEPtr::ReadLine(TString& st) {
+ return getline(*this, st);
+}
+
+FILE* OpenFILEOrFail(const TString& name, const char* mode);
+
+// Should be used with THolder
+struct TFILECloser {
+ static void Destroy(FILE* file);
+};
+
+using TFILEHolder = THolder<FILE, TFILECloser>;
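A usage sketch, not part of the commit; file names and the printed values are illustrative. TFILEPtr owns the FILE* and turns I/O errors into exceptions.

    TFILEPtr in("input.txt", "r");          // throws if the file cannot be opened
    TString line;
    while (getline(in, line)) {             // trailing '\n' is stripped
        // process line
    }

    TFILEPtr out("report.txt", "w");
    out.fprintf("lines: %d\n", 42);         // checked wrapper over vfprintf
    // both files are closed by the destructors; close() reports errors via exceptions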
diff --git a/library/cpp/deprecated/fgood/fput.h b/library/cpp/deprecated/fgood/fput.h
new file mode 100644
index 0000000000..690b06332d
--- /dev/null
+++ b/library/cpp/deprecated/fgood/fput.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <util/system/defaults.h>
+#include <util/system/valgrind.h>
+
+#include <cstdio>
+
+#ifdef __FreeBSD__
+#include <cstring>
+
+template <class T>
+Y_FORCE_INLINE size_t fput(FILE* F, const T& a) {
+ if (Y_LIKELY(F->_w >= int(sizeof(a)))) {
+ memcpy(F->_p, &a, sizeof(a));
+ F->_p += sizeof(a);
+ F->_w -= sizeof(a);
+ return 1;
+ } else {
+ return fwrite(&a, sizeof(a), 1, F);
+ }
+}
+
+template <class T>
+Y_FORCE_INLINE size_t fget(FILE* F, T& a) {
+ if (Y_LIKELY(F->_r >= int(sizeof(a)))) {
+ memcpy(&a, F->_p, sizeof(a));
+ F->_p += sizeof(a);
+ F->_r -= sizeof(a);
+ return 1;
+ } else {
+ return fread(&a, sizeof(a), 1, F);
+ }
+}
+
+inline size_t fsput(FILE* F, const char* s, size_t l) {
+ VALGRIND_CHECK_READABLE(s, l);
+
+ if ((size_t)F->_w >= l) {
+ memcpy(F->_p, s, l);
+ F->_p += l;
+ F->_w -= l;
+ return l;
+ } else {
+ return fwrite(s, 1, l, F);
+ }
+}
+
+inline size_t fsget(FILE* F, char* s, size_t l) {
+ if ((size_t)F->_r >= l) {
+ memcpy(s, F->_p, l);
+ F->_p += l;
+ F->_r -= l;
+ return l;
+ } else {
+ return fread(s, 1, l, F);
+ }
+}
+#else
+template <class T>
+Y_FORCE_INLINE size_t fput(FILE* F, const T& a) {
+ return fwrite(&a, sizeof(a), 1, F);
+}
+
+template <class T>
+Y_FORCE_INLINE size_t fget(FILE* F, T& a) {
+ return fread(&a, sizeof(a), 1, F);
+}
+
+inline size_t fsput(FILE* F, const char* s, size_t l) {
+#ifdef WITH_VALGRIND
+ VALGRIND_CHECK_READABLE(s, l);
+#endif
+ return fwrite(s, 1, l, F);
+}
+
+inline size_t fsget(FILE* F, char* s, size_t l) {
+ return fread(s, 1, l, F);
+}
+#endif
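A round-trip sketch, not part of the commit; TRecord and the file name are hypothetical, and error handling is omitted for brevity.

    struct TRecord {                        // any trivially copyable type works
        ui32 Id;
        double Score;
    };

    FILE* f = fopen("records.bin", "w+b");  // a real caller should check for nullptr
    TRecord w{42, 0.5};
    fput(f, w);                             // returns 1 on success (item count, as fwrite does)
    fflush(f);
    rewind(f);
    TRecord r{};
    if (fget(f, r) == 1) {
        // r.Id == 42, r.Score == 0.5
    }
    fclose(f);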
diff --git a/library/cpp/deprecated/fgood/ya.make b/library/cpp/deprecated/fgood/ya.make
new file mode 100644
index 0000000000..2394f9ad7a
--- /dev/null
+++ b/library/cpp/deprecated/fgood/ya.make
@@ -0,0 +1,8 @@
+LIBRARY()
+
+SRCS(
+ ffb.cpp
+ fgood.cpp
+)
+
+END()
diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.darwin-x86_64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..a00407491d
--- /dev/null
+++ b/library/cpp/deprecated/mapped_file/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-mapped_file)
+target_link_libraries(cpp-deprecated-mapped_file PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-mapped_file PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp
+)
diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.linux-aarch64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..2bb5db017b
--- /dev/null
+++ b/library/cpp/deprecated/mapped_file/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-mapped_file)
+target_link_libraries(cpp-deprecated-mapped_file PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-mapped_file PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp
+)
diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.linux-x86_64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..2bb5db017b
--- /dev/null
+++ b/library/cpp/deprecated/mapped_file/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-mapped_file)
+target_link_libraries(cpp-deprecated-mapped_file PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-mapped_file PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp
+)
diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.txt b/library/cpp/deprecated/mapped_file/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/deprecated/mapped_file/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/deprecated/mapped_file/CMakeLists.windows-x86_64.txt b/library/cpp/deprecated/mapped_file/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..a00407491d
--- /dev/null
+++ b/library/cpp/deprecated/mapped_file/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-deprecated-mapped_file)
+target_link_libraries(cpp-deprecated-mapped_file PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-deprecated-mapped_file PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/deprecated/mapped_file/mapped_file.cpp
+)
diff --git a/library/cpp/deprecated/mapped_file/mapped_file.cpp b/library/cpp/deprecated/mapped_file/mapped_file.cpp
new file mode 100644
index 0000000000..b0e4511299
--- /dev/null
+++ b/library/cpp/deprecated/mapped_file/mapped_file.cpp
@@ -0,0 +1,64 @@
+#include "mapped_file.h"
+
+#include <util/generic/yexception.h>
+#include <util/system/defaults.h>
+#include <util/system/hi_lo.h>
+#include <util/system/filemap.h>
+
+TMappedFile::TMappedFile(TFileMap* map, const char* dbgName) {
+ Map_ = map;
+ i64 len = Map_->Length();
+ if (Hi32(len) != 0 && sizeof(size_t) <= sizeof(ui32))
+ ythrow yexception() << "File '" << dbgName << "' mapping error: " << len << " too large";
+
+ Map_->Map(0, static_cast<size_t>(len));
+}
+
+TMappedFile::TMappedFile(const TFile& file, TFileMap::EOpenMode om, const char* dbgName)
+ : Map_(nullptr)
+{
+ init(file, om, dbgName);
+}
+
+void TMappedFile::precharge(size_t off, size_t size) const {
+ if (!Map_)
+ return;
+
+ Map_->Precharge(off, size);
+}
+
+void TMappedFile::init(const TString& name) {
+ THolder<TFileMap> map(new TFileMap(name));
+ TMappedFile newFile(map.Get(), name.data());
+ Y_UNUSED(map.Release());
+ newFile.swap(*this);
+ newFile.term();
+}
+
+void TMappedFile::init(const TString& name, size_t length, TFileMap::EOpenMode om) {
+ THolder<TFileMap> map(new TFileMap(name, length, om));
+ TMappedFile newFile(map.Get(), name.data());
+ Y_UNUSED(map.Release());
+ newFile.swap(*this);
+ newFile.term();
+}
+
+void TMappedFile::init(const TFile& file, TFileMap::EOpenMode om, const char* dbgName) {
+ THolder<TFileMap> map(new TFileMap(file, om));
+ TMappedFile newFile(map.Get(), dbgName);
+ Y_UNUSED(map.Release());
+ newFile.swap(*this);
+ newFile.term();
+}
+
+void TMappedFile::init(const TString& name, TFileMap::EOpenMode om) {
+ THolder<TFileMap> map(new TFileMap(name, om));
+ TMappedFile newFile(map.Get(), name.data());
+ Y_UNUSED(map.Release());
+ newFile.swap(*this);
+ newFile.term();
+}
+
+void TMappedFile::flush() {
+ Map_->Flush();
+}
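A usage sketch, not part of the commit, using only the TMappedFile members exercised here and in datafile.cpp; the file name is illustrative.

    TMappedFile map;
    map.init("data.bin");                                   // map the whole file read-only
    const char* bytes = static_cast<const char*>(map.getData());
    size_t size = map.getSize();
    map.precharge();                                        // touch the pages up front
    // ... read bytes[0 .. size) ...
    map.term();                                             // or rely on the destructor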
diff --git a/library/cpp/deprecated/mapped_file/ya.make b/library/cpp/deprecated/mapped_file/ya.make
new file mode 100644
index 0000000000..309341f1da
--- /dev/null
+++ b/library/cpp/deprecated/mapped_file/ya.make
@@ -0,0 +1,7 @@
+LIBRARY()
+
+SRCS(
+ mapped_file.cpp
+)
+
+END()
diff --git a/library/cpp/geo/CMakeLists.darwin-x86_64.txt b/library/cpp/geo/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..87e48b4a71
--- /dev/null
+++ b/library/cpp/geo/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,24 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-geo)
+target_link_libraries(library-cpp-geo PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-geo PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp
+)
diff --git a/library/cpp/geo/CMakeLists.linux-aarch64.txt b/library/cpp/geo/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..cdad35989a
--- /dev/null
+++ b/library/cpp/geo/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,25 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-geo)
+target_link_libraries(library-cpp-geo PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-geo PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp
+)
diff --git a/library/cpp/geo/CMakeLists.linux-x86_64.txt b/library/cpp/geo/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..cdad35989a
--- /dev/null
+++ b/library/cpp/geo/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,25 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-geo)
+target_link_libraries(library-cpp-geo PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-geo PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp
+)
diff --git a/library/cpp/geo/CMakeLists.txt b/library/cpp/geo/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/geo/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/geo/CMakeLists.windows-x86_64.txt b/library/cpp/geo/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..87e48b4a71
--- /dev/null
+++ b/library/cpp/geo/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,24 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-geo)
+target_link_libraries(library-cpp-geo PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(library-cpp-geo PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/geo.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/load_save_helper.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/size.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/util.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/geo/window.cpp
+)
diff --git a/library/cpp/geo/bbox.cpp b/library/cpp/geo/bbox.cpp
new file mode 100644
index 0000000000..aa4258ac22
--- /dev/null
+++ b/library/cpp/geo/bbox.cpp
@@ -0,0 +1 @@
+#include "bbox.h"
diff --git a/library/cpp/geo/bbox.h b/library/cpp/geo/bbox.h
new file mode 100644
index 0000000000..7ec7e6f7d6
--- /dev/null
+++ b/library/cpp/geo/bbox.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <util/generic/utility.h>
+
+#include "point.h"
+
+namespace NGeo {
+
+ class TGeoBoundingBox {
+ public:
+        TGeoBoundingBox() = default;
+
+ TGeoBoundingBox(const TGeoPoint& p1, const TGeoPoint& p2) {
+ MinX_ = Min(p1.Lon(), p2.Lon());
+ MaxX_ = Max(p1.Lon(), p2.Lon());
+ MinY_ = Min(p1.Lat(), p2.Lat());
+ MaxY_ = Max(p1.Lat(), p2.Lat());
+ }
+
+ const double& GetMinX() const {
+ return MinX_;
+ }
+
+ const double& GetMaxX() const {
+ return MaxX_;
+ }
+
+ const double& GetMinY() const {
+ return MinY_;
+ }
+
+ const double& GetMaxY() const {
+ return MaxY_;
+ }
+
+ double Width() const {
+ return MaxX_ - MinX_;
+ }
+
+ double Height() const {
+ return MaxY_ - MinY_;
+ }
+
+ private:
+ double MinX_{std::numeric_limits<double>::quiet_NaN()};
+ double MaxX_{std::numeric_limits<double>::quiet_NaN()};
+ double MinY_{std::numeric_limits<double>::quiet_NaN()};
+ double MaxY_{std::numeric_limits<double>::quiet_NaN()};
+ };
+
+ inline bool operator==(const TGeoBoundingBox& a, const TGeoBoundingBox& b) {
+ return a.GetMinX() == b.GetMinX() &&
+ a.GetMinY() == b.GetMinY() &&
+ a.GetMaxX() == b.GetMaxX() &&
+ a.GetMaxY() == b.GetMaxY();
+ }
+} // namespace NGeo
diff --git a/library/cpp/geo/geo.cpp b/library/cpp/geo/geo.cpp
new file mode 100644
index 0000000000..37adc5c62c
--- /dev/null
+++ b/library/cpp/geo/geo.cpp
@@ -0,0 +1 @@
+#include "geo.h"
diff --git a/library/cpp/geo/geo.h b/library/cpp/geo/geo.h
new file mode 100644
index 0000000000..1aebacab5c
--- /dev/null
+++ b/library/cpp/geo/geo.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "bbox.h"
+#include "point.h"
+#include "polygon.h"
+#include "size.h"
+#include "util.h"
+#include "window.h"
diff --git a/library/cpp/geo/load_save_helper.cpp b/library/cpp/geo/load_save_helper.cpp
new file mode 100644
index 0000000000..13fa7ac6df
--- /dev/null
+++ b/library/cpp/geo/load_save_helper.cpp
@@ -0,0 +1,49 @@
+#include "load_save_helper.h"
+#include <util/stream/input.h>
+
+void TSerializer<NGeo::TGeoPoint>::Save(IOutputStream* out, const NGeo::TGeoPoint& point) {
+ double lon = static_cast<double>(point.Lon());
+ double lat = static_cast<double>(point.Lat());
+ ::Save(out, lon);
+ ::Save(out, lat);
+}
+
+void TSerializer<NGeo::TGeoPoint>::Load(IInputStream* in, NGeo::TGeoPoint& point) {
+ double lon = std::numeric_limits<double>::quiet_NaN();
+ double lat = std::numeric_limits<double>::quiet_NaN();
+ ::Load(in, lon);
+ ::Load(in, lat);
+ point = {lon, lat};
+}
+
+void TSerializer<NGeo::TGeoWindow>::Save(IOutputStream* out, const NGeo::TGeoWindow& window) {
+ const auto& center = window.GetCenter();
+ const auto& size = window.GetSize();
+ ::Save(out, center);
+ ::Save(out, size);
+}
+
+void TSerializer<NGeo::TGeoWindow>::Load(IInputStream* in, NGeo::TGeoWindow& window) {
+ NGeo::TSize size{};
+ NGeo::TGeoPoint center{};
+
+ ::Load(in, center);
+ ::Load(in, size);
+
+ window = {center, size};
+}
+
+void TSerializer<NGeo::TSize>::Save(IOutputStream* out, const NGeo::TSize& size) {
+ double width = static_cast<double>(size.GetWidth());
+ double height = static_cast<double>(size.GetHeight());
+ ::Save(out, width);
+ ::Save(out, height);
+}
+
+void TSerializer<NGeo::TSize>::Load(IInputStream* in, NGeo::TSize& size) {
+ double width = std::numeric_limits<double>::quiet_NaN();
+ double height = std::numeric_limits<double>::quiet_NaN();
+ ::Load(in, width);
+ ::Load(in, height);
+ size = {width, height};
+}
diff --git a/library/cpp/geo/load_save_helper.h b/library/cpp/geo/load_save_helper.h
new file mode 100644
index 0000000000..4a5fceea18
--- /dev/null
+++ b/library/cpp/geo/load_save_helper.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <library/cpp/geo/window.h>
+#include <util/stream/input.h>
+#include <util/ysaveload.h>
+
+template <>
+struct TSerializer<NGeo::TGeoPoint> {
+ static void Save(IOutputStream*, const NGeo::TGeoPoint&);
+ static void Load(IInputStream*, NGeo::TGeoPoint&);
+};
+
+template <>
+struct TSerializer<NGeo::TGeoWindow> {
+ static void Save(IOutputStream*, const NGeo::TGeoWindow&);
+ static void Load(IInputStream*, NGeo::TGeoWindow&);
+};
+
+template <>
+struct TSerializer<NGeo::TSize> {
+ static void Save(IOutputStream*, const NGeo::TSize&);
+ static void Load(IInputStream*, NGeo::TSize&);
+};
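A usage sketch, not part of the commit; it assumes TStringStream from util/stream/str.h as the in-memory stream, and the coordinates are illustrative.

    #include <util/stream/str.h>

    NGeo::TGeoPoint point{37.6, 55.7};
    TStringStream stream;
    ::Save(&stream, point);                 // writes lon, then lat, via the specialization above

    NGeo::TGeoPoint restored;
    ::Load(&stream, restored);              // restored now holds the same coordinates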
diff --git a/library/cpp/geo/point.cpp b/library/cpp/geo/point.cpp
new file mode 100644
index 0000000000..1d227c967f
--- /dev/null
+++ b/library/cpp/geo/point.cpp
@@ -0,0 +1,146 @@
+#include "point.h"
+#include "util.h"
+
+#include <util/generic/ylimits.h>
+#include <util/generic/ymath.h>
+
+#include <cstdlib>
+#include <utility>
+
+namespace NGeo {
+ namespace {
+ bool IsNonDegeneratePoint(double lon, double lat) {
+ return (MIN_LONGITUDE - WORLD_WIDTH < lon && lon < MAX_LONGITUDE + WORLD_WIDTH) &&
+ (MIN_LATITUDE < lat && lat < MAX_LATITUDE);
+ }
+ } // namespace
+
+ float TGeoPoint::Distance(const TGeoPoint& p) const noexcept {
+ auto dp = p - (*this);
+ return sqrtf(Sqr(GetWidthAtEquator(dp.GetWidth(), (Lat_ + p.Lat()) * 0.5)) + Sqr(dp.GetHeight()));
+ }
+
+ bool TGeoPoint::IsPole() const noexcept {
+ return Lat_ <= MIN_LATITUDE || MAX_LATITUDE <= Lat_;
+ }
+
+ bool TGeoPoint::IsVisibleOnMap() const noexcept {
+ return -VISIBLE_LATITUDE_BOUND <= Lat_ && Lat_ <= VISIBLE_LATITUDE_BOUND;
+ }
+
+ TGeoPoint TGeoPoint::Parse(TStringBuf s, TStringBuf delimiter) {
+ const auto& [lon, lat] = PairFromString(s, delimiter);
+ Y_ENSURE_EX(IsNonDegeneratePoint(lon, lat), TBadCastException() << "Invalid point: (" << lon << ", " << lat << ")");
+ return {lon, lat};
+ }
+
+ TMaybe<TGeoPoint> TGeoPoint::TryParse(TStringBuf s, TStringBuf delimiter) {
+ std::pair<double, double> lonLat;
+ if (!TryPairFromString(lonLat, s, delimiter)) {
+ return {};
+ }
+ if (!IsNonDegeneratePoint(lonLat.first, lonLat.second)) {
+ return {};
+ }
+ return TGeoPoint(lonLat.first, lonLat.second);
+ }
+
+ TSize operator-(const TGeoPoint& p1, const TGeoPoint& p2) {
+ return {p1.Lon() - p2.Lon(), p1.Lat() - p2.Lat()};
+ }
+
+ /*
+ Conversion code was imported from http://wiki.yandex-team.ru/YandexMobile/maps/Algorithm/mapengine/coordtransforms
+ */
+ namespace WGS84 {
+ /* Isometric to geodetic latitude parameters, default to WGS 84 */
+ const double ab = 0.00335655146887969400;
+ const double bb = 0.00000657187271079536;
+ const double cb = 0.00000001764564338702;
+ const double db = 0.00000000005328478445;
+
+ const double _a = R;
+ const double _f = 1.0 / 298.257223563;
+ const double _b = _a - _f * _a;
+ const double _e = sqrt(1 - pow(_b / _a, 2));
+ const double _e2 = _e * _e;
+ const double _g = sqrt(1.0 - _e2);
+ const double _gR2 = _g * R * 2.0;
+ } // namespace WGS84
+
+ TGeoPoint MercatorToLL(TMercatorPoint pt) {
+ using namespace WGS84;
+
+ // Y_ENSURE(pt.IsDefined(), "Point is not defined");
+
+ /* Isometric latitude */
+ const double xphi = PI / 2.0 - 2.0 * atan(exp(-pt.Y_ / R));
+
+ double latitude = xphi + ab * sin(2.0 * xphi) + bb * sin(4.0 * xphi) + cb * sin(6.0 * xphi) + db * sin(8.0 * xphi);
+ double longitude = pt.X_ / R;
+
+ return TGeoPoint{Rad2deg(longitude), Rad2deg(latitude)};
+ }
+
+ double GetMercatorY(const TGeoPoint& ll) {
+ if (Y_UNLIKELY(ll.Lat() == 0.)) {
+ // shortcut for common case, avoiding floating point errors
+ return 0.;
+ }
+ if (Y_UNLIKELY(ll.Lat() == MIN_LATITUDE)) {
+ return -std::numeric_limits<double>::infinity();
+ }
+ if (Y_UNLIKELY(ll.Lat() == MAX_LATITUDE)) {
+ return +std::numeric_limits<double>::infinity();
+ }
+ double lat = Deg2rad(ll.Lat());
+ double esinLat = WGS84::_e * sin(lat);
+
+ double tan_temp = tan(PI / 4.e0 + lat / 2.e0);
+ double pow_temp = pow(tan(PI / 4.e0 + asin(esinLat) / 2), WGS84::_e);
+ double U = tan_temp / pow_temp;
+ return WGS84::R * log(U);
+ }
+
+ TMercatorPoint LLToMercator(TGeoPoint ll) {
+ // Y_ENSURE(ll.IsValid(), "Point is not defined");
+
+ // Y_ENSURE(-90. <= ll.Lat() && ll.Lat() <= +90., "Latitude is out of range [-90, 90]");
+
+ double lon = Deg2rad(ll.Lon());
+ double x = WGS84::R * lon;
+ double y = GetMercatorY(ll);
+
+ return TMercatorPoint{x, y};
+ }
+
+ double GeodeticDistance(TGeoPoint p1, TGeoPoint p2) {
+ using namespace WGS84;
+
+ constexpr double deg2HalfRad = PI / 360.0;
+
+ const double lon1Half = p1.Lon() * deg2HalfRad;
+ const double lon2Half = p2.Lon() * deg2HalfRad;
+
+ const double lat1Half = p1.Lat() * deg2HalfRad;
+ const double lat2Half = p2.Lat() * deg2HalfRad;
+
+ const double diffLatHalf = fabs(lat1Half - lat2Half);
+ const double diffLonHalf = fabs(lon1Half - lon2Half);
+
+ if (diffLatHalf < 0.5e-8 && diffLonHalf < 0.5e-8) {
+ return 0;
+ }
+
+ double s = sin(lat1Half + lat2Half);
+ double s2 = s * s;
+ double m = _gR2 / (1.0 - _e2 * s2);
+
+ const double w = sin(diffLatHalf);
+ const double w2 = w * w;
+ const double cc = Max(1.0 - s2 - w2, 0.0); // cos(lat1Half * 2) * cos(lat2Half * 2)
+ const double z = sin(diffLonHalf);
+
+ return m * asin(sqrt(w2 + cc * z * z));
+ }
+} // namespace NGeo
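
For orientation, an illustrative sketch of the conversion and distance helpers defined above; the Minsk/Moscow coordinates and the ~677190 m expectation are taken from the unit tests later in this diff:

#include <library/cpp/geo/point.h>

#include <util/stream/output.h>

int main() {
    const NGeo::TGeoPoint minsk{27.55, 53.916667};
    const NGeo::TGeoPoint moscow{37.617778, 55.755833};

    // EPSG:3395 projection and back
    const NGeo::TMercatorPoint projected = NGeo::LLToMercator(minsk);
    const NGeo::TGeoPoint back = NGeo::MercatorToLL(projected);

    Cout << back << Endl;                                  // prints the round-tripped point as [lon, lat]
    Cout << NGeo::GeodeticDistance(minsk, moscow) << Endl; // ~677190 m, matching the unit tests below
    return 0;
}
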
diff --git a/library/cpp/geo/point.h b/library/cpp/geo/point.h
new file mode 100644
index 0000000000..70c91ab2dd
--- /dev/null
+++ b/library/cpp/geo/point.h
@@ -0,0 +1,198 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/stream/output.h>
+#include <util/string/cast.h>
+#include <util/generic/maybe.h>
+
+#include <algorithm>
+#include <cmath>
+
+namespace NGeo {
+ class TSize;
+
+ class TGeoPoint {
+ public:
+ TGeoPoint(double lon, double lat) noexcept
+ : Lon_(lon)
+ , Lat_(lat)
+ {
+ }
+
+ TGeoPoint() noexcept
+ : Lon_(BadX)
+ , Lat_(BadY)
+ {
+ }
+
+ double Lon() const noexcept {
+ return Lon_;
+ }
+
+ double Lat() const noexcept {
+ return Lat_;
+ }
+
+ float Distance(const TGeoPoint& p) const noexcept;
+
+ void swap(TGeoPoint& p) noexcept {
+ std::swap(Lon_, p.Lon_);
+ std::swap(Lat_, p.Lat_);
+ }
+
+ bool IsValid() const {
+ return (Lon_ != BadX) && (Lat_ != BadY);
+ }
+
+ /// Returns true if the point represents either the North or the South Pole
+ bool IsPole() const noexcept;
+
+ /// Returns true if the point may be shown on the Yandex Map (fits into the valid range of latitudes)
+ bool IsVisibleOnMap() const noexcept;
+
+ bool operator!() const {
+ return !IsValid();
+ }
+
+ TString ToCgiStr() const {
+ return ToString();
+ }
+
+ TString ToString(const char* delimiter = ",") const {
+ return TString::Join(::ToString(Lon_), delimiter, ::ToString(Lat_));
+ }
+
+ /**
+ * \note Parsing functions work in a safe way. They discard invalid points:
+ * 1) at the Poles and 'beyond' the Poles;
+ * 2) not belonging to the 'main' world or the +/-1 worlds to the left and right.
+ * If you need such cases, construct the TGeoPoint manually.
+ */
+
+ /// Throws TBadCastException on error
+ static TGeoPoint Parse(TStringBuf s, TStringBuf delimiter = TStringBuf(","));
+
+ /// Returns Nothing() on error
+ static TMaybe<TGeoPoint> TryParse(TStringBuf s, TStringBuf delimiter = TStringBuf(","));
+
+ private:
+ double Lon_;
+ double Lat_;
+
+ static constexpr double BadX{361.};
+ static constexpr double BadY{181.};
+ };
+
+ double GeodeticDistance(TGeoPoint p1, TGeoPoint p2);
+
+ /**
+ * \class TMercatorPoint
+ *
+ * Represents a point in EPSG:3395 projection
+ * (WGS 84 / World Mercator)
+ */
+ class TMercatorPoint {
+ public:
+ friend class TMercatorWindow;
+ friend TGeoPoint MercatorToLL(TMercatorPoint);
+
+ /**
+ * Constructs a point with the given coordinates.
+ */
+ constexpr TMercatorPoint(double x, double y) noexcept
+ : X_{x}
+ , Y_{y}
+ {
+ }
+
+ /**
+ * Constructs a point with two NaN coordinates.
+ *
+ * Should not be called directly.
+ * If your `point` variable might be undefined,
+ * declare it explicitly as TMaybe<TMercatorPoint>.
+ */
+ constexpr TMercatorPoint() noexcept
+ : X_{std::numeric_limits<double>::quiet_NaN()}
+ , Y_{std::numeric_limits<double>::quiet_NaN()}
+ {
+ }
+
+ /**
+ * Returns the X_ coordinate.
+ *
+ * The line X_ == 0 corresponds to the Prime meridian.
+ */
+ constexpr double X() const noexcept {
+ return X_;
+ }
+
+ /**
+ * Returns the Y_ coordinate.
+ *
+ * The line Y_ == 0 corresponds to the Equator.
+ */
+ constexpr double Y() const noexcept {
+ return Y_;
+ }
+
+ private:
+ bool IsDefined() const noexcept {
+ return !std::isnan(X_) && !std::isnan(Y_);
+ }
+
+ private:
+ double X_;
+ double Y_;
+ };
+
+ /**
+ * Operators
+ */
+
+ inline bool operator==(const TGeoPoint& p1, const TGeoPoint& p2) {
+ return p1.Lon() == p2.Lon() && p1.Lat() == p2.Lat();
+ }
+
+ inline bool operator==(const TMercatorPoint& p1, const TMercatorPoint& p2) {
+ return p1.X() == p2.X() && p1.Y() == p2.Y();
+ }
+
+ inline bool operator<(const TGeoPoint& p1, const TGeoPoint& p2) {
+ if (p1.Lon() != p2.Lon()) {
+ return p1.Lon() < p2.Lon();
+ }
+ return p1.Lat() < p2.Lat();
+ }
+
+ /**
+ * Conversion
+ */
+
+ namespace WGS84 {
+ /* Radius of reference ellipsoid, default to WGS 84 */
+ const double R = 6378137.0;
+ } // namespace WGS84
+
+ using TPointLL = TGeoPoint;
+ using TPointXY = TMercatorPoint;
+
+ TGeoPoint MercatorToLL(TMercatorPoint);
+ TMercatorPoint LLToMercator(TGeoPoint);
+
+ /**
+ * Input/output
+ */
+
+ TSize operator-(const TGeoPoint& p1, const TGeoPoint& p2);
+} // namespace NGeo
+
+template <>
+inline void Out<NGeo::TGeoPoint>(IOutputStream& o, const NGeo::TGeoPoint& p) {
+ o << '[' << p.Lon() << ", " << p.Lat() << ']';
+}
+
+template <>
+inline void Out<NGeo::TMercatorPoint>(IOutputStream& o, const NGeo::TMercatorPoint& p) {
+ o << '[' << p.X() << ", " << p.Y() << ']';
+}
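
A short parsing sketch for the API above (illustrative only; both calls behave as documented in the header):

#include <library/cpp/geo/point.h>

int main() {
    // Throws TBadCastException on degenerate input (poles, far-away worlds, garbage)
    const NGeo::TGeoPoint p = NGeo::TGeoPoint::Parse("27.561481,53.902496");

    // Non-throwing variant with a custom delimiter; returns Nothing() on error
    const TMaybe<NGeo::TGeoPoint> q = NGeo::TGeoPoint::TryParse("37.59 55.73", " ");

    return p.IsValid() && q.Defined() ? 0 : 1;
}
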
diff --git a/library/cpp/geo/polygon.cpp b/library/cpp/geo/polygon.cpp
new file mode 100644
index 0000000000..44e5c38b5f
--- /dev/null
+++ b/library/cpp/geo/polygon.cpp
@@ -0,0 +1,28 @@
+#include "polygon.h"
+namespace NGeo {
+ TMaybe<TGeoPolygon> TGeoPolygon::TryParse(TStringBuf s, TStringBuf llDelimiter, TStringBuf pointsDelimiter) {
+ TVector<TGeoPoint> points;
+
+ for (const auto& pointString : StringSplitter(s).SplitByString(pointsDelimiter).SkipEmpty()) {
+ auto curPoint = TGeoPoint::TryParse(pointString.Token(), llDelimiter);
+ if (!curPoint) {
+ return {};
+ }
+ points.push_back(*curPoint);
+ }
+
+ if (points.size() < 3) {
+ return {};
+ }
+
+ return TGeoPolygon(points);
+ }
+
+ TGeoPolygon TGeoPolygon::Parse(TStringBuf s, TStringBuf llDelimiter, TStringBuf pointsDelimiter) {
+ auto res = TGeoPolygon::TryParse(s, llDelimiter, pointsDelimiter);
+ if (!res) {
+ ythrow yexception() << "Can't parse polygon from input string: " << s;
+ }
+ return *res;
+ }
+} // namespace NGeo
diff --git a/library/cpp/geo/polygon.h b/library/cpp/geo/polygon.h
new file mode 100644
index 0000000000..1528345fec
--- /dev/null
+++ b/library/cpp/geo/polygon.h
@@ -0,0 +1,90 @@
+#pragma once
+
+#include "point.h"
+#include "window.h"
+
+#include <util/ysaveload.h>
+#include <util/generic/algorithm.h>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/generic/yexception.h>
+#include <util/stream/output.h>
+#include <util/string/cast.h>
+#include <util/string/join.h>
+#include <util/string/split.h>
+
+#include <algorithm>
+#include <functional>
+
+namespace NGeo {
+ class TGeoPolygon {
+ private:
+ TVector<TGeoPoint> Points_;
+ TGeoWindow Window_;
+
+ public:
+ TGeoPolygon() = default;
+
+ explicit TGeoPolygon(const TVector<TGeoPoint>& points)
+ : Points_(points)
+ {
+ CalcWindow();
+ }
+
+ const TVector<TGeoPoint>& GetPoints() const {
+ return Points_;
+ }
+
+ const TGeoWindow& GetWindow() const {
+ return Window_;
+ }
+
+ void swap(TGeoPolygon& o) noexcept {
+ Points_.swap(o.Points_);
+ Window_.swap(o.Window_);
+ }
+
+ bool IsValid() const noexcept {
+ return !Points_.empty() && Window_.IsValid();
+ }
+
+ bool operator!() const {
+ return !IsValid();
+ }
+
+ /**
+ * Tries to parse a TGeoPolygon from a string of points;
+ * coords are separated by llDelimiter, points are separated by pointsDelimiter.
+ * Returns the parsed TGeoPolygon on success, otherwise throws an exception.
+ */
+ static TGeoPolygon Parse(TStringBuf s, TStringBuf llDelimiter = ",", TStringBuf pointsDelimiter = TStringBuf(" "));
+
+ /**
+ * Tries to parse a TGeoPolygon from a string of points;
+ * coords are separated by llDelimiter, points are separated by pointsDelimiter.
+ * Returns a TMaybe with the parsed TGeoPolygon on success, otherwise returns an empty TMaybe.
+ */
+ static TMaybe<TGeoPolygon> TryParse(TStringBuf s, TStringBuf llDelimiter = ",", TStringBuf pointsDelimiter = TStringBuf(" "));
+
+ private:
+ void CalcWindow() {
+ auto getLon = std::mem_fn(&TGeoPoint::Lon);
+ double lowerX = MinElementBy(Points_.begin(), Points_.end(), getLon)->Lon();
+ double upperX = MaxElementBy(Points_.begin(), Points_.end(), getLon)->Lon();
+
+ auto getLat = std::mem_fn(&TGeoPoint::Lat);
+ double lowerY = MinElementBy(Points_.begin(), Points_.end(), getLat)->Lat();
+ double upperY = MaxElementBy(Points_.begin(), Points_.end(), getLat)->Lat();
+
+ Window_ = TGeoWindow{TGeoPoint{lowerX, lowerY}, TGeoPoint{upperX, upperY}};
+ }
+ };
+
+ inline bool operator==(const TGeoPolygon& p1, const TGeoPolygon& p2) {
+ return p1.GetPoints() == p2.GetPoints();
+ }
+
+ inline bool operator!=(const TGeoPolygon& p1, const TGeoPolygon& p2) {
+ return !(p1 == p2);
+ }
+} // namespace NGeo
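
An illustrative sketch of the polygon API above; the sample string matches the one used in polygon_ut.cpp further down in this diff:

#include <library/cpp/geo/polygon.h>

int main() {
    // "lon,lat" pairs separated by spaces; fewer than three points is a parse error
    const NGeo::TGeoPolygon poly =
        NGeo::TGeoPolygon::Parse("1.23,5.67 7.89,10.11 11.10,9.87");

    // The bounding window is precomputed from the min/max coordinates in CalcWindow()
    const NGeo::TGeoWindow& box = poly.GetWindow();

    return poly.IsValid() && box.IsValid() ? 0 : 1;
}
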
diff --git a/library/cpp/geo/size.cpp b/library/cpp/geo/size.cpp
new file mode 100644
index 0000000000..f1bd8ab763
--- /dev/null
+++ b/library/cpp/geo/size.cpp
@@ -0,0 +1,31 @@
+#include "size.h"
+
+#include "util.h"
+
+namespace NGeo {
+ const double TSize::BadWidth = -1.;
+ const double TSize::BadHeight = -1.;
+
+ namespace {
+ bool IsNonNegativeSize(double width, double height) {
+ return width >= 0. && height >= 0.;
+ }
+ } // namespace
+
+ TSize TSize::Parse(TStringBuf s, TStringBuf delimiter) {
+ const auto& [width, height] = PairFromString(s, delimiter);
+ Y_ENSURE_EX(IsNonNegativeSize(width, height), TBadCastException() << "Negative window size");
+ return {width, height};
+ }
+
+ TMaybe<TSize> TSize::TryParse(TStringBuf s, TStringBuf delimiter) {
+ std::pair<double, double> lonLat;
+ if (!TryPairFromString(lonLat, s, delimiter)) {
+ return {};
+ }
+ if (!IsNonNegativeSize(lonLat.first, lonLat.second)) {
+ return {};
+ }
+ return TSize{lonLat.first, lonLat.second};
+ }
+} // namespace NGeo
diff --git a/library/cpp/geo/size.h b/library/cpp/geo/size.h
new file mode 100644
index 0000000000..b619c6d899
--- /dev/null
+++ b/library/cpp/geo/size.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <util/generic/string.h>
+#include <util/stream/output.h>
+#include <util/string/cast.h>
+
+namespace NGeo {
+ class TSize {
+ public:
+ TSize(double width, double height) noexcept
+ : Width_(width)
+ , Height_(height)
+ {
+ }
+
+ explicit TSize(double size) noexcept
+ : Width_(size)
+ , Height_(size)
+ {
+ }
+
+ TSize() noexcept
+ : Width_(BadWidth)
+ , Height_(BadHeight)
+ {
+ }
+
+ double GetWidth() const noexcept {
+ return Width_;
+ }
+
+ double GetHeight() const noexcept {
+ return Height_;
+ }
+
+ void swap(TSize& s) noexcept {
+ std::swap(Width_, s.Width_);
+ std::swap(Height_, s.Height_);
+ }
+
+ bool IsValid() const {
+ return (Width_ != BadWidth) && (Height_ != BadHeight);
+ }
+
+ void Stretch(double multiplier) {
+ Width_ *= multiplier;
+ Height_ *= multiplier;
+ }
+
+ void Inflate(double additionX, double additionY) {
+ Width_ += additionX;
+ Height_ += additionY;
+ }
+
+ bool operator!() const {
+ return !IsValid();
+ }
+
+ TString ToCgiStr() const {
+ TString s = ToString(Width_);
+ s.append(',');
+ s.append(ToString(Height_));
+ return s;
+ }
+
+ /**
+ * Tries to parse a TSize.
+ * Returns the parsed TSize on success, otherwise throws an exception.
+ */
+ static TSize Parse(TStringBuf s, TStringBuf delimiter = TStringBuf(","));
+
+ /**
+ * Tries to parse a TSize.
+ * Returns a TMaybe with the parsed TSize on success, otherwise returns an empty TMaybe.
+ */
+ static TMaybe<TSize> TryParse(TStringBuf s, TStringBuf delimiter = TStringBuf(","));
+
+ private:
+ double Width_;
+ double Height_;
+ static const double BadWidth;
+ static const double BadHeight;
+ };
+
+ inline bool operator==(const TSize& p1, const TSize& p2) {
+ return p1.GetHeight() == p2.GetHeight() && p1.GetWidth() == p2.GetWidth();
+ }
+} // namespace NGeo
+
+template <>
+inline void Out<NGeo::TSize>(IOutputStream& o, const NGeo::TSize& s) {
+ o << '<' << s.GetWidth() << ", " << s.GetHeight() << '>';
+}
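
An illustrative sketch of TSize parsing and mutation, assuming the header is included as library/cpp/geo/size.h:

#include <library/cpp/geo/size.h>

int main() {
    // Throws TBadCastException on negative or malformed input
    NGeo::TSize spn = NGeo::TSize::Parse("0.15,0.67");

    spn.Stretch(2.);       // approximately {0.30, 1.34}
    spn.Inflate(0.1, 0.2); // approximately {0.40, 1.54}

    return spn.IsValid() ? 0 : 1;
}
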
diff --git a/library/cpp/geo/style/ya.make b/library/cpp/geo/style/ya.make
new file mode 100644
index 0000000000..f72d50f27e
--- /dev/null
+++ b/library/cpp/geo/style/ya.make
@@ -0,0 +1,8 @@
+CPP_STYLE_TEST_14()
+
+STYLE(
+ library/cpp/geo/**/*.cpp
+ library/cpp/geo/**/*.h
+)
+
+END()
diff --git a/library/cpp/geo/ut/load_save_helper_ut.cpp b/library/cpp/geo/ut/load_save_helper_ut.cpp
new file mode 100644
index 0000000000..f251f56630
--- /dev/null
+++ b/library/cpp/geo/ut/load_save_helper_ut.cpp
@@ -0,0 +1,90 @@
+#include "load_save_helper.h"
+#include "point.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/stream/str.h>
+#include <util/ysaveload.h>
+
+namespace {
+ void CheckSave(const NGeo::TGeoPoint& point) {
+ TStringStream output;
+ ::Save(&output, point);
+ TStringStream answer;
+ ::Save(&answer, static_cast<double>(point.Lon()));
+ ::Save(&answer, static_cast<double>(point.Lat()));
+ UNIT_ASSERT_EQUAL(output.Str(), answer.Str());
+ }
+
+ void CheckLoad(const double x, const double y) {
+ TStringStream input;
+ ::Save(&input, x);
+ ::Save(&input, y);
+ NGeo::TGeoPoint output;
+ ::Load(&input, output);
+
+ const double eps = 1.E-8;
+ UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lon()), x, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lat()), y, eps);
+ }
+
+ void CheckLoadAfterSavePointLL(double x, double y) {
+ NGeo::TGeoPoint answer = {x, y};
+ TStringStream iostream;
+ ::Save(&iostream, answer);
+ NGeo::TGeoPoint output;
+ ::Load(&iostream, output);
+
+ const double eps = 1.E-8;
+ UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lon()), x, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(static_cast<double>(output.Lat()), y, eps);
+ }
+
+ void CheckLoadAfterSaveWindowLL(NGeo::TGeoPoint center, NGeo::TSize size) {
+ NGeo::TGeoWindow answer = {center, size};
+ TStringStream iostream;
+ ::Save(&iostream, answer);
+ NGeo::TGeoWindow output;
+ ::Load(&iostream, output);
+ UNIT_ASSERT_EQUAL(output.GetCenter(), answer.GetCenter());
+ UNIT_ASSERT_EQUAL(output.GetSize(), answer.GetSize());
+ }
+} // namespace
+
+Y_UNIT_TEST_SUITE(TSaveLoadForPointLL) {
+ Y_UNIT_TEST(TestSave) {
+ // {27.561481, 53.902496} Minsk Lon and Lat
+ CheckSave({27.561481, 53.902496});
+ CheckSave({-27.561481, 53.902496});
+ CheckSave({27.561481, -53.902496});
+ CheckSave({-27.561481, -53.902496});
+ }
+
+ Y_UNIT_TEST(TestLoad) {
+ CheckLoad(27.561481, 53.902496);
+ CheckLoad(-27.561481, 53.902496);
+ CheckLoad(27.561481, -53.902496);
+ CheckLoad(-27.561481, -53.902496);
+ }
+
+ Y_UNIT_TEST(TestSaveLoad) {
+ CheckLoadAfterSavePointLL(27.561481, 53.902496);
+ CheckLoadAfterSavePointLL(-27.561481, 53.902496);
+ CheckLoadAfterSavePointLL(27.561481, -53.902496);
+ CheckLoadAfterSavePointLL(-27.561481, -53.902496);
+ CheckLoadAfterSavePointLL(0, 0);
+ }
+}
+
+Y_UNIT_TEST_SUITE(TSaveLoadForWindowLL) {
+ Y_UNIT_TEST(TestSave) {
+ CheckLoadAfterSaveWindowLL({27.561481, 53.902496}, {1, 2});
+ CheckLoadAfterSaveWindowLL({27.561481, 53.902496}, {2, 1});
+ CheckLoadAfterSaveWindowLL({-27.561481, 53.902496}, {1, 2});
+ CheckLoadAfterSaveWindowLL({-27.561481, 53.902496}, {2, 1});
+ CheckLoadAfterSaveWindowLL({27.561481, -53.902496}, {1, 2});
+ CheckLoadAfterSaveWindowLL({27.561481, -53.902496}, {2, 1});
+ CheckLoadAfterSaveWindowLL({-27.561481, -53.902496}, {1, 2});
+ CheckLoadAfterSaveWindowLL({-27.561481, -53.902496}, {2, 1});
+ CheckLoadAfterSaveWindowLL({0, 0}, {0, 0});
+ }
+}
diff --git a/library/cpp/geo/ut/point_ut.cpp b/library/cpp/geo/ut/point_ut.cpp
new file mode 100644
index 0000000000..bbf8f32cea
--- /dev/null
+++ b/library/cpp/geo/ut/point_ut.cpp
@@ -0,0 +1,171 @@
+#include "point.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NGeo;
+
+namespace {
+ void CheckMercator(TGeoPoint input, TMercatorPoint answer, double eps = 1.e-8) {
+ auto output = LLToMercator(input);
+ UNIT_ASSERT_DOUBLES_EQUAL(output.X(), answer.X(), eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(output.Y(), answer.Y(), eps);
+ }
+
+ void CheckGeo(TMercatorPoint input, TGeoPoint answer, double eps = 1.e-8) {
+ auto output = MercatorToLL(input);
+ UNIT_ASSERT_DOUBLES_EQUAL(output.Lon(), answer.Lon(), eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(output.Lat(), answer.Lat(), eps);
+ }
+} // namespace
+
+Y_UNIT_TEST_SUITE(TPointTest) {
+ Y_UNIT_TEST(TestGeoPointFromString) {
+ UNIT_ASSERT_EQUAL(TGeoPoint::Parse("0.15,0.67"),
+ TGeoPoint(0.15, 0.67));
+ UNIT_ASSERT_EQUAL(TGeoPoint::Parse("-52.,-27."),
+ TGeoPoint(-52., -27.));
+ UNIT_ASSERT_EQUAL(TGeoPoint::Parse("0.15 0.67", " "),
+ TGeoPoint(0.15, 0.67));
+ UNIT_ASSERT_EQUAL(TGeoPoint::Parse("-27. -52", " "),
+ TGeoPoint(-27., -52.));
+ UNIT_ASSERT_EQUAL(TGeoPoint::Parse("182,55"),
+ TGeoPoint(182., 55.));
+
+ // current behavior
+ UNIT_ASSERT(TGeoPoint::TryParse(TString{}).Empty());
+ UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("Hello,world"), TBadCastException);
+ UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("640 17", " "), TBadCastException);
+ UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("50.,100"), TBadCastException);
+ UNIT_ASSERT_EQUAL(TGeoPoint::Parse(" 0.01, 0.01"), TGeoPoint(0.01, 0.01));
+ UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("0.01 , 0.01"), TBadCastException);
+ UNIT_ASSERT_EXCEPTION(TGeoPoint::Parse("0.01, 0.01 "), TBadCastException);
+ }
+}
+
+Y_UNIT_TEST_SUITE(TConversionTest) {
+ Y_UNIT_TEST(TestConversionGeoToMercator) {
+ // test data is obtained using PostGIS:
+ // SELECT ST_AsText(ST_Transform(ST_SetSRID(ST_MakePoint(lon, lat), 4326), 3395))
+
+ CheckMercator({27.547028, 53.893962}, {3066521.12982805, 7115552.47353991});
+ CheckMercator({-70.862782, -53.002613}, {-7888408.80843475, -6949331.55685883});
+ CheckMercator({37.588536, 55.734004}, {4184336.68718463, 7470303.90973406});
+ CheckMercator({0., 0.}, {0, 0});
+ }
+
+ Y_UNIT_TEST(TestConversionMercatorToGeo) {
+ // test data is obtained using PostGIS:
+ // SELECT ST_AsText(ST_Transform(ST_SetSRID(ST_MakePoint(X, Y), 3395), 4326))
+
+ CheckGeo({3066521, 7115552}, {27.5470268337348, 53.8939594873943});
+ CheckGeo({-7888409, -6949332}, {-70.8627837208599, -53.0026154014032});
+ CheckGeo({4184336, 7470304}, {37.5885298269154, 55.734004457522});
+ CheckGeo({0, 0}, {0., 0.});
+ }
+
+ Y_UNIT_TEST(TestExactConversion) {
+ // Zero maps to zero with no epsilons
+ UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0., 0.}).X(), 0.);
+ UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0., 0.}).Y(), 0.);
+ UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0., 0.}).Lon(), 0.);
+ UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0., 0.}).Lat(), 0.);
+ }
+
+ Y_UNIT_TEST(TestPoles) {
+ UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0, 90}).Y(), std::numeric_limits<double>::infinity());
+ UNIT_ASSERT_VALUES_EQUAL(LLToMercator({0, -90}).Y(), -std::numeric_limits<double>::infinity());
+
+ UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0, std::numeric_limits<double>::infinity()}).Lat(), 90.);
+ UNIT_ASSERT_VALUES_EQUAL(MercatorToLL({0, -std::numeric_limits<double>::infinity()}).Lat(), -90.);
+ }
+
+ Y_UNIT_TEST(TestNearPoles) {
+ // Reference values were obtained using the mpmath library (arbitrary-precision floating-point arithmetic)
+ CheckMercator({0., 89.9}, {0., 44884542.157175040}, 1.e-6);
+ CheckMercator({0., 89.99}, {0., 59570746.872518855}, 1.e-5);
+ CheckMercator({0., 89.999}, {0., 74256950.065173316}, 1.e-4);
+ CheckMercator({0., 89.9999}, {0., 88943153.242600886}, 1.e-3);
+ CheckMercator({0., 89.99999}, {0., 103629356.41987618}, 1.e-1);
+ CheckMercator({0., 89.999999}, {0., 118315559.59714996}, 1.e-1);
+ CheckMercator({0., 89.9999999}, {0., 133001762.77442373}, 1.e-0);
+ CheckMercator({0., 89.99999999}, {0., 147687965.95169749}, 1.e+1);
+ CheckMercator({0., 89.9999999999999857891452847979962825775146484375}, {0., 233563773.75716050}, 1.e+7);
+
+ CheckGeo({0., 233563773.75716050}, {0., 89.9999999999999857891452847979962825775146484375}, 1.e-15);
+ CheckGeo({0., 147687965.95169749}, {0., 89.99999999}, 1.e-13);
+ CheckGeo({0., 133001762.77442373}, {0., 89.9999999}, 1.e-13);
+ CheckGeo({0., 118315559.59714996}, {0., 89.999999}, 1.e-13);
+ CheckGeo({0., 103629356.41987618}, {0., 89.99999}, 1.e-13);
+ CheckGeo({0., 88943153.242600886}, {0., 89.9999}, 1.e-13);
+ CheckGeo({0., 74256950.065173316}, {0., 89.999}, 1.e-13);
+ CheckGeo({0., 59570746.872518855}, {0., 89.99}, 1.e-13);
+ CheckGeo({0., 44884542.157175040}, {0., 89.9}, 1.e-13);
+ }
+
+ Y_UNIT_TEST(TestVisibleRange) {
+ UNIT_ASSERT(TGeoPoint(37., 55.).IsVisibleOnMap());
+ UNIT_ASSERT(!TGeoPoint(37., 86.).IsVisibleOnMap());
+ UNIT_ASSERT(TGeoPoint(37., -85.).IsVisibleOnMap());
+ UNIT_ASSERT(!TGeoPoint(37., -90.).IsVisibleOnMap());
+ }
+
+ Y_UNIT_TEST(TestRoundTripGeoMercatorGeo) {
+ auto check = [](double longitude, double latitude) {
+ auto pt = MercatorToLL(LLToMercator(TGeoPoint{longitude, latitude}));
+ UNIT_ASSERT_DOUBLES_EQUAL_C(longitude, pt.Lon(), 1.e-12, "longitude for point (" << longitude << ", " << latitude << ")");
+ UNIT_ASSERT_DOUBLES_EQUAL_C(latitude, pt.Lat(), 1.e-8, "latitude for point (" << longitude << ", " << latitude << ")");
+ };
+
+ check(37., 55.);
+ check(0.1, 0.1);
+ check(0.2, 89.9);
+ check(181., -42.);
+ check(362., -43.);
+ check(-183., -87.);
+ check(1000., -77.);
+ }
+
+ Y_UNIT_TEST(TestRoundTripMercatorGeoMercator) {
+ auto check = [](double x, double y) {
+ auto pt = LLToMercator(MercatorToLL(TMercatorPoint{x, y}));
+ UNIT_ASSERT_DOUBLES_EQUAL_C(x, pt.X(), 1.e-4, "x for point (" << x << ", " << y << ")");
+ UNIT_ASSERT_DOUBLES_EQUAL_C(y, pt.Y(), 1.e-4, "y for point (" << x << ", " << y << ")");
+ };
+
+ check(100., 200.);
+ check(-123456., 654321.);
+ check(5.e7, 1.23456789);
+ check(1.e8, -2.e7);
+ }
+}
+
+Y_UNIT_TEST_SUITE(TestDistance) {
+ Y_UNIT_TEST(TestGeodeticDistance) {
+ const TGeoPoint minsk(27.55, 53.916667);
+ const TGeoPoint moscow(37.617778, 55.755833);
+ const TGeoPoint newYork(-73.994167, 40.728333);
+ const TGeoPoint sydney(151.208333, -33.869444);
+
+ const double eps = 1.E-6; // absolute error
+
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, minsk), 0.0, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, moscow), 677190.08871321136, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, newYork), 7129091.7536358498, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(minsk, sydney), 15110861.267782301, eps);
+
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, minsk), 677190.08871321136, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, moscow), 0.0, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, newYork), 7519517.2469277605, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(moscow, sydney), 14467193.188083574, eps);
+
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, minsk), 7129091.7536358498, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, moscow), 7519517.2469277605, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, newYork), 0.0, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(newYork, sydney), 15954603.669226252, eps);
+
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, minsk), 15110861.267782301, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, moscow), 14467193.188083574, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, newYork), 15954603.669226252, eps);
+ UNIT_ASSERT_DOUBLES_EQUAL(GeodeticDistance(sydney, sydney), 0.0, eps);
+ }
+}
diff --git a/library/cpp/geo/ut/polygon_ut.cpp b/library/cpp/geo/ut/polygon_ut.cpp
new file mode 100644
index 0000000000..cd9dee9759
--- /dev/null
+++ b/library/cpp/geo/ut/polygon_ut.cpp
@@ -0,0 +1,34 @@
+#include "polygon.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NGeo;
+
+Y_UNIT_TEST_SUITE(TGeoPolygonTest) {
+ Y_UNIT_TEST(TestEmptyPolygon) {
+ TGeoPolygon empty;
+ UNIT_ASSERT(!empty);
+ UNIT_ASSERT(!empty.IsValid());
+ }
+
+ Y_UNIT_TEST(TestPolygon) {
+ TGeoPolygon polygon({{1., 2.}, {2., 1.}, {2., 4.}, {1., 3.}});
+ UNIT_ASSERT(polygon.IsValid());
+ UNIT_ASSERT_EQUAL(polygon.GetWindow(),
+ TGeoWindow(TGeoPoint(1., 1.), TGeoPoint(2., 4.)));
+ }
+
+ Y_UNIT_TEST(TestParse) {
+ UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23,5.67 7.89,10.11 11.10,9.87"}),
+ NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}}));
+ UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23,5.67 7.89,10.11 11.10,9.87 6.54,3.21"}),
+ NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}, {6.54, 3.21}}));
+
+ UNIT_ASSERT(TGeoPolygon::TryParse(TString{"1.23,5.67 7.89,10.11"}).Empty());
+ UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23+5.67~7.89+10.11~11.10+9.87"}, "+", "~"),
+ NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}}));
+
+ UNIT_ASSERT_EQUAL(TGeoPolygon::Parse(TString{"1.23+5.67+~7.89+10.11+~11.10+9.87"}, "+", "+~"),
+ NGeo::TGeoPolygon({{1.23, 5.67}, {7.89, 10.11}, {11.10, 9.87}}));
+ }
+}
diff --git a/library/cpp/geo/ut/size_ut.cpp b/library/cpp/geo/ut/size_ut.cpp
new file mode 100644
index 0000000000..41b4a2c257
--- /dev/null
+++ b/library/cpp/geo/ut/size_ut.cpp
@@ -0,0 +1,29 @@
+#include "size.h"
+
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/generic/maybe.h>
+
+using namespace NGeo;
+
+Y_UNIT_TEST_SUITE(TSizeTest) {
+ Y_UNIT_TEST(TestFromString) {
+ UNIT_ASSERT_EQUAL(TSize::Parse("0.15,0.67"), TSize(0.15, 0.67));
+ UNIT_ASSERT_EQUAL(TSize::Parse("0.15 0.67", " "), TSize(0.15, 0.67));
+
+ UNIT_ASSERT_EXCEPTION(TSize::Parse(""), TBadCastException);
+ UNIT_ASSERT_EXCEPTION(TSize::Parse("Hello,world"), TBadCastException);
+ UNIT_ASSERT_EXCEPTION(TSize::Parse("-1,-1"), TBadCastException);
+
+ UNIT_ASSERT_EQUAL(TSize::Parse("424242 50", " "), TSize(424242., 50.));
+ UNIT_ASSERT_EQUAL(TSize::Parse("50.,424242"), TSize(50., 424242.));
+ UNIT_ASSERT_EQUAL(TSize::Parse(" 0.01, 0.01"), TSize(0.01, 0.01));
+ UNIT_ASSERT_EXCEPTION(TSize::Parse("0.01 ,0.01"), TBadCastException);
+ UNIT_ASSERT_EXCEPTION(TSize::Parse("0.01,0.01 "), TBadCastException);
+ }
+
+ Y_UNIT_TEST(TestTryFromString) {
+ UNIT_ASSERT(TSize::TryParse("1,2"));
+ UNIT_ASSERT(!TSize::TryParse("-1,-2"));
+ UNIT_ASSERT(!TSize::TryParse("1,2a"));
+ }
+}
diff --git a/library/cpp/geo/ut/util_ut.cpp b/library/cpp/geo/ut/util_ut.cpp
new file mode 100644
index 0000000000..ebd86cfbd8
--- /dev/null
+++ b/library/cpp/geo/ut/util_ut.cpp
@@ -0,0 +1,36 @@
+#include <library/cpp/geo/util.h>
+
+#include <library/cpp/testing/unittest/registar.h>
+
+using namespace NGeo;
+
+Y_UNIT_TEST_SUITE(TGeoUtilTest) {
+ Y_UNIT_TEST(TestPointFromString) {
+ UNIT_ASSERT_EQUAL(PairFromString("27.56,53.90"), (std::pair<double, double>(27.56, 53.90)));
+ UNIT_ASSERT_EQUAL(PairFromString("27.56 53.90", " "), (std::pair<double, double>(27.56, 53.90)));
+ UNIT_ASSERT_EQUAL(PairFromString("27.56@@53.90", "@@"), (std::pair<double, double>(27.56, 53.90)));
+ UNIT_ASSERT_EXCEPTION(PairFromString("27.56@@53.90", "@"), TBadCastException);
+ UNIT_ASSERT_EXCEPTION(PairFromString(""), TBadCastException);
+ }
+
+ Y_UNIT_TEST(TestTryPointFromString) {
+ std::pair<double, double> point;
+
+ UNIT_ASSERT(TryPairFromString(point, "27.56,53.90"));
+ UNIT_ASSERT_EQUAL(point, (std::pair<double, double>(27.56, 53.90)));
+
+ UNIT_ASSERT(TryPairFromString(point, "27.56 53.90", " "));
+ UNIT_ASSERT_EQUAL(point, (std::pair<double, double>(27.56, 53.90)));
+
+ UNIT_ASSERT(TryPairFromString(point, "27.56@@53.90", "@@"));
+ UNIT_ASSERT_EQUAL(point, (std::pair<double, double>(27.56, 53.90)));
+
+ UNIT_ASSERT(!TryPairFromString(point, "27.56@@53.90", "@"));
+ UNIT_ASSERT(!TryPairFromString(point, ""));
+ }
+
+ Y_UNIT_TEST(TestVisibleMapBound) {
+ const double expectedLat = MercatorToLL(TMercatorPoint(0., LLToMercator(TGeoPoint(180., 0.)).X())).Lat();
+ UNIT_ASSERT_DOUBLES_EQUAL(VISIBLE_LATITUDE_BOUND, expectedLat, 1.e-14);
+ }
+}
diff --git a/library/cpp/geo/ut/window_ut.cpp b/library/cpp/geo/ut/window_ut.cpp
new file mode 100644
index 0000000000..194fb4e735
--- /dev/null
+++ b/library/cpp/geo/ut/window_ut.cpp
@@ -0,0 +1,547 @@
+#include "window.h"
+#include <library/cpp/testing/unittest/registar.h>
+#include <util/generic/ymath.h>
+
+using namespace NGeo;
+
+namespace {
+ constexpr double DEFAULT_EPS = 1.E-5;
+
+ bool CheckGeoPointEqual(const TGeoPoint& found, const TGeoPoint& expected, const double eps = DEFAULT_EPS) {
+ if (std::isnan(found.Lon()) || std::isnan(found.Lat())) {
+ Cerr << "NaNs found: (" << found.Lon() << ", " << found.Lat() << ")" << Endl;
+ return false;
+ }
+ if (Abs(found.Lon() - expected.Lon()) > eps) {
+ Cerr << "longitude differs: " << found.Lon() << " found, " << expected.Lon() << " expected" << Endl;
+ return false;
+ }
+ if (Abs(found.Lat() - expected.Lat()) > eps) {
+ Cerr << "latitude differs: " << found.Lat() << " found, " << expected.Lat() << " expected" << Endl;
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckSizeEqual(const TSize& found, const TSize& expected, const double eps = DEFAULT_EPS) {
+ if (std::isnan(found.GetWidth()) || std::isnan(found.GetHeight())) {
+ Cerr << "NaNs found: (" << found.GetWidth() << ", " << found.GetHeight() << ")" << Endl;
+ return false;
+ }
+ if (Abs(found.GetWidth() - expected.GetWidth()) > eps) {
+ Cerr << "width differs: " << found.GetWidth() << " found, " << expected.GetWidth() << " expected" << Endl;
+ return false;
+ }
+ if (Abs(found.GetHeight() - expected.GetHeight()) > eps) {
+ Cerr << "height differs: " << found.GetHeight() << " found, " << expected.GetHeight() << " expected" << Endl;
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckGeoWindowEqual(const TGeoWindow& lhs, const TGeoWindow& rhs, const double eps = DEFAULT_EPS) {
+ return CheckGeoPointEqual(lhs.GetCenter(), rhs.GetCenter(), eps) && CheckSizeEqual(lhs.GetSize(), rhs.GetSize(), eps);
+ }
+} // namespace
+
+/**
+ * TGeoWindow
+ */
+Y_UNIT_TEST_SUITE(TGeoWindowTest) {
+ Y_UNIT_TEST(TestParser) {
+ UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromCornersPoints("1.23,5.67", "7.65,3.21"),
+ TGeoWindow(TGeoPoint(1.23, 3.21), TGeoPoint(7.65, 5.67)));
+ UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromCornersPoints("1.23~5.67", "7.65~3.21", "~"),
+ TGeoWindow(TGeoPoint(1.23, 3.21), TGeoPoint(7.65, 5.67)));
+ UNIT_ASSERT_EXCEPTION(TGeoWindow::ParseFromCornersPoints("1.23~5.67", "7.65~3.21"), TBadCastException);
+
+ UNIT_ASSERT(TGeoWindow::TryParseFromCornersPoints("1.23~5.67", "7.65~3.21").Empty());
+ UNIT_ASSERT(TGeoWindow::TryParseFromCornersPoints("1.23,5.67", "7.65,3.21").Defined());
+ UNIT_ASSERT_EQUAL(TGeoWindow::TryParseFromCornersPoints("1.23,5.67", "7.65,3.21").GetRef(),
+ TGeoWindow(TGeoPoint(1.23, 3.21), TGeoPoint(7.65, 5.67)));
+ UNIT_ASSERT(TGeoWindow::TryParseFromCornersPoints("1.23+++5.67+", "7.65+++3.21+", "+++").Empty());
+
+ UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromLlAndSpn("1.23,5.67", "0.1,0.2"),
+ TGeoWindow(TGeoPoint(1.23, 5.67), TSize(0.1, 0.2)));
+ UNIT_ASSERT_EQUAL(TGeoWindow::ParseFromLlAndSpn("1.23~5.67", "0.1~0.2", "~"),
+ TGeoWindow(TGeoPoint(1.23, 5.67), TSize(0.1, 0.2)));
+ UNIT_ASSERT_EXCEPTION(TGeoWindow::ParseFromLlAndSpn("1.23~5.67", "0.1~0.2"), TBadCastException);
+ UNIT_ASSERT(TGeoWindow::TryParseFromLlAndSpn("1.23~5.67", "0.1~0.2").Empty());
+ UNIT_ASSERT(TGeoWindow::TryParseFromLlAndSpn("1.23~5.67", "0.1~0.2", "~").Defined());
+ UNIT_ASSERT_EQUAL(TGeoWindow::TryParseFromLlAndSpn("1.23~5.67", "0.1~0.2", "~").GetRef(),
+ TGeoWindow(TGeoPoint(1.23, 5.67), TSize(0.1, 0.2)));
+ }
+
+ Y_UNIT_TEST(TestConstructor) {
+ TGeoPoint center{55.50, 82.50};
+ TSize size{5.00, 3.00};
+ TGeoWindow window(center, size);
+
+ UNIT_ASSERT_EQUAL(window.GetCenter(), center);
+ UNIT_ASSERT_EQUAL(window.GetSize(), size);
+ }
+
+ Y_UNIT_TEST(TestPoles) {
+ {
+ TGeoWindow northPole{TGeoPoint{180., 90.}, TSize{1.5, 1.5}};
+ UNIT_ASSERT(CheckGeoPointEqual(northPole.GetCenter(), TGeoPoint{180., 90.}));
+ UNIT_ASSERT(CheckGeoPointEqual(northPole.GetLowerLeftCorner(), TGeoPoint{179.25, 88.5}));
+ UNIT_ASSERT(CheckGeoPointEqual(northPole.GetUpperRightCorner(), TGeoPoint{180.75, 90.0}));
+ }
+ {
+ TGeoWindow tallWindow{TGeoPoint{37., 55.}, TSize{10., 180.}};
+ UNIT_ASSERT(CheckGeoPointEqual(tallWindow.GetCenter(), TGeoPoint{37., 55.}));
+ UNIT_ASSERT(CheckGeoPointEqual(tallWindow.GetLowerLeftCorner(), TGeoPoint{32., -90.}));
+ UNIT_ASSERT(CheckGeoPointEqual(tallWindow.GetUpperRightCorner(), TGeoPoint{42., 90.}));
+ }
+ {
+ TGeoWindow world{TGeoPoint{0., 0.}, TSize{360., 180.}};
+ UNIT_ASSERT(CheckGeoPointEqual(world.GetCenter(), TGeoPoint{0., 0.}));
+ UNIT_ASSERT(CheckGeoPointEqual(world.GetLowerLeftCorner(), TGeoPoint{-180., -90.}));
+ UNIT_ASSERT(CheckGeoPointEqual(world.GetUpperRightCorner(), TGeoPoint{180., 90.}));
+ }
+ {
+ TGeoWindow world{TGeoPoint{0., 0.}, TSize{360., 360.}};
+ UNIT_ASSERT(CheckGeoPointEqual(world.GetCenter(), TGeoPoint{0., 0.}));
+ UNIT_ASSERT(CheckGeoPointEqual(world.GetLowerLeftCorner(), TGeoPoint{-180., -90.}));
+ UNIT_ASSERT(CheckGeoPointEqual(world.GetUpperRightCorner(), TGeoPoint{180., 90.}));
+ }
+ }
+
+ Y_UNIT_TEST(TestBigSize) {
+ {
+ TGeoWindow w{TGeoPoint{37., 55.}, TSize{100., 179.}};
+ UNIT_ASSERT(CheckGeoPointEqual(w.GetCenter(), TGeoPoint{37., 55.}));
+ UNIT_ASSERT(CheckGeoPointEqual(w.GetLowerLeftCorner(), TGeoPoint{-13., -89.09540675}));
+ UNIT_ASSERT(CheckGeoPointEqual(w.GetUpperRightCorner(), TGeoPoint{87., 89.90907637}));
+ }
+ }
+
+ Y_UNIT_TEST(TestCenterWhenInitWithCorners) {
+ UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{5.00, 40.00}, TGeoPoint{25.00, 80.00}).GetCenter(), TGeoPoint{15.00, 67.17797}));
+ UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{-5.00, -40.00}, TGeoPoint{-25.00, -80.00}).GetCenter(), TGeoPoint{-15.00, -67.17797}));
+ }
+
+ Y_UNIT_TEST(TestCornersWhenInitWithCenter) {
+ // check lat calc
+ UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{25.00, 50.00}, TSize{10.00, 10.00}).GetLowerLeftCorner().Lat(), 44.73927, DEFAULT_EPS);
+
+ // lat equals 90
+ UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{25.00, 50.00}, TSize{10.00, 179.99999}).GetUpperRightCorner().Lat(), 90, DEFAULT_EPS);
+
+ // lat equals -90
+ UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{25.00, -50.00}, TSize{10.00, -179.99999}).GetUpperRightCorner().Lat(), -90, DEFAULT_EPS);
+
+ // check naive lon calc
+ UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{10, 10}, TSize{10, 5}).GetLowerLeftCorner().Lon(), 5, DEFAULT_EPS);
+
+ // check lon equals 190 (no wrapping)
+ UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{20, 0}, TSize{340, 5}).GetUpperRightCorner().Lon(), 190, DEFAULT_EPS);
+
+ UNIT_ASSERT_DOUBLES_EQUAL(TGeoWindow(TGeoPoint{-40, 0}, TSize{-280, 5}).GetUpperRightCorner().Lon(), -180, DEFAULT_EPS);
+
+ // naive calculation when the point is (0, 0)
+ UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{0, 0}, TSize{160, 160}).GetLowerLeftCorner(), TGeoPoint{-80, -80}, DEFAULT_EPS));
+ UNIT_ASSERT(CheckGeoPointEqual(TGeoWindow(TGeoPoint{0, 0}, TSize{160, 160}).GetUpperRightCorner(), TGeoPoint{80, 80}, DEFAULT_EPS));
+ }
+
+ Y_UNIT_TEST(TestCenterSetter) {
+ TGeoPoint center{27.56, 53.90};
+ TGeoWindow window{};
+ window.SetCenter(center);
+ UNIT_ASSERT_EQUAL(window.GetCenter(), center);
+ }
+
+ Y_UNIT_TEST(TestEqualOperator) {
+ TGeoWindow window{TGeoPoint{27.56, 53.90}, TGeoPoint{30.35, 56.89}};
+ UNIT_ASSERT(window == window);
+
+ TGeoWindow anotherWindow{TGeoPoint{60.10, 57.90}, TGeoPoint{60.70, 58.25}};
+ UNIT_ASSERT(!(window == anotherWindow));
+ }
+
+ Y_UNIT_TEST(TestAssignmentOperator) {
+ TGeoWindow lhs{TGeoPoint{27.56, 53.90}, TGeoPoint{30.35, 53.89}};
+ TGeoWindow rhs{};
+ rhs = lhs;
+ UNIT_ASSERT_EQUAL(lhs, rhs);
+ }
+
+ Y_UNIT_TEST(TestContainsMethod) {
+ // the cases can be viewed here: https://tech.yandex.ru/maps/jsbox/2.1/rectangle
+ // (note that the first coord there is lat and the second one is lon)
+ TGeoWindow window{TGeoPoint{27.45, 53.82}, TGeoPoint{27.65, 53.97}};
+
+ // point is inside the window
+ UNIT_ASSERT(window.Contains(TGeoPoint{27.55, 53.90}));
+
+ // point is to the right of the window
+ UNIT_ASSERT(!window.Contains(TGeoPoint{27.66, 53.95}));
+
+ // point is to the left of the window
+ UNIT_ASSERT(!window.Contains(TGeoPoint{27.44, 53.95}));
+
+ // point is under the window
+ UNIT_ASSERT(!window.Contains(TGeoPoint{27.50, 53.81}));
+
+ // point is above the window
+ UNIT_ASSERT(!window.Contains(TGeoPoint{27.50, 53.98}));
+
+ // point is on border
+ UNIT_ASSERT(window.Contains(TGeoPoint{27.45, 53.86}));
+ UNIT_ASSERT(window.Contains(TGeoPoint{27.65, 53.86}));
+ UNIT_ASSERT(window.Contains(TGeoPoint{27.55, 53.82}));
+ UNIT_ASSERT(window.Contains(TGeoPoint{27.55, 53.97}));
+
+ // negative coords
+ UNIT_ASSERT(TGeoWindow(TGeoPoint{-72.17, -38.82}, TGeoPoint{-68.95, -36.70}).Contains(TGeoPoint{-70.40, -37.75}));
+
+ // special cases
+ UNIT_ASSERT(!TGeoWindow{}.Contains(TGeoPoint{60.09, 57.90}));
+
+ UNIT_ASSERT(TGeoWindow(TGeoPoint{}, TGeoPoint{27.55, 53.90}).Contains(TGeoPoint{27.55, 53.90}));
+ UNIT_ASSERT(TGeoWindow(TGeoPoint{27.55, 53.90}, TGeoPoint{}).Contains(TGeoPoint{27.55, 53.90}));
+ }
+
+ Y_UNIT_TEST(TestIntersectsMethod) {
+ // intersect only by lat
+ UNIT_ASSERT(
+ !Intersects(
+ TGeoWindow{TGeoPoint{27.60, 53.90}, TGeoPoint{27.80, 53.95}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}));
+
+ // intersect only by lon
+ UNIT_ASSERT(
+ !Intersects(
+ TGeoWindow{TGeoPoint{27.35, 54}, TGeoPoint{27.45, 54.10}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}));
+
+ // one inside another
+ UNIT_ASSERT(
+ Intersects(
+ TGeoWindow{TGeoPoint{27.35, 53.90}, TGeoPoint{27.45, 53.95}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}));
+
+ // intersection is point
+ UNIT_ASSERT(
+ !Intersects(
+ TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.70, 54.00}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}));
+
+ // intersection is segment
+ UNIT_ASSERT(
+ !Intersects(
+ TGeoWindow{TGeoPoint{27.40, 53.98}, TGeoPoint{27.70, 54.00}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}));
+
+ // intersection is area
+ UNIT_ASSERT(
+ Intersects(
+ TGeoWindow{TGeoPoint{27.40, 53.90}, TGeoPoint{27.70, 54.00}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}}));
+
+ // equal windows
+ TGeoWindow window{TGeoPoint{27.60, 53.88}, TGeoPoint{27.80, 53.98}};
+ UNIT_ASSERT(Intersects(window, window));
+ }
+
+ Y_UNIT_TEST(TestIntersectionMethod) {
+ // non-intersecting window
+ UNIT_ASSERT(
+ !(Intersection(
+ TGeoWindow{TGeoPoint{37.66, 55.66}, TGeoPoint{37.53, 55.64}},
+ TGeoWindow{TGeoPoint{37.67, 55.66}, TGeoPoint{37.69, 55.71}})));
+
+ // one inside another
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Intersection(
+ TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{10.00, 10.00}},
+ TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 2.00}})
+ .GetRef(),
+ (TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 2.00}})));
+
+ // cross
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Intersection(
+ TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{10.00, 2.00}},
+ TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 10.00}})
+ .GetRef(),
+ (TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 2.00}})));
+
+ // intersection is a point
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Intersection(
+ TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.70, 54.00}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})
+ .GetRef(),
+ (TGeoWindow{TGeoPoint{27.50, 53.98}, TSize{0, 0}})));
+
+ // intersection is a segment
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Intersection(
+ TGeoWindow{TGeoPoint{27.40, 53.98}, TGeoPoint{27.70, 54.00}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})
+ .GetRef(),
+ (TGeoWindow{TGeoPoint{27.45, 53.98}, TSize{0.10, 0}})));
+
+ // intersection is area
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Intersection(
+ TGeoWindow{TGeoPoint{27.40, 53.90}, TGeoPoint{27.70, 54.00}},
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}})
+ .GetRef(),
+ (TGeoWindow{TGeoPoint{27.40, 53.90}, TGeoPoint{27.50, 53.98}})));
+
+ // special cases
+ UNIT_ASSERT(
+ !(Intersection(
+ TGeoWindow{TGeoPoint{27.30, 53.88}, TGeoPoint{27.50, 53.98}},
+ TGeoWindow{})));
+ }
+
+ Y_UNIT_TEST(TestDistanceMethod) {
+ // one window inside another
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.80, 54.10}})
+ .Distance(TGeoWindow{TGeoPoint{27.55, 54.00}, TGeoPoint{27.70, 54.07}}),
+ 0,
+ 1.E-5);
+
+ // gap only by lon
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.60, 54.10}})
+ .Distance(TGeoWindow{TGeoPoint{27.69, 54.10}, TGeoPoint{27.90, 54.20}}),
+ 0.052773,
+ 1.E-5);
+
+ // gap only by lat
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.60, 54.10}})
+ .Distance(TGeoWindow{TGeoPoint{27.50, 54.20}, TGeoPoint{27.70, 54.30}}),
+ 0.1,
+ 1.E-5);
+
+ // gap by lon and lat; the answer can be calculated from the two previous tests
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.60, 54.10}}
+ .Distance(TGeoWindow{TGeoPoint{27.69, 54.20}, TGeoPoint{27.70, 54.30}})),
+ 0.11304,
+ 1.E-5);
+
+ // negative coords
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{-27.50, -53.98}, TGeoPoint{-27.60, -54.10}}
+ .Distance(TGeoWindow{TGeoPoint{-27.69, -54.20}, TGeoPoint{-27.70, -54.30}})),
+ 0.11304,
+ 1.E-5);
+ }
+
+ Y_UNIT_TEST(TestApproxDistanceMethod) {
+ // point inside
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 53.98}, TGeoPoint{27.80, 54.10}})
+ .GetApproxDistance(TGeoPoint{27.60, 54.05}),
+ 0,
+ 1.E-5);
+
+ // gap only by lon
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 54.00}, TGeoPoint{27.60, 54.10}})
+ .GetApproxDistance(TGeoPoint{27.70, 54.05}),
+ 6535.3,
+ 0.1);
+
+ // gap only by lat
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 54.00}, TGeoPoint{27.60, 54.10}})
+ .GetApproxDistance(TGeoPoint{27.55, 53.95}),
+ 5566.0,
+ 0.1);
+
+ // gap by lon and lat
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{27.50, 54.00}, TGeoPoint{27.60, 54.10}})
+ .GetApproxDistance(TGeoPoint{27.70, 54.20}),
+ 12900.6,
+ 0.1);
+
+ // negative coords
+ UNIT_ASSERT_DOUBLES_EQUAL(
+ (TGeoWindow{TGeoPoint{-27.50, -54.00}, TGeoPoint{-27.60, -54.10}})
+ .GetApproxDistance(TGeoPoint{-27.70, -54.20}),
+ 12900.6,
+ 0.1);
+ }
+
+ Y_UNIT_TEST(TestUnionMethod) {
+ // one inside another
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Union(
+ TGeoWindow{TGeoPoint{37.00, 55.00}, TSize{2.00, 3.00}},
+ TGeoWindow{TGeoPoint{37.10, 55.20}, TSize{1.50, 1.00}}),
+ TGeoWindow(TGeoPoint{37.00, 55.00}, TSize{2.00, 3.00})));
+
+ // non-intersecting windows
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Union(
+ TGeoWindow{TGeoPoint{37.00, 55.00}, TGeoPoint{37.10, 55.10}},
+ TGeoWindow{TGeoPoint{37.20, 55.20}, TGeoPoint{37.30, 55.30}}),
+ TGeoWindow(TGeoPoint{37.00, 55.00}, TGeoPoint{37.30, 55.30})));
+
+ // negative coords, one inside another
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Union(
+ TGeoWindow{TGeoPoint{-57.62, -20.64}, TSize{2.00, 4.00}},
+ TGeoWindow{TGeoPoint{-57.62, -20.64}, TSize{12.00, 10.00}}),
+ TGeoWindow(TGeoPoint{-57.62, -20.64}, TSize{12.00, 10.00}), 1.E-2));
+
+ // cross
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Union(
+ TGeoWindow{TGeoPoint{-3.82, 5.52}, TGeoPoint{0.10, 6.50}},
+ TGeoWindow{TGeoPoint{-1.5, 4.20}, TGeoPoint{-0.5, 7.13}}),
+ TGeoWindow(TGeoPoint{-3.82, 4.20}, TGeoPoint{0.10, 7.13})));
+
+ // special cases
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Union(
+ TGeoWindow{TGeoPoint{-3.82, 5.52}, TGeoPoint{0.10, 6.50}},
+ TGeoWindow{}),
+ TGeoWindow(TGeoPoint{-3.82, 5.52}, TGeoPoint{361., 181.})));
+
+ UNIT_ASSERT(CheckGeoWindowEqual(
+ Union(
+ TGeoWindow{},
+ TGeoWindow{TGeoPoint{-3.82, 5.52}, TGeoPoint{0.10, 6.50}}),
+ TGeoWindow(TGeoPoint{-3.82, 5.52}, TGeoPoint{361., 181.})));
+ }
+
+ Y_UNIT_TEST(TestStretchMethod) {
+ TSize size{0.5, 1};
+ TGeoPoint center{27.40, 53.90};
+ TGeoWindow window{};
+ double multiplier = 0;
+
+ // multiplier is less than 1.
+ window = {center, size};
+ multiplier = 0.5;
+
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{27.14999, 53.39699}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.65000, 54.39699}));
+
+ window.Stretch(multiplier);
+ UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{0.25, 0.5}}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{27.27499, 53.64925}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.52500, 54.14924}));
+
+ // multiplier is greater than 1.
+ window = {center, size};
+ multiplier = 2.2;
+
+ window.Stretch(multiplier);
+ UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{1.1, 2.2}}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{26.84999, 52.78545}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.95000, 54.98545}));
+
+ // invalid multiplier
+ window = {center, size};
+ multiplier = 100.;
+
+ window.Stretch(multiplier);
+ UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{50, 100}}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{2.40000, -18.88352}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{52.39999, 81.26212}));
+
+ // invalid multiplier
+ window = {center, size};
+ multiplier = 0;
+
+ window.Stretch(multiplier);
+ UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{0, 0}}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{27.39999, 53.90000}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{27.39999, 53.90000}));
+
+ // invalid multiplier
+ window = {center, size};
+ multiplier = -5.;
+
+ window.Stretch(multiplier);
+ UNIT_ASSERT(CheckGeoWindowEqual(window, TGeoWindow{center, TSize{-2.5, -5}}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetLowerLeftCorner(), TGeoPoint{28.64999, 56.32495}));
+ UNIT_ASSERT(CheckGeoPointEqual(window.GetUpperRightCorner(), TGeoPoint{26.15000, 51.32491}));
+ }
+}
+
+/**
+ * TMercatorWindow
+ */
+Y_UNIT_TEST_SUITE(TMercatorWindowTest) {
+ Y_UNIT_TEST(TestConstructor) {
+ // init with two corners
+ TMercatorPoint lowerLeft{5, 3};
+ TMercatorPoint upperRight{10, 20};
+ TMercatorWindow window{lowerLeft, upperRight};
+
+ UNIT_ASSERT_EQUAL(window.GetWidth(), 5.);
+ UNIT_ASSERT_EQUAL(window.GetHeight(), 17.);
+ UNIT_ASSERT_EQUAL(window.GetCenter(), (TMercatorPoint{7.5, 11.5}));
+
+ TMercatorPoint center{8, 12};
+ TSize size{5, 17};
+ window = {center, size};
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner().X(), 10.5);
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner().Y(), 20.5);
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner().X(), 5.5);
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner().Y(), 3.5);
+ }
+
+ Y_UNIT_TEST(TestInflateMethod) {
+ TSize size{200, 500};
+ TMercatorPoint center{441, 688};
+ TMercatorWindow window{};
+ int add = 10;
+
+ window = {center, size};
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(341, 438));
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(541, 938));
+ window.Inflate(add);
+ UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{220, 520}));
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(331, 428));
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(551, 948));
+
+ // negative coords
+ center = {-441, -688};
+ window = {center, size};
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-541, -938));
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(-341, -438));
+ window.Inflate(add);
+ UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{220, 520}));
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-551, -948));
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(-331, -428));
+
+ // size becomes negative
+ size = {6, 12};
+ center = {0, 0};
+ window = {center, size};
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-3, -6));
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(3, 6));
+
+ add = -20;
+ window.Inflate(add);
+ UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{-34, -28}));
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(17, 14));
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(-17, -14));
+ UNIT_ASSERT_EQUAL(window.GetSize(), TSize(-34, -28));
+
+ // big add param
+ size = {10, 15};
+ center = {5, 10};
+ window = {center, size};
+
+ add = static_cast<int>(1E5);
+ window.Inflate(add);
+ UNIT_ASSERT_EQUAL(window, TMercatorWindow(center, TSize{200'010, 200'015}));
+ UNIT_ASSERT_EQUAL(window.GetLowerLeftCorner(), TMercatorPoint(-100'000, -99'997.5));
+ UNIT_ASSERT_EQUAL(window.GetUpperRightCorner(), TMercatorPoint(100'010, 100'017.5));
+ }
+}
diff --git a/library/cpp/geo/ut/ya.make b/library/cpp/geo/ut/ya.make
new file mode 100644
index 0000000000..5bd891db1f
--- /dev/null
+++ b/library/cpp/geo/ut/ya.make
@@ -0,0 +1,12 @@
+UNITTEST_FOR(library/cpp/geo)
+
+SRCS(
+ load_save_helper_ut.cpp
+ polygon_ut.cpp
+ point_ut.cpp
+ size_ut.cpp
+ util_ut.cpp
+ window_ut.cpp
+)
+
+END()
diff --git a/library/cpp/geo/util.cpp b/library/cpp/geo/util.cpp
new file mode 100644
index 0000000000..e8d0fc378e
--- /dev/null
+++ b/library/cpp/geo/util.cpp
@@ -0,0 +1,34 @@
+#include "util.h"
+
+#include <math.h>
+#include <util/generic/cast.h>
+#include <util/generic/string.h>
+#include <util/string/cast.h>
+#include <utility>
+
+namespace NGeo {
+ bool TryPairFromString(std::pair<double, double>& res, TStringBuf inputStr, TStringBuf delimiter) {
+ TStringBuf lhsStr;
+ TStringBuf rhsStr;
+
+ double lhs = NAN;
+ double rhs = NAN;
+ if (
+ !inputStr.TrySplit(delimiter, lhsStr, rhsStr) ||
+ !TryFromString<double>(lhsStr, lhs) ||
+ !TryFromString<double>(rhsStr, rhs)) {
+ return false;
+ }
+
+ res = {lhs, rhs};
+ return true;
+ }
+
+ std::pair<double, double> PairFromString(TStringBuf inputStr, TStringBuf delimiter) {
+ std::pair<double, double> res;
+ if (!TryPairFromString(res, inputStr, delimiter)) {
+ ythrow TBadCastException() << "Wrong point string: " << inputStr;
+ }
+ return res;
+ }
+} // namespace NGeo
diff --git a/library/cpp/geo/util.h b/library/cpp/geo/util.h
new file mode 100644
index 0000000000..18b411e6a4
--- /dev/null
+++ b/library/cpp/geo/util.h
@@ -0,0 +1,107 @@
+#pragma once
+
+#include "point.h"
+#include "size.h"
+#include "window.h"
+
+#include <util/generic/ymath.h>
+
+namespace NGeo {
+ constexpr double MIN_LATITUDE = -90.;
+ constexpr double MAX_LATITUDE = +90.;
+ constexpr double MIN_LONGITUDE = -180.;
+ constexpr double MAX_LONGITUDE = +180.;
+ constexpr double WORLD_WIDTH = MAX_LONGITUDE - MIN_LONGITUDE;
+ constexpr double WORLD_HEIGHT = MAX_LATITUDE - MIN_LATITUDE;
+
+ // The Mercator projection is truncated at a certain latitude so that the visible world forms a square. The poles are not shown.
+ constexpr double VISIBLE_LATITUDE_BOUND = 85.084059050109785;
+
+ inline double Deg2rad(double d) {
+ return d * PI / 180;
+ }
+
+ inline double Rad2deg(double d) {
+ return d * 180 / PI;
+ }
+
+ inline double GetLongitudeFromMetersAtEquator(double meters) {
+ return Rad2deg(meters * (1. / WGS84::R));
+ }
+
+ inline double GetMetersFromDeg(double angle) {
+ return Deg2rad(angle) * NGeo::WGS84::R;
+ }
+
+ inline double GetLatCos(double latDegree) {
+ return cos(Deg2rad(latDegree));
+ }
+
+ /**
+ * Gets the inverse cosine of latitude.
+ * It is more precise than dividing two big doubles.
+ * It is safe for latitude at 90 degrees.
+ */
+ inline double GetInversedLatCosSafe(double latDegree) {
+ return 1. / Max(0.001, cos(Deg2rad(latDegree)));
+ }
+
+ /**
+ * Gets Lontitude width for given width at equator and latitude
+ */
+ inline double GetWidthAtLatitude(double widthEquator, double latDegree) {
+ return widthEquator * GetInversedLatCosSafe(latDegree);
+ }
+
+ inline double GetWidthAtLatitude(double widthEquator, const TGeoPoint& p) {
+ return GetWidthAtLatitude(widthEquator, p.Lat());
+ }
+
+ /*
+ * Returns the normalised width at the equator for the specified width at the given latitude
+ */
+
+ inline double GetWidthAtEquator(double widthAtLatitude, double latDegree) {
+ return widthAtLatitude * GetLatCos(latDegree);
+ }
+
+ inline double GetWidthAtEquator(double widthAtLatitude, const TGeoPoint& p) {
+ return GetWidthAtEquator(widthAtLatitude, p.Lat());
+ }
+
+ /*
+ * Same for size
+ */
+
+ inline TSize GetSizeAtLatitude(const TSize& sizeAtEquator, const TGeoPoint& at) {
+ return TSize(GetWidthAtLatitude(sizeAtEquator.GetWidth(), at), sizeAtEquator.GetHeight());
+ }
+
+ inline TSize GetSizeAtEquator(const TSize& sizeAtLatitude, const TGeoPoint& at) {
+ return TSize(GetWidthAtEquator(sizeAtLatitude.GetWidth(), at), sizeAtLatitude.GetHeight());
+ }
+
+ inline TGeoWindow ConstructWindowFromEquatorSize(const TGeoPoint& center, const TSize& sizeAtEquator) {
+ return TGeoWindow(center, GetSizeAtLatitude(sizeAtEquator, center));
+ }
+
+ inline double SquaredDiagonal(const NGeo::TSize& size, double latitude) {
+ return Sqr(NGeo::GetWidthAtEquator(size.GetWidth(), latitude)) + Sqr(size.GetHeight());
+ }
+
+ inline double Diagonal(const NGeo::TSize& size, double latitude) {
+ return sqrt(SquaredDiagonal(size, latitude));
+ }
+
+ /**
+ * Tries to parse two coords from a string.
+ * Returns a pair of coords on success, otherwise throws an exception.
+ */
+ std::pair<double, double> PairFromString(TStringBuf inputStr, TStringBuf delimiter = TStringBuf(","));
+
+ /**
+ * Tries to parse two coords from a string.
+ * Writes the result to the first param and returns true on success, otherwise returns false.
+ */
+ bool TryPairFromString(std::pair<double, double>& res, TStringBuf inputStr, TStringBuf delimiter = TStringBuf(","));
+} // namespace NGeo
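
An illustrative sketch of the width helpers and pair parsing above; the 60-degree numbers follow directly from the cos-based formulas (cos 60 = 0.5):

#include <library/cpp/geo/util.h>

int main() {
    // Meridians converge towards the poles: a span that is 1 degree of longitude
    // wide at the equator corresponds to ~2 degrees at 60N (1 / cos(60) == 2).
    const double atLatitude = NGeo::GetWidthAtLatitude(1.0, 60.0);      // ~2.0
    const double atEquator = NGeo::GetWidthAtEquator(atLatitude, 60.0); // back to ~1.0

    // Pair parsing used by TGeoPoint/TSize; throws TBadCastException on bad input
    const auto [lon, lat] = NGeo::PairFromString("27.56,53.90");

    return atLatitude > atEquator && lon < lat ? 0 : 1;
}
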
diff --git a/library/cpp/geo/window.cpp b/library/cpp/geo/window.cpp
new file mode 100644
index 0000000000..2ad2b61b71
--- /dev/null
+++ b/library/cpp/geo/window.cpp
@@ -0,0 +1,297 @@
+#include "window.h"
+
+#include "util.h"
+
+#include <util/generic/ylimits.h>
+#include <util/generic/ymath.h>
+#include <util/generic/maybe.h>
+
+#include <cstdlib>
+#include <utility>
+
+namespace NGeo {
+ namespace {
+ TMercatorPoint GetMiddlePoint(const TMercatorPoint& p1, const TMercatorPoint& p2) {
+ return TMercatorPoint{(p1.X() + p2.X()) / 2, (p1.Y() + p2.Y()) / 2};
+ }
+
+ struct TLatBounds {
+ double LatMin;
+ double LatMax;
+ };
+ } // namespace
+
+ bool TrySpan2LatitudeDegenerateCases(double ll, double lspan, TLatBounds& result) {
+ // TODO(sobols@): Compare with eps?
+ if (Y_UNLIKELY(lspan >= 180.)) {
+ result.LatMin = -90.;
+ result.LatMax = +90.;
+ return true;
+ }
+ if (Y_UNLIKELY(ll == +90.)) {
+ result.LatMin = ll - lspan;
+ result.LatMax = ll;
+ return true;
+ }
+ if (Y_UNLIKELY(ll == -90.)) {
+ result.LatMin = ll;
+ result.LatMax = ll + lspan;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Finds such latitudes lmin, lmax that:
+ * 1) lmin <= ll <= lmax,
+ * 2) lmax - lmin == lspan,
+ * 3) MercatorY(ll) - MercatorY(lmin) == MercatorY(lmax) - MercatorY(ll)
+ * (the ll parallel is a center between lmin and lmax parallels in Mercator projection)
+ *
+ * \returns a pair (lmin, lmax)
+ */
+ TLatBounds Span2Latitude(double ll, double lspan) {
+ TLatBounds result{};
+ if (TrySpan2LatitudeDegenerateCases(ll, lspan, result)) {
+ return result;
+ }
+
+ const double lc = Deg2rad(ll);
+ const double h = Deg2rad(lspan);
+
+ // Spherical (Pseudo) Mercator:
+ // MercatorY(lc) = R * ln(tan(lc / 2 + PI / 4)).
+ // Note that
+ // ln(a) - ln(b) = ln(a / b)
+ // That's why
+ // MercatorY(lc) - MercatorY(lmin) == MercatorY(lmin + h) - MercatorY(lc) <=>
+ // <=> tan(lc / 2 + PI / 4) / tan(lmin / 2 + PI / 4) ==
+ // == tan(lmin / 2 + h / 2 + PI / 4) / tan(lc / 2 + PI / 4).
+ // Also note that
+ // tan(x + y) == (tan(x) + tan(y)) / (1 - tan(x) * tan(y)),
+ // so
+ // tan(lmin / 2 + h / 2 + PI / 4) ==
+ // == (tan(lmin / 2 + PI / 4) + tan(h / 2)) / (1 - tan(lmin / 2 + PI / 4) * tan(h / 2))
+
+ const double yx = tan(lc / 2 + PI / 4);
+
+ // Let x be tan(lmin / 2 + PI / 4),
+ // then
+ // yx / x == (x + tan(h / 2)) / ((1 - x * tan(h / 2)) * yx),
+ // or
+ // yx^2 * (1 - x * tan(h / 2)) == (x + tan(h / 2)) * x.
+ // Now we solve a quadratic equation:
+ // x^2 + bx + c == 0
+
+ const double C = yx * yx;
+
+ const double b = (C + 1) * tan(h / 2), c = -C;
+ const double D = b * b - 4 * c;
+ const double root = (-b + sqrt(D)) / 2;
+
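+        // Sanity check: at the equator (ll == 0) yx == 1, so b == 2 * tan(h / 2) and c == -1,
+        // hence root == tan(PI / 4 - h / 4) and the result is lmin == -lspan / 2,
+        // lmax == +lspan / 2, i.e. the window is symmetric about the equator, as expected.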
+ result.LatMin = Rad2deg((atan(root) - PI / 4) * 2);
+ result.LatMax = result.LatMin + lspan;
+ return result;
+ }
+
+ void TGeoWindow::CalcCorners() {
+ if (!IsValid()) {
+ return;
+ }
+ const TLatBounds latBounds = Span2Latitude(Center_.Lat(), Size_.GetHeight());
+
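+        // Away from the poles, rebuild the box in Mercator coordinates so that Center_
+        // ends up exactly in the middle of the resulting corners.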
+ if (-90. < latBounds.LatMin && latBounds.LatMax < +90.) {
+ TMercatorPoint lowerLeftCornerM = LLToMercator(TGeoPoint(Center_.Lon() - (Size_.GetWidth() / 2), latBounds.LatMin));
+ TMercatorPoint upperRightCornerM = LLToMercator(TGeoPoint(Center_.Lon() + (Size_.GetWidth() / 2), latBounds.LatMax));
+ TMercatorPoint centerM = LLToMercator(Center_);
+
+ double w = upperRightCornerM.X() - lowerLeftCornerM.X();
+ double h = upperRightCornerM.Y() - lowerLeftCornerM.Y();
+
+ LowerLeftCorner_ = MercatorToLL(TMercatorPoint(centerM.X() - w / 2, centerM.Y() - h / 2));
+ UpperRightCorner_ = MercatorToLL(TMercatorPoint(centerM.X() + w / 2, centerM.Y() + h / 2));
+ } else {
+ LowerLeftCorner_ = TGeoPoint(Center_.Lon() - (Size_.GetWidth() / 2), latBounds.LatMin);
+ UpperRightCorner_ = TGeoPoint(Center_.Lon() + (Size_.GetWidth() / 2), latBounds.LatMax);
+ }
+ }
+
+ void TGeoWindow::CalcCenterAndSpan() {
+ if (!LowerLeftCorner_ || !UpperRightCorner_) {
+ return;
+ }
+
+ TMercatorPoint lower = LLToMercator(LowerLeftCorner_);
+ TMercatorPoint upper = LLToMercator(UpperRightCorner_);
+ TMercatorPoint center = GetMiddlePoint(lower, upper);
+ Center_ = MercatorToLL(center);
+
+ Size_ = TSize(UpperRightCorner_.Lon() - LowerLeftCorner_.Lon(),
+ UpperRightCorner_.Lat() - LowerLeftCorner_.Lat());
+ }
+
+ bool TGeoWindow::Contains(const TGeoPoint& p) const {
+ return LowerLeftCorner_.Lon() <= p.Lon() && p.Lon() <= UpperRightCorner_.Lon() &&
+ LowerLeftCorner_.Lat() <= p.Lat() && p.Lat() <= UpperRightCorner_.Lat();
+ }
+
+ double TGeoWindow::Diameter() const {
+ return Diagonal(Size_, Center_.Lat());
+ }
+
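+    // Approximate gap between two windows, in degrees: the longitude gap is scaled by
+    // cos(latitude) to account for meridian convergence; GetApproxDistance() below
+    // converts such a distance to metres.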
+ double TGeoWindow::Distance(const TGeoWindow& w) const {
+ const double minX = Max(GetLowerLeftCorner().Lon(), w.GetLowerLeftCorner().Lon());
+ const double maxX = Min(GetUpperRightCorner().Lon(), w.GetUpperRightCorner().Lon());
+ const double minY = Max(GetLowerLeftCorner().Lat(), w.GetLowerLeftCorner().Lat());
+ const double maxY = Min(GetUpperRightCorner().Lat(), w.GetUpperRightCorner().Lat());
+ double xGap = minX > maxX ? (minX - maxX) : 0.;
+ double yGap = minY > maxY ? (minY - maxY) : 0.;
+        return sqrt(Sqr(xGap * cos((minY + maxY) * 0.5 * PI / 180)) + Sqr(yGap));
+ }
+
+ double TWindowLL::GetApproxDistance(const TPointLL& point) const {
+ const double metresInDegree = WGS84::R * PI / 180;
+ return Distance(TWindowLL{point, point}) * metresInDegree;
+ }
+
+ TGeoWindow TGeoWindow::ParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter) {
+ auto leftCorner = TGeoPoint::Parse(leftCornerStr, delimiter);
+ auto rightCorner = TGeoPoint::Parse(rightCornerStr, delimiter);
+
+ return {leftCorner, rightCorner};
+ }
+
+ TMaybe<TGeoWindow> TGeoWindow::TryParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter) {
+ auto leftCorner = TGeoPoint::TryParse(leftCornerStr, delimiter);
+ auto rightCorner = TGeoPoint::TryParse(rightCornerStr, delimiter);
+ if (!leftCorner || !rightCorner) {
+ return {};
+ }
+
+ return TGeoWindow{*leftCorner, *rightCorner};
+ }
+
+ TGeoWindow TGeoWindow::ParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter) {
+ TGeoPoint ll = TGeoPoint::Parse(llStr, delimiter);
+ TSize spn = TSize::Parse(spnStr, delimiter);
+
+ return {ll, spn};
+ }
+
+ TMaybe<TGeoWindow> TGeoWindow::TryParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter) {
+ auto ll = TGeoPoint::TryParse(llStr, delimiter);
+ auto spn = TSize::TryParse(spnStr, delimiter);
+
+ if (!ll || !spn) {
+ return {};
+ }
+
+ return TGeoWindow{*ll, *spn};
+ }
+
+    /**
+ * TMercatorWindow
+ */
+
+ TMercatorWindow::TMercatorWindow() noexcept
+ : HalfWidth_{std::numeric_limits<double>::quiet_NaN()}
+ , HalfHeight_{std::numeric_limits<double>::quiet_NaN()}
+ {
+ }
+
+ TMercatorWindow::TMercatorWindow(const TMercatorPoint& center, const TSize& size) noexcept
+ : Center_{center}
+ , HalfWidth_{size.GetWidth() / 2}
+ , HalfHeight_{size.GetHeight() / 2}
+ {
+ }
+
+ TMercatorWindow::TMercatorWindow(const TMercatorPoint& firstPoint, const TMercatorPoint& secondPoint) noexcept
+ : Center_{GetMiddlePoint(firstPoint, secondPoint)}
+ , HalfWidth_{Abs(secondPoint.X() - firstPoint.X()) / 2}
+ , HalfHeight_{Abs(secondPoint.Y() - firstPoint.Y()) / 2}
+ {
+ }
+
+ bool TMercatorWindow::Contains(const TMercatorPoint& pt) const noexcept {
+ return (Center_.X() - HalfWidth_ <= pt.X()) &&
+ (pt.X() <= Center_.X() + HalfWidth_) &&
+ (Center_.Y() - HalfHeight_ <= pt.Y()) &&
+ (pt.Y() <= Center_.Y() + HalfHeight_);
+ }
+
+ /**
+ * Conversion
+ */
+
+ TMercatorWindow LLToMercator(const TGeoWindow& window) {
+ return TMercatorWindow{LLToMercator(window.GetLowerLeftCorner()), LLToMercator(window.GetUpperRightCorner())};
+ }
+
+ TGeoWindow MercatorToLL(const TMercatorWindow& window) {
+ return TGeoWindow{MercatorToLL(window.GetLowerLeftCorner()), MercatorToLL(window.GetUpperRightCorner())};
+ }
+
+ /**
+ * Operators
+ */
+
+ TMaybe<TGeoWindow> Intersection(const TGeoWindow& lhs, const TGeoWindow& rhs) {
+ const double minX = Max(lhs.GetLowerLeftCorner().Lon(), rhs.GetLowerLeftCorner().Lon());
+ const double maxX = Min(lhs.GetUpperRightCorner().Lon(), rhs.GetUpperRightCorner().Lon());
+ const double minY = Max(lhs.GetLowerLeftCorner().Lat(), rhs.GetLowerLeftCorner().Lat());
+ const double maxY = Min(lhs.GetUpperRightCorner().Lat(), rhs.GetUpperRightCorner().Lat());
+ if (minX > maxX || minY > maxY) {
+ return {};
+ }
+ return TGeoWindow(TGeoPoint(minX, minY), TGeoPoint(maxX, maxY));
+ }
+
+ TMaybe<TGeoWindow> Intersection(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs) {
+ if (!lhs || !rhs) {
+ return {};
+ }
+ return Intersection(*lhs, *rhs);
+ }
+
+ TGeoWindow Union(const TGeoWindow& lhs, const TGeoWindow& rhs) {
+ const double minX = Min(lhs.GetLowerLeftCorner().Lon(), rhs.GetLowerLeftCorner().Lon());
+ const double maxX = Max(lhs.GetUpperRightCorner().Lon(), rhs.GetUpperRightCorner().Lon());
+ const double minY = Min(lhs.GetLowerLeftCorner().Lat(), rhs.GetLowerLeftCorner().Lat());
+ const double maxY = Max(lhs.GetUpperRightCorner().Lat(), rhs.GetUpperRightCorner().Lat());
+ return TGeoWindow{TGeoPoint{minX, minY}, TGeoPoint{maxX, maxY}};
+ }
+
+ TMaybe<TGeoWindow> Union(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs) {
+ if (!lhs) {
+ return rhs;
+ }
+ if (!rhs) {
+ return lhs;
+ }
+ return Union(*lhs, *rhs);
+ }
+
+ bool Contains(const TMaybe<TGeoWindow>& window, const TGeoPoint& point) {
+ if (!window) {
+ return false;
+ }
+ return window.GetRef().Contains(point);
+ }
+
+ bool Intersects(const TGeoWindow& lhs, const TGeoWindow& rhs) {
+ bool haveHorizIntersection =
+ !(lhs.GetUpperRightCorner().Lon() <= rhs.GetLowerLeftCorner().Lon() ||
+ rhs.GetUpperRightCorner().Lon() <= lhs.GetLowerLeftCorner().Lon());
+ bool haveVertIntersection =
+ !(lhs.GetUpperRightCorner().Lat() <= rhs.GetLowerLeftCorner().Lat() ||
+ rhs.GetUpperRightCorner().Lat() <= lhs.GetLowerLeftCorner().Lat());
+ return haveHorizIntersection && haveVertIntersection;
+ }
+
+ bool Intersects(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs) {
+ if (!lhs || !rhs) {
+ return false;
+ }
+ return Intersects(*lhs, *rhs);
+ }
+} // namespace NGeo
diff --git a/library/cpp/geo/window.h b/library/cpp/geo/window.h
new file mode 100644
index 0000000000..1205d8351b
--- /dev/null
+++ b/library/cpp/geo/window.h
@@ -0,0 +1,264 @@
+#pragma once
+
+#include "point.h"
+#include "size.h"
+#include <util/generic/string.h>
+#include <util/generic/yexception.h>
+#include <util/string/cast.h>
+#include <util/generic/maybe.h>
+
+#include <algorithm>
+
+namespace NGeo {
+ class TGeoWindow {
+ public:
+        TGeoWindow() noexcept = default;
+
+ TGeoWindow(const TGeoPoint& center, const TSize& size) noexcept
+ : Center_(center)
+ , Size_(size)
+ {
+ CalcCorners();
+ }
+
+ TGeoWindow(const TGeoPoint& firstPoint, const TGeoPoint& secondPoint) noexcept
+ : LowerLeftCorner_{std::min(firstPoint.Lon(), secondPoint.Lon()),
+ std::min(firstPoint.Lat(), secondPoint.Lat())}
+ , UpperRightCorner_{std::max(firstPoint.Lon(), secondPoint.Lon()),
+ std::max(firstPoint.Lat(), secondPoint.Lat())}
+ {
+ CalcCenterAndSpan();
+ }
+
+ const TGeoPoint& GetCenter() const noexcept {
+ return Center_;
+ }
+
+ void SetCenter(const TGeoPoint& newCenter) {
+ Center_ = newCenter;
+ CalcCorners();
+ }
+
+ const TSize& GetSize() const noexcept {
+ return Size_;
+ }
+
+ void SetSize(const TSize& newSize) {
+ Size_ = newSize;
+ CalcCorners();
+ }
+
+ const TGeoPoint& GetLowerLeftCorner() const noexcept {
+ return LowerLeftCorner_;
+ }
+
+ const TGeoPoint& GetUpperRightCorner() const noexcept {
+ return UpperRightCorner_;
+ }
+
+ void swap(TGeoWindow& o) noexcept {
+ Center_.swap(o.Center_);
+ Size_.swap(o.Size_);
+ LowerLeftCorner_.swap(o.LowerLeftCorner_);
+ UpperRightCorner_.swap(o.UpperRightCorner_);
+ }
+
+ bool IsValid() const noexcept {
+ return Center_.IsValid() && Size_.IsValid();
+ }
+
+ bool Contains(const TGeoPoint&) const;
+
+ bool Contains(const TGeoWindow& w) const {
+ return Contains(w.LowerLeftCorner_) && Contains(w.UpperRightCorner_);
+ }
+
+ void Stretch(double multiplier) {
+ Size_.Stretch(multiplier);
+ CalcCorners();
+ }
+
+ void Inflate(double additionX, double additionY) {
+ Size_.Inflate(additionX * 2, additionY * 2);
+ CalcCorners();
+ }
+
+ void Inflate(double addition) {
+ Inflate(addition, addition);
+ }
+
+ bool operator!() const {
+ return !IsValid();
+ }
+
+ double Diameter() const;
+
+ double Area() const {
+ return Size_.GetHeight() * Size_.GetWidth();
+ }
+
+ double Distance(const TGeoWindow&) const;
+
+ double GetApproxDistance(const TPointLL& point) const;
+
+ /**
+         * Tries to parse a TGeoWindow from its center and span.
+         * Returns the parsed TGeoWindow on success, otherwise throws an exception.
+ */
+ static TGeoWindow ParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter = TStringBuf(","));
+
+ /**
+         * Tries to parse a TGeoWindow from two corner points.
+         * Returns the parsed TGeoWindow on success, otherwise throws an exception.
+ */
+ static TGeoWindow ParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter = TStringBuf(","));
+
+ /**
+         * Tries to parse a TGeoWindow from its center and span.
+         * Returns TMaybe with the parsed TGeoWindow on success, otherwise returns an empty TMaybe.
+ */
+ static TMaybe<TGeoWindow> TryParseFromLlAndSpn(TStringBuf llStr, TStringBuf spnStr, TStringBuf delimiter = TStringBuf(","));
+
+ /**
+         * Tries to parse a TGeoWindow from two corner points.
+         * Returns TMaybe with the parsed TGeoWindow on success, otherwise returns an empty TMaybe.
+ */
+ static TMaybe<TGeoWindow> TryParseFromCornersPoints(TStringBuf leftCornerStr, TStringBuf rightCornerStr, TStringBuf delimiter = TStringBuf(","));
+
+ private:
+ TGeoPoint Center_;
+ TSize Size_;
+ TGeoPoint LowerLeftCorner_;
+ TGeoPoint UpperRightCorner_;
+
+ void CalcCorners();
+ void CalcCenterAndSpan();
+ };
+
+ inline bool operator==(const TGeoWindow& lhs, const TGeoWindow& rhs) {
+ return lhs.GetCenter() == rhs.GetCenter() && lhs.GetSize() == rhs.GetSize();
+ }
+
+ inline bool operator!=(const TGeoWindow& p1, const TGeoWindow& p2) {
+ return !(p1 == p2);
+ }
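+
+    /**
+     * Usage sketch (illustrative values; TSize is assumed to be {width, height} in degrees):
+     *
+     *   NGeo::TGeoWindow window{NGeo::TGeoPoint{37.6, 55.7}, NGeo::TSize{0.2, 0.1}};
+     *   bool inside = window.Contains(NGeo::TGeoPoint{37.65, 55.72});  // whether the point falls inside
+     *   window.Stretch(2.);  // scale the span; corners are recalculated
+     */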
+
+ /**
+ * \class TMercatorWindow
+ *
+ * Represents a window in EPSG:3395 projection
+ * (WGS 84 / World Mercator)
+ */
+ class TMercatorWindow {
+ public:
+ TMercatorWindow() noexcept;
+ TMercatorWindow(const TMercatorPoint& center, const TSize& size) noexcept;
+ TMercatorWindow(const TMercatorPoint& firstPoint, const TMercatorPoint& secondPoint) noexcept;
+
+ const TMercatorPoint& GetCenter() const noexcept {
+ return Center_;
+ }
+
+ TSize GetHalfSize() const noexcept {
+ return {HalfWidth_, HalfHeight_};
+ }
+
+ TSize GetSize() const noexcept {
+ return {GetWidth(), GetHeight()};
+ }
+
+ double GetWidth() const noexcept {
+ return HalfWidth_ * 2;
+ }
+
+ double GetHeight() const noexcept {
+ return HalfHeight_ * 2;
+ }
+
+ TMercatorPoint GetLowerLeftCorner() const noexcept {
+ return TMercatorPoint{Center_.X() - HalfWidth_, Center_.Y() - HalfHeight_};
+ }
+
+ TMercatorPoint GetUpperRightCorner() const noexcept {
+ return TMercatorPoint{Center_.X() + HalfWidth_, Center_.Y() + HalfHeight_};
+ }
+
+ bool Contains(const TMercatorPoint& pt) const noexcept;
+
+ bool Contains(const TMercatorWindow& w) const {
+ return Contains(w.GetLowerLeftCorner()) && Contains(w.GetUpperRightCorner());
+ }
+
+ void Stretch(double multiplier) {
+ HalfWidth_ *= multiplier;
+ HalfHeight_ *= multiplier;
+ }
+
+ void Inflate(double additionX, double additionY) {
+ HalfWidth_ += additionX;
+ HalfHeight_ += additionY;
+ }
+
+ void Inflate(double addition) {
+ Inflate(addition, addition);
+ }
+
+ double Area() const {
+ return GetHeight() * GetWidth();
+ }
+
+ private:
+ bool IsDefined() const {
+ return Center_.IsDefined() && !std::isnan(HalfWidth_) && !std::isnan(HalfHeight_);
+ }
+
+ private:
+ TMercatorPoint Center_;
+ double HalfWidth_;
+ double HalfHeight_;
+ };
+
+ inline bool operator==(const TMercatorWindow& lhs, const TMercatorWindow& rhs) {
+ return lhs.GetCenter() == rhs.GetCenter() && lhs.GetHalfSize() == rhs.GetHalfSize();
+ }
+
+ inline bool operator!=(const TMercatorWindow& p1, const TMercatorWindow& p2) {
+ return !(p1 == p2);
+ }
+
+ /**
+ * Typedefs
+ * TODO(sobols@): remove
+ */
+
+ using TWindowLL = TGeoWindow;
+
+ /**
+ * Conversion
+ */
+
+ TMercatorWindow LLToMercator(const TGeoWindow&);
+ TGeoWindow MercatorToLL(const TMercatorWindow&);
+
+ /**
+ * Utility functions
+ */
+
+ bool Contains(const TMaybe<TGeoWindow>& window, const TGeoPoint& point);
+
+ TMaybe<TGeoWindow> Union(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs);
+ TGeoWindow Union(const TGeoWindow& lhs, const TGeoWindow& rhs);
+
+ TMaybe<TGeoWindow> Intersection(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs);
+ TMaybe<TGeoWindow> Intersection(const TGeoWindow& lhs, const TGeoWindow& rhs);
+
+ bool Intersects(const TGeoWindow& lhs, const TGeoWindow& rhs);
+ bool Intersects(const TMaybe<TGeoWindow>& lhs, const TMaybe<TGeoWindow>& rhs);
+} // namespace NGeo
+
+template <>
+inline void Out<NGeo::TGeoWindow>(IOutputStream& o, const NGeo::TGeoWindow& obj) {
+ o << '{' << obj.GetCenter() << ", " << obj.GetSize() << ", " << obj.GetLowerLeftCorner() << ", " << obj.GetUpperRightCorner() << "}";
+}
diff --git a/library/cpp/geo/ya.make b/library/cpp/geo/ya.make
new file mode 100644
index 0000000000..1d36003c5c
--- /dev/null
+++ b/library/cpp/geo/ya.make
@@ -0,0 +1,19 @@
+LIBRARY()
+
+SRCS(
+ bbox.cpp
+ geo.cpp
+ point.cpp
+ polygon.cpp
+ load_save_helper.cpp
+ size.cpp
+ util.cpp
+ window.cpp
+)
+
+END()
+
+RECURSE_FOR_TESTS(
+ ut
+ style
+)
diff --git a/library/cpp/geobase/CMakeLists.darwin-x86_64.txt b/library/cpp/geobase/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..b316e54e8a
--- /dev/null
+++ b/library/cpp/geobase/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,30 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geobase)
+target_link_libraries(library-cpp-geobase PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ geobase-library
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geobase PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp
+)
+generate_enum_serilization(library-cpp-geobase
+ ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp
+ INCLUDE_HEADERS
+ geobase/include/structs.hpp
+)
diff --git a/library/cpp/geobase/CMakeLists.linux-aarch64.txt b/library/cpp/geobase/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..ab3962970d
--- /dev/null
+++ b/library/cpp/geobase/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,31 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geobase)
+target_link_libraries(library-cpp-geobase PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ geobase-library
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geobase PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp
+)
+generate_enum_serilization(library-cpp-geobase
+ ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp
+ INCLUDE_HEADERS
+ geobase/include/structs.hpp
+)
diff --git a/library/cpp/geobase/CMakeLists.linux-x86_64.txt b/library/cpp/geobase/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..ab3962970d
--- /dev/null
+++ b/library/cpp/geobase/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,31 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geobase)
+target_link_libraries(library-cpp-geobase PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ geobase-library
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geobase PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp
+)
+generate_enum_serilization(library-cpp-geobase
+ ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp
+ INCLUDE_HEADERS
+ geobase/include/structs.hpp
+)
diff --git a/library/cpp/geobase/CMakeLists.txt b/library/cpp/geobase/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/geobase/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/geobase/CMakeLists.windows-x86_64.txt b/library/cpp/geobase/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..b316e54e8a
--- /dev/null
+++ b/library/cpp/geobase/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,30 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geobase)
+target_link_libraries(library-cpp-geobase PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ geobase-library
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geobase PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geobase/geobase.cpp
+)
+generate_enum_serilization(library-cpp-geobase
+ ${CMAKE_SOURCE_DIR}/geobase/include/structs.hpp
+ INCLUDE_HEADERS
+ geobase/include/structs.hpp
+)
diff --git a/library/cpp/geobase/geobase.cpp b/library/cpp/geobase/geobase.cpp
new file mode 100644
index 0000000000..24086c67a9
--- /dev/null
+++ b/library/cpp/geobase/geobase.cpp
@@ -0,0 +1,3 @@
+#include <library/cpp/geobase/lookup.hpp>
+#include <library/cpp/geobase/timezone_getter.hpp>
+#include <library/cpp/geobase/service_getter.hpp>
diff --git a/library/cpp/geobase/lookup.hpp b/library/cpp/geobase/lookup.hpp
new file mode 100644
index 0000000000..f663750ab2
--- /dev/null
+++ b/library/cpp/geobase/lookup.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <geobase/include/lookup.hpp>
+#include <geobase/include/lookup_wrapper.hpp>
+#include <geobase/include/structs.hpp>
+
+namespace NGeobase {
+ using TInitTraits = NImpl::TLookup::TInitTraits;
+
+ class TLookup: public NImpl::TLookup {
+ public:
+ using parent = NImpl::TLookup;
+
+ explicit TLookup(const std::string& datafile, const TInitTraits traits = {})
+ : parent(datafile, traits)
+ {
+ }
+ explicit TLookup(const TInitTraits traits)
+ : parent(traits)
+ {
+ }
+ explicit TLookup(const void* pData, size_t len)
+ : parent(pData, len)
+ {
+ }
+
+ ~TLookup() {
+ }
+ };
+
+ using TRegion = NImpl::TRegion;
+ using TGeolocation = NImpl::TGeolocation;
+ using TLinguistics = NImpl::TLinguistics;
+ using TGeoPoint = NImpl::TGeoPoint;
+
+ using TLookupWrapper = NImpl::TLookupWrapper;
+
+ using TId = NImpl::Id;
+ using TIdsList = NImpl::IdsList;
+ using TRegionsList = NImpl::TRegionsList;
+
+ using TIpBasicTraits = NImpl::TIpBasicTraits;
+ using TIpTraits = NImpl::TIpTraits;
+}
diff --git a/library/cpp/geobase/service_getter.hpp b/library/cpp/geobase/service_getter.hpp
new file mode 100644
index 0000000000..e088081706
--- /dev/null
+++ b/library/cpp/geobase/service_getter.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <geobase/include/service_getter.hpp>
+
+namespace NGeobase {
+ using TServiceGetter = NImpl::TServiceGetter;
+}
diff --git a/library/cpp/geobase/timezone_getter.hpp b/library/cpp/geobase/timezone_getter.hpp
new file mode 100644
index 0000000000..5749f1e3d6
--- /dev/null
+++ b/library/cpp/geobase/timezone_getter.hpp
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <geobase/include/timezone_getter.hpp>
+#include <geobase/include/structs.hpp>
+
+namespace NGeobase {
+ using TTimezone = NImpl::TTimezone;
+ using TTimezoneGetter = NImpl::TTimezoneGetter;
+}
diff --git a/library/cpp/geobase/ya.make b/library/cpp/geobase/ya.make
new file mode 100644
index 0000000000..4a73974903
--- /dev/null
+++ b/library/cpp/geobase/ya.make
@@ -0,0 +1,13 @@
+LIBRARY()
+
+SRCS(
+ library/cpp/geobase/geobase.cpp
+)
+
+PEERDIR(
+ geobase/library
+)
+
+GENERATE_ENUM_SERIALIZATION(geobase/include/structs.hpp)
+
+END()
diff --git a/library/cpp/geohash/CMakeLists.darwin-x86_64.txt b/library/cpp/geohash/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..dfcb278a1f
--- /dev/null
+++ b/library/cpp/geohash/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,32 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geohash)
+target_link_libraries(library-cpp-geohash PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-geo
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geohash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp
+)
+generate_enum_serilization(library-cpp-geohash
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h
+ GEN_HEADER
+ ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h
+ INCLUDE_HEADERS
+ library/cpp/geohash/direction.h
+)
diff --git a/library/cpp/geohash/CMakeLists.linux-aarch64.txt b/library/cpp/geohash/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..a907311df0
--- /dev/null
+++ b/library/cpp/geohash/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,33 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geohash)
+target_link_libraries(library-cpp-geohash PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-geo
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geohash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp
+)
+generate_enum_serilization(library-cpp-geohash
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h
+ GEN_HEADER
+ ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h
+ INCLUDE_HEADERS
+ library/cpp/geohash/direction.h
+)
diff --git a/library/cpp/geohash/CMakeLists.linux-x86_64.txt b/library/cpp/geohash/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..a907311df0
--- /dev/null
+++ b/library/cpp/geohash/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,33 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geohash)
+target_link_libraries(library-cpp-geohash PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-geo
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geohash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp
+)
+generate_enum_serilization(library-cpp-geohash
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h
+ GEN_HEADER
+ ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h
+ INCLUDE_HEADERS
+ library/cpp/geohash/direction.h
+)
diff --git a/library/cpp/geohash/CMakeLists.txt b/library/cpp/geohash/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/geohash/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/geohash/CMakeLists.windows-x86_64.txt b/library/cpp/geohash/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..dfcb278a1f
--- /dev/null
+++ b/library/cpp/geohash/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,32 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-geohash)
+target_link_libraries(library-cpp-geohash PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-geo
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-geohash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/geohash.cpp
+)
+generate_enum_serilization(library-cpp-geohash
+ ${CMAKE_SOURCE_DIR}/library/cpp/geohash/direction.h
+ GEN_HEADER
+ ${CMAKE_BINARY_DIR}/library/cpp/geohash/direction.h_serialized.h
+ INCLUDE_HEADERS
+ library/cpp/geohash/direction.h
+)
diff --git a/library/cpp/geohash/direction.h b/library/cpp/geohash/direction.h
new file mode 100644
index 0000000000..88a3e6061d
--- /dev/null
+++ b/library/cpp/geohash/direction.h
@@ -0,0 +1,14 @@
+#pragma once
+
+namespace NGeoHash {
+ enum EDirection {
+ NORTH = 0,
+ NORTH_EAST,
+ EAST,
+ SOUTH_EAST,
+ SOUTH,
+ SOUTH_WEST,
+ WEST,
+ NORTH_WEST,
+ };
+}
diff --git a/library/cpp/geohash/geohash.cpp b/library/cpp/geohash/geohash.cpp
new file mode 100644
index 0000000000..6c6d65acab
--- /dev/null
+++ b/library/cpp/geohash/geohash.cpp
@@ -0,0 +1,413 @@
+#include "geohash.h"
+
+#include <util/generic/xrange.h>
+
+namespace {
+ using TNeighbourDescriptors = NGeoHash::TNeighbours<TMaybe<NGeoHash::TGeoHashDescriptor>>;
+ const auto directions = GetEnumAllValues<NGeoHash::EDirection>();
+
+ const auto doubleEps = std::numeric_limits<double>::epsilon();
+
+ const NGeoHash::TBoundingBoxLL& GetGlobalBBox() {
+ static const NGeoHash::TBoundingBoxLL globalLimits({-180, -90}, {180, 90});
+ return globalLimits;
+ }
+
+ const TStringBuf base32EncodeTable = "0123456789bcdefghjkmnpqrstuvwxyz";
+
+ const ui64 base32DecodeMask = 0x1F;
+ constexpr int base32DecodeTableSize = 128;
+
+ using TBase32DecodeTable = std::array<TMaybe<i8>, base32DecodeTableSize>;
+
+ TBase32DecodeTable MakeBase32DecodeTable() {
+ TBase32DecodeTable result;
+ result.fill(Nothing());
+ for (auto i : xrange(base32EncodeTable.size())) {
+ result[base32EncodeTable[i]] = i;
+ }
+ return result;
+ }
+
+ const TBase32DecodeTable base32DecodeTable = MakeBase32DecodeTable();
+}
+
+namespace NGeoHash {
+ static const ui8 maxSteps = 62;
+ static const ui8 maxPrecision = TGeoHashDescriptor::StepsToPrecision(maxSteps); // 12
+
+ static const TNeighbours<std::pair<i8, i8>> neighborBitMoves = {
+        // {latMove, lonMove}, indexed by EDirection
+        {1, 0},   // NORTH
+        {1, 1},   // NORTH_EAST
+        {0, 1},   // EAST
+        {-1, 1},  // SOUTH_EAST
+        {-1, 0},  // SOUTH
+        {-1, -1}, // SOUTH_WEST
+        {0, -1},  // WEST
+        {1, -1},  // NORTH_WEST
+ };
+
+ ui8 TGeoHashDescriptor::StepsToPrecision(ui8 steps) {
+ return steps / StepsPerPrecisionUnit;
+ }
+
+ ui8 TGeoHashDescriptor::PrecisionToSteps(ui8 precision) {
+ return precision * StepsPerPrecisionUnit;
+ }
+
+    /* Steps interleave starting from lon, so for 5 steps, 3 are lon-steps and 2 are lat-steps.
+     * Thus there are ceil(steps / 2) lon-steps and floor(steps / 2) lat-steps. */
+ std::pair<ui8, ui8> TGeoHashDescriptor::LatLonSteps() const {
+ return std::make_pair<ui8, ui8>(Steps / 2, (Steps + 1) / 2);
+ }
+
+ struct TMagicNumber {
+ ui64 Mask;
+ ui8 Shift;
+ };
+
+ /* Interleave lower bits of x and y, so the bits of x
+ * are in the even positions and bits from y in the odd.
+ * e.g. Interleave64(0b101, 0b110) => 0b111001
+ * From: https://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
+ */
+ ui64 TGeoHashDescriptor::Interleave64(ui32 x, ui32 y) {
+ // attention: magic numbers
+ constexpr TMagicNumber mortonMagicNumbers[] = {
+ {0x0000FFFF0000FFFF, 16},
+ {0x00FF00FF00FF00FF, 8},
+ {0x0F0F0F0F0F0F0F0F, 4},
+ {0x3333333333333333, 2},
+ {0x5555555555555555, 1}};
+
+ ui64 x64 = x;
+ ui64 y64 = y;
+
+ for (const auto& magicNumber : mortonMagicNumbers) {
+ x64 = (x64 | (x64 << magicNumber.Shift)) & magicNumber.Mask;
+ y64 = (y64 | (y64 << magicNumber.Shift)) & magicNumber.Mask;
+ }
+ return x64 | (y64 << 1);
+ }
+
+    /* Reverse the interleave process, returning (even bits, odd bits):
+     * Deinterleave64(0b111001) => (0b101, 0b110)
+     * derived from http://stackoverflow.com/questions/4909263 */
+ std::pair<ui32, ui32> TGeoHashDescriptor::Deinterleave64(ui64 z) {
+ constexpr TMagicNumber demortonMagicNumbers[] = {
+ {0x5555555555555555ULL, 0},
+ {0x3333333333333333ULL, 1},
+ {0x0F0F0F0F0F0F0F0FULL, 2},
+ {0x00FF00FF00FF00FFULL, 4},
+ {0x0000FFFF0000FFFFULL, 8},
+ {0x00000000FFFFFFFFULL, 16}};
+
+ ui64 x = z;
+ ui64 y = z >> 1;
+
+ for (const auto& magicNumber : demortonMagicNumbers) {
+ x = (x | (x >> magicNumber.Shift)) & magicNumber.Mask;
+ y = (y | (y >> magicNumber.Shift)) & magicNumber.Mask;
+ }
+
+ return std::make_pair(x, y);
+ }
+
+ std::pair<ui32, ui32> TGeoHashDescriptor::LatLonBits() const {
+ auto deinterleaved = Deinterleave64(Bits);
+
+ if (Steps % 2) {
+ DoSwap(deinterleaved.first, deinterleaved.second);
+ }
+ return deinterleaved;
+ }
+
+ void TGeoHashDescriptor::SetLatLonBits(ui32 latBits, ui32 lonBits) {
+ if (Steps % 2) {
+ Bits = Interleave64(lonBits, latBits);
+ } else {
+ Bits = Interleave64(latBits, lonBits);
+ }
+ }
+
+ void TGeoHashDescriptor::InitFromLatLon(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps) {
+ Steps = steps;
+ if (Steps > maxSteps) {
+ ythrow yexception() << "Invalid steps: available values: 0.." << ::ToString(maxSteps);
+ }
+
+ if (limits.Width() < doubleEps || limits.Height() < doubleEps) {
+ ythrow yexception() << "Invalid limits: min/max for one of coordinates are equal";
+ }
+
+ if (latitude < limits.GetMinY() || latitude > limits.GetMaxY() || longitude < limits.GetMinX() || longitude > limits.GetMaxX()) {
+ ythrow yexception() << "Invalid point (" << latitude << ", " << longitude << "): outside of limits";
+ }
+
+ double lat01 = (latitude - limits.GetMinY()) / limits.Height();
+ double lon01 = (longitude - limits.GetMinX()) / limits.Width();
+
+ auto llSteps = LatLonSteps();
+
+ /* convert to fixed point based on the step size */
+ lat01 *= (1 << llSteps.first);
+ lon01 *= (1 << llSteps.second);
+
+ /* If lon_steps > lat_step, last bit is lon-bit, otherwise last bit is lat-bit*/
+ SetLatLonBits(lat01, lon01);
+ }
+
+ TGeoHashDescriptor::TGeoHashDescriptor(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps) {
+ InitFromLatLon(latitude, longitude, limits, steps);
+ }
+
+ TGeoHashDescriptor::TGeoHashDescriptor(double latitude, double longitude, ui8 steps) {
+ InitFromLatLon(latitude, longitude, GetGlobalBBox(), steps);
+ }
+
+ TGeoHashDescriptor::TGeoHashDescriptor(const NGeo::TPointLL& point, const TBoundingBoxLL& limits, ui8 steps) {
+ InitFromLatLon(point.Lat(), point.Lon(), limits, steps);
+ }
+
+ TGeoHashDescriptor::TGeoHashDescriptor(const NGeo::TPointLL& point, ui8 steps) {
+ InitFromLatLon(point.Lat(), point.Lon(), GetGlobalBBox(), steps);
+ }
+
+ TGeoHashDescriptor::TGeoHashDescriptor(const TString& hashString) {
+ if (hashString.size() > maxPrecision) {
+ ythrow yexception() << "hashString is too long: max length is " << ::ToString(maxPrecision);
+ }
+
+ Bits = 0;
+ for (auto c : hashString) {
+ Bits <<= StepsPerPrecisionUnit;
+ Y_ENSURE(c >= 0);
+ const auto decodedChar = base32DecodeTable[c];
+ Y_ENSURE(decodedChar.Defined());
+ Bits |= decodedChar.GetRef();
+ }
+
+ Steps = PrecisionToSteps(hashString.size());
+ }
+
+ ui64 TGeoHashDescriptor::GetBits() const {
+ return Bits;
+ }
+
+ ui8 TGeoHashDescriptor::GetSteps() const {
+ return Steps;
+ }
+
+ TString TGeoHashDescriptor::ToString() const {
+ auto precision = StepsToPrecision(Steps);
+
+ TStringStream stream;
+
+ auto bits = Bits;
+ auto activeSteps = PrecisionToSteps(precision);
+
+ bits >>= (Steps - activeSteps);
+ for (auto i : xrange(precision)) {
+ auto ix = (bits >> (StepsPerPrecisionUnit * ((precision - i - 1)))) & base32DecodeMask;
+ stream << base32EncodeTable[ix];
+ }
+
+ return stream.Str();
+ }
+
+ TBoundingBoxLL TGeoHashDescriptor::ToBoundingBox(const TBoundingBoxLL& limits) const {
+ auto llBits = LatLonBits();
+ auto llSteps = LatLonSteps();
+
+ double latMultiplier = limits.Height() / (1ull << llSteps.first);
+ double lonMultiplier = limits.Width() / (1ull << llSteps.second);
+
+ return {
+ {
+ limits.GetMinX() + lonMultiplier * llBits.second,
+ limits.GetMinY() + latMultiplier * llBits.first,
+ },
+ {
+ limits.GetMinX() + lonMultiplier * (llBits.second + 1),
+ limits.GetMinY() + latMultiplier * (llBits.first + 1),
+ }};
+ }
+
+ TBoundingBoxLL TGeoHashDescriptor::ToBoundingBox() const {
+ return ToBoundingBox(GetGlobalBBox());
+ }
+
+ NGeo::TPointLL TGeoHashDescriptor::ToPoint(const TBoundingBoxLL& limits) const {
+ auto boundingBox = ToBoundingBox(limits);
+ return {
+ boundingBox.GetMinX() + boundingBox.Width() / 2,
+ boundingBox.GetMinY() + boundingBox.Height() / 2};
+ }
+
+ NGeo::TPointLL TGeoHashDescriptor::ToPoint() const {
+ return ToPoint(GetGlobalBBox());
+ }
+
+ TMaybe<TGeoHashDescriptor> TGeoHashDescriptor::GetNeighbour(EDirection direction) const {
+ TGeoHashDescriptor result(0, Steps);
+ auto llBits = LatLonBits();
+ auto llSteps = LatLonSteps();
+ std::pair<i8, i8> bitMove = neighborBitMoves[direction];
+
+ auto newLatBits = llBits.first + bitMove.first;
+ auto newLonBits = llBits.second + bitMove.second;
+
+ // Overflow in lat means polar, so return Nothing
+ if (newLatBits >> llSteps.first != 0) {
+ return Nothing();
+ }
+
+ // Overflow in lon means 180-meridian, so just remove overflowed bits
+ newLonBits &= ((1 << llSteps.second) - 1);
+ result.SetLatLonBits(newLatBits, newLonBits);
+ return result;
+ }
+
+ TNeighbourDescriptors TGeoHashDescriptor::GetNeighbours() const {
+ TNeighbourDescriptors result;
+ auto llBits = LatLonBits();
+ auto llSteps = LatLonSteps();
+ std::pair<i8, i8> bitMove;
+
+ for (auto direction : directions) {
+ bitMove = neighborBitMoves[direction];
+
+ auto newLatBits = llBits.first + bitMove.first;
+ auto newLonBits = llBits.second + bitMove.second;
+
+ // Overflow in lat means polar, so put Nothing
+ if (newLatBits >> llSteps.first != 0) {
+ result[direction] = Nothing();
+ } else {
+ result[direction] = TGeoHashDescriptor(0, Steps);
+ // Overflow in lon means 180-meridian, so just remove overflowed bits
+ newLonBits &= ((1 << llSteps.second) - 1);
+ result[direction]->SetLatLonBits(newLatBits, newLonBits);
+ }
+ }
+
+ return result;
+ }
+
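+    // E.g. GetChildren(StepsPerPrecisionUnit) on a 25-step cell yields its 32 sub-cells of
+    // 30 steps each, the same cells as the 32 one-character extensions produced by the
+    // string-based GetChildren() below.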
+    TVector<TGeoHashDescriptor> TGeoHashDescriptor::GetChildren(ui8 steps) const {
+ TVector<TGeoHashDescriptor> children(Reserve(1 << steps));
+ ui8 childrenSteps = steps + Steps;
+ auto parentBits = Bits << steps;
+ if (childrenSteps > maxSteps) {
+ ythrow yexception() << "Resulting geohash steps are too big, available values: 0.." << ::ToString(maxSteps);
+ }
+ for (auto residue : xrange(1 << steps)) {
+ children.emplace_back(parentBits | residue, childrenSteps);
+ }
+ return children;
+ }
+
+ /* Functions */
+
+ ui64 Encode(double latitude, double longitude, ui8 precision) {
+ auto descr = TGeoHashDescriptor(
+ latitude, longitude, TGeoHashDescriptor::PrecisionToSteps(precision));
+ return descr.GetBits();
+ }
+ ui64 Encode(const NGeo::TPointLL& point, ui8 precision) {
+ return TGeoHashDescriptor(
+ point, TGeoHashDescriptor::PrecisionToSteps(precision))
+ .GetBits();
+ }
+
+ TString EncodeToString(double latitude, double longitude, ui8 precision) {
+ return TGeoHashDescriptor(
+ latitude, longitude, TGeoHashDescriptor::PrecisionToSteps(precision))
+ .ToString();
+ }
+ TString EncodeToString(const NGeo::TPointLL& point, ui8 precision) {
+ return TGeoHashDescriptor(
+ point, TGeoHashDescriptor::PrecisionToSteps(precision))
+ .ToString();
+ }
+
+ NGeo::TPointLL DecodeToPoint(const TString& hashString) {
+ return TGeoHashDescriptor(hashString).ToPoint();
+ }
+ NGeo::TPointLL DecodeToPoint(ui64 hash, ui8 precision) {
+ return TGeoHashDescriptor(hash, TGeoHashDescriptor::PrecisionToSteps(precision)).ToPoint();
+ }
+
+ TBoundingBoxLL DecodeToBoundingBox(const TString& hashString) {
+ return TGeoHashDescriptor(hashString).ToBoundingBox();
+ }
+
+ TBoundingBoxLL DecodeToBoundingBox(ui64 hash, ui8 precision) {
+ return TGeoHashDescriptor(hash, TGeoHashDescriptor::PrecisionToSteps(precision)).ToBoundingBox();
+ }
+
+ TMaybe<ui64> GetNeighbour(ui64 hash, EDirection direction, ui8 precision) {
+ auto neighbour = TGeoHashDescriptor(
+ hash, TGeoHashDescriptor::PrecisionToSteps(precision))
+ .GetNeighbour(direction);
+
+ if (neighbour.Defined()) {
+ return neighbour->GetBits();
+ } else {
+ return Nothing();
+ }
+ }
+
+ TMaybe<TString> GetNeighbour(const TString& hashString, EDirection direction) {
+ auto neighbour = TGeoHashDescriptor(hashString).GetNeighbour(direction);
+ if (neighbour.Defined()) {
+ return neighbour->ToString();
+ } else {
+ return Nothing();
+ }
+ }
+
+ TGeoHashBitsNeighbours GetNeighbours(ui64 hash, ui8 precision) {
+ TGeoHashBitsNeighbours result;
+
+ auto neighbours = TGeoHashDescriptor(
+ hash, TGeoHashDescriptor::PrecisionToSteps(precision))
+ .GetNeighbours();
+
+ for (auto direction : directions) {
+ if (neighbours[direction].Defined()) {
+ result[direction] = neighbours[direction]->GetBits();
+ } else {
+ result[direction] = Nothing();
+ }
+ }
+
+ return result;
+ }
+
+ TGeoHashStringNeighbours GetNeighbours(const TString& hashString) {
+ TGeoHashStringNeighbours result;
+
+ auto neighbours = TGeoHashDescriptor(
+ hashString)
+ .GetNeighbours();
+
+ for (auto direction : directions) {
+ if (neighbours[direction].Defined()) {
+ result[direction] = neighbours[direction]->ToString();
+ } else {
+ result[direction] = Nothing();
+ }
+ }
+ return result;
+ }
+
+ TVector<TString> GetChildren(const TString& hashString) {
+ TVector<TString> result(Reserve(base32EncodeTable.size()));
+
+ for (auto ch : base32EncodeTable) {
+ result.push_back(hashString + ch);
+ }
+ return result;
+ }
+}
diff --git a/library/cpp/geohash/geohash.h b/library/cpp/geohash/geohash.h
new file mode 100644
index 0000000000..7d270612e8
--- /dev/null
+++ b/library/cpp/geohash/geohash.h
@@ -0,0 +1,123 @@
+#pragma once
+
+/**
+ * @file
+ * @brief Strong (because it works) and independent (of contrib/libs/geohash) GeoHash implementation
+ * GeoHash algo: https://en.wikipedia.org/wiki/Geohash
+ * Useful links:
+ * 1. http://geohash.org - Main Site
+ * 2. https://dou.ua/lenta/articles/geohash - Geohash-based geopoint clustering
+ * 3. http://www.movable-type.co.uk/scripts/geohash.html - bidirectional encoding and visualization
+ */
+#include <library/cpp/geohash/direction.h>
+#include <library/cpp/geohash/direction.h_serialized.h>
+
+#include <library/cpp/geo/geo.h>
+
+#include <util/generic/maybe.h>
+#include <util/generic/string.h>
+#include <util/system/types.h>
+
+#include <array>
+
+namespace NGeoHash {
+ using TBoundingBoxLL = NGeo::TGeoBoundingBox;
+ static constexpr auto directionsCount = GetEnumItemsCount<EDirection>();
+
+ template <class T>
+ class TNeighbours: public std::array<T, directionsCount> {
+ public:
+ TNeighbours() = default;
+
+ TNeighbours(std::initializer_list<T> list) {
+ Y_ASSERT(list.size() == directionsCount);
+ std::copy(list.begin(), list.end(), std::array<T, directionsCount>::begin());
+ }
+
+ const T& operator[](EDirection direction) const {
+ return std::array<T, directionsCount>::operator[](static_cast<size_t>(direction));
+ }
+
+ T& operator[](EDirection direction) {
+ return std::array<T, directionsCount>::operator[](static_cast<size_t>(direction));
+ }
+ };
+
+ class TGeoHashDescriptor {
+ public:
+ TGeoHashDescriptor() noexcept
+ : Bits(0)
+ , Steps(0)
+ {
+ }
+
+ TGeoHashDescriptor(ui64 bits, ui8 steps) noexcept
+ : Bits(bits)
+ , Steps(steps)
+ {
+ }
+
+ TGeoHashDescriptor(double latitude, double longitude, ui8 steps);
+ TGeoHashDescriptor(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps);
+ TGeoHashDescriptor(const NGeo::TPointLL& point, ui8 steps);
+ TGeoHashDescriptor(const NGeo::TPointLL& point, const TBoundingBoxLL& limits, ui8 steps);
+
+ explicit TGeoHashDescriptor(const TString& hashString);
+
+ ui64 GetBits() const;
+ ui8 GetSteps() const;
+
+ TString ToString() const;
+
+ NGeo::TPointLL ToPoint(const TBoundingBoxLL& limits) const;
+ NGeo::TPointLL ToPoint() const;
+
+ TBoundingBoxLL ToBoundingBox(const TBoundingBoxLL& limits) const;
+ TBoundingBoxLL ToBoundingBox() const;
+
+ TMaybe<TGeoHashDescriptor> GetNeighbour(EDirection direction) const;
+ TNeighbours<TMaybe<TGeoHashDescriptor>> GetNeighbours() const;
+
+        TVector<TGeoHashDescriptor> GetChildren(ui8 steps = StepsPerPrecisionUnit) const;
+
+ static ui8 StepsToPrecision(ui8 steps);
+ static ui8 PrecisionToSteps(ui8 precision);
+
+ private:
+ void InitFromLatLon(double latitude, double longitude, const TBoundingBoxLL& limits, ui8 steps);
+ std::pair<ui8, ui8> LatLonSteps() const;
+ std::pair<ui32, ui32> LatLonBits() const;
+ void SetLatLonBits(ui32 latBits, ui32 lonBits);
+ static ui64 Interleave64(ui32 x, ui32 y);
+ static std::pair<ui32, ui32> Deinterleave64(ui64 interleaved);
+
+ private:
+ static const ui8 StepsPerPrecisionUnit = 5;
+ ui64 Bits;
+ ui8 Steps;
+ };
+
+ ui64 Encode(double latitude, double longitude, ui8 precision);
+ ui64 Encode(const NGeo::TPointLL& point, ui8 precision);
+
+ TString EncodeToString(double latitude, double longitude, ui8 precision);
+ TString EncodeToString(const NGeo::TPointLL& point, ui8 precision);
+
+ NGeo::TPointLL DecodeToPoint(const TString& hashString);
+ NGeo::TPointLL DecodeToPoint(ui64 hash, ui8 precision);
+
+ TBoundingBoxLL DecodeToBoundingBox(const TString& hashString);
+ TBoundingBoxLL DecodeToBoundingBox(ui64 hash, ui8 precision);
+
+ TMaybe<ui64> GetNeighbour(ui64 hash, EDirection direction, ui8 precision);
+ TMaybe<TString> GetNeighbour(const TString& hashString, EDirection direction);
+
+ using TGeoHashBitsNeighbours = TNeighbours<TMaybe<ui64>>;
+ using TGeoHashStringNeighbours = TNeighbours<TMaybe<TString>>;
+
+ TGeoHashBitsNeighbours GetNeighbours(ui64 hash, ui8 precision);
+ TGeoHashStringNeighbours GetNeighbours(const TString& hashString);
+
+ TVector<TString> GetChildren(const TString& hashString);
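+
+    /**
+     * Usage sketch (illustrative values; the point is {lon, lat}):
+     *
+     *   NGeo::TPointLL point{37.62, 55.75};
+     *   TString hash = EncodeToString(point, 6);      // 6-character geohash
+     *   NGeo::TPointLL center = DecodeToPoint(hash);  // center of the cell containing `point`
+     *   auto neighbours = GetNeighbours(hash);        // 8 adjacent cells, Nothing() beyond the poles
+     */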
+
+} /* namespace NGeoHash */
diff --git a/library/cpp/geohash/ya.make b/library/cpp/geohash/ya.make
new file mode 100644
index 0000000000..3350ca1cc6
--- /dev/null
+++ b/library/cpp/geohash/ya.make
@@ -0,0 +1,13 @@
+LIBRARY()
+
+PEERDIR(
+ library/cpp/geo
+)
+
+SRCS(
+ geohash.cpp
+)
+
+GENERATE_ENUM_SERIALIZATION_WITH_HEADER(direction.h)
+
+END()
diff --git a/library/cpp/ipreg/CMakeLists.darwin-x86_64.txt b/library/cpp/ipreg/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..05b000b7da
--- /dev/null
+++ b/library/cpp/ipreg/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,53 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-ipreg)
+target_link_libraries(library-cpp-ipreg PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-getopt-small
+ library-cpp-json
+ library-cpp-geobase
+ library-cpp-int128
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-ipreg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/address.h
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/sources.h
+)
diff --git a/library/cpp/ipreg/CMakeLists.linux-aarch64.txt b/library/cpp/ipreg/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..5e76739840
--- /dev/null
+++ b/library/cpp/ipreg/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,54 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-ipreg)
+target_link_libraries(library-cpp-ipreg PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-getopt-small
+ library-cpp-json
+ library-cpp-geobase
+ library-cpp-int128
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-ipreg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/address.h
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/sources.h
+)
diff --git a/library/cpp/ipreg/CMakeLists.linux-x86_64.txt b/library/cpp/ipreg/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..5e76739840
--- /dev/null
+++ b/library/cpp/ipreg/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,54 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-ipreg)
+target_link_libraries(library-cpp-ipreg PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-getopt-small
+ library-cpp-json
+ library-cpp-geobase
+ library-cpp-int128
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-ipreg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/address.h
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/sources.h
+)
diff --git a/library/cpp/ipreg/CMakeLists.txt b/library/cpp/ipreg/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/ipreg/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/ipreg/CMakeLists.windows-x86_64.txt b/library/cpp/ipreg/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..05b000b7da
--- /dev/null
+++ b/library/cpp/ipreg/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,53 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+get_built_tool_path(
+ TOOL_enum_parser_bin
+ TOOL_enum_parser_dependency
+ tools/enum_parser/enum_parser
+ enum_parser
+)
+
+add_library(library-cpp-ipreg)
+target_link_libraries(library-cpp-ipreg PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-getopt-small
+ library-cpp-json
+ library-cpp-geobase
+ library-cpp-int128
+ tools-enum_parser-enum_serialization_runtime
+)
+target_sources(library-cpp-ipreg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/checker.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/merge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/range.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/reader.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/split.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/stopwatch.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/writer.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/util_helpers.cpp
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/address.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/address.h
+)
+generate_enum_serilization(library-cpp-ipreg
+ ${CMAKE_SOURCE_DIR}/library/cpp/ipreg/sources.h
+ INCLUDE_HEADERS
+ library/cpp/ipreg/sources.h
+)
diff --git a/library/cpp/ipreg/address.cpp b/library/cpp/ipreg/address.cpp
new file mode 100644
index 0000000000..83880ccbae
--- /dev/null
+++ b/library/cpp/ipreg/address.cpp
@@ -0,0 +1,365 @@
+#include "address.h"
+
+#include <util/generic/mem_copy.h>
+#include <util/stream/format.h>
+#include <util/string/cast.h>
+#include <util/string/hex.h>
+#include <util/string/printf.h>
+#include <util/string/split.h>
+#include <util/string/type.h>
+#include <util/string/vector.h>
+#include <util/system/byteorder.h>
+#include <util/network/socket.h>
+
+#include <sstream>
+
+namespace NIPREG {
+
+TAddress TAddress::ParseAny(TStringBuf str) {
+ if (str.find(':') != TStringBuf::npos) {
+ return ParseIPv6(str);
+ } else if (str.find('.') != TStringBuf::npos) {
+ return ParseIPv4(str);
+ } else if (IsNumber(str)) {
+ return ParseIPv4Num(str); // TODO(dieash@) IPv6Num
+ }
+
+ ythrow yexception() << "Unrecognized IPREG address format: " << str;
+}
+
+TAddress TAddress::ParseIPv6(TStringBuf str) {
+ TAddress addr;
+ if (inet_pton(AF_INET6, TString(str).c_str(), &addr.Data) != 1)
+ ythrow yexception() << "Failed to parse IPREG address " << str << " as IPv6";
+
+ return addr;
+}
+
+TAddress TAddress::ParseIPv4(TStringBuf str) {
+ struct in_addr ipv4;
+ if (inet_aton(TString(str).c_str(), &ipv4) == 0)
+ ythrow yexception() << "Failed to parse IPREG address " << str << " as IPv4";
+
+ return FromIPv4Num(InetToHost(ipv4.s_addr));
+}
+
+TAddress TAddress::ParseIPv4Num(TStringBuf str) {
+ return FromIPv4Num(FromString<ui32>(str));
+}
+
+TAddress TAddress::ParseIPv6Num(TStringBuf str) {
+ return FromUint128(FromString<ui128>(str));
+}
+
+TAddress TAddress::FromBinary(unsigned char const * const data) {
+ TAddress addr;
+ MemCopy<unsigned char>(addr.Data, data, sizeof(addr.Data));
+ return addr;
+}
+
+TAddress TAddress::FromBinaryIPv4(unsigned char const * const data) {
+ return TAddress::FromIPv4Num(
+ (static_cast<ui32>(data[0]) << 24) |
+ (static_cast<ui32>(data[1]) << 16) |
+ (static_cast<ui32>(data[2]) << 8) |
+ (static_cast<ui32>(data[3]))
+ );
+}
+
+TAddress TAddress::FromIPv4Num(ui32 num) {
+ TAddress addr;
+ memset((void*)&addr.Data, 0x00, 10);
+ addr.Data[10] = 0xff;
+ addr.Data[11] = 0xff;
+ addr.Data[12] = (num >> 24) & 0xff;
+ addr.Data[13] = (num >> 16) & 0xff;
+ addr.Data[14] = (num >> 8) & 0xff;
+ addr.Data[15] = (num) & 0xff;
+ return addr;
+}
+
+TAddress TAddress::FromUint128(ui128 intAddr) {
+ const auto hiBE = HostToInet(GetHigh(intAddr));
+ const auto loBE = HostToInet(GetLow(intAddr));
+
+ TAddress addr;
+ ui64* dataPtr = reinterpret_cast<ui64*>(addr.Data);
+ MemCopy<ui64>(dataPtr, &hiBE, 1);
+ MemCopy<ui64>(dataPtr + 1, &loBE, 1);
+
+ return addr;
+}
+
+namespace {
+ void SetHostsBits(TAddress& addr, char value) {
+ addr.Data[ 8] = value;
+ addr.Data[ 9] = value;
+ addr.Data[10] = value;
+ addr.Data[11] = value;
+ addr.Data[12] = value;
+ addr.Data[13] = value;
+ addr.Data[14] = value;
+ addr.Data[15] = value;
+ }
+} // anon-ns
+
+TAddress TAddress::MakeNet64Broadcast(TAddress base) {
+ SetHostsBits(base, 0xff);
+ return base;
+}
+
+TAddress TAddress::MakeNet64Prefix(TAddress base) {
+ SetHostsBits(base, 0x00);
+ return base;
+}
+
+const TAddress& TAddress::Lowest() {
+ static const TAddress first{{}};
+ return first;
+}
+
+const TAddress& TAddress::Highest() {
+ static const TAddress last{{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}};
+ return last;
+}
+
+TString TAddress::AsIPv4() const {
+ return ToString(Data[12]) + "." + ToString(Data[13]) + "." + ToString(Data[14]) + "." + ToString(Data[15]);
+}
+
+TString TAddress::AsIPv4Num() const {
+ ui32 addr = (ui32)Data[12] << 24 | (ui32)Data[13] << 16 | (ui32)Data[14] << 8 | Data[15];
+ return ToString(addr);
+}
+
+TString TAddress::AsIPv6() const {
+ TStringStream ss;
+
+ for (size_t octet = 0; octet < sizeof(Data); octet++) {
+ ss << Hex(Data[octet], HF_FULL);
+ if (octet < 15 && octet & 1)
+ ss << ':';
+ }
+
+ TString s = ss.Str();
+ s.to_lower();
+
+ return s;
+}
+
+TString TAddress::AsIPv6Num() const {
+ return ToString(AsUint128());
+}
+
+TString TAddress::GetTextFromNetOrder() const {
+ char buf[INET6_ADDRSTRLEN];
+ if (inet_ntop(AF_INET6, (void*)(&Data), buf, sizeof(buf)) == NULL)
+ ythrow yexception() << "Failed to stringify IPREG address";
+
+ return buf;
+}
+
+namespace {
+ TString GetHexStr(ui64 v) {
+ return HexEncode(reinterpret_cast<const char*>(&v), sizeof(v));
+ }
+
+ void HexDumpToStream(std::stringstream& ss, ui64 beData) {
+ const auto dataHexStr = GetHexStr(beData);
+ const auto hostData = InetToHost(beData);
+ const auto hostDataStr = GetHexStr(hostData);
+ ss << "\t/big-end[" << beData << " / " << dataHexStr << "]\t/host[" << hostData << " / " << hostDataStr << "]\n";
+ }
+} // anon-ns
+
+TString TAddress::GetHexString(const bool deepView) const {
+ std::stringstream ss;
+ ss << HexEncode(TStringBuf(reinterpret_cast<const char*>(Data), 16));
+ if (deepView) {
+ const ui64* dataPtr = reinterpret_cast<const ui64*>(Data);
+
+ const auto hi = *dataPtr;
+ ss << "\nhigh-data"; HexDumpToStream(ss, hi);
+
+ const auto lo = *(dataPtr + 1);
+ ss << "\nlow-data"; HexDumpToStream(ss, lo);
+ }
+ return ss.str().c_str();
+}
+
+TString TAddress::AsShortIP() const {
+ if (IsIPv4())
+ return AsIPv4();
+ else
+ return GetTextFromNetOrder();
+}
+
+TString TAddress::AsShortIPv6() const {
+ if (IsIPv4())
+ return Sprintf("::ffff:%x:%x", (ui32)Data[12] << 8 | (ui32)Data[13], (ui32)Data[14] << 8 | (ui32)Data[15]);
+ else
+ return GetTextFromNetOrder();
+}
+
+TString TAddress::AsLongIP() const {
+ if (IsIPv4())
+ return AsIPv4();
+ else
+ return AsIPv6();
+}
+
+ui128 TAddress::AsUint128() const {
+ const ui64* dataPtr = reinterpret_cast<const ui64*>(Data);
+ return ui128(InetToHost(*dataPtr), InetToHost(*(dataPtr + 1)));
+}
+
+ui64 TAddress::GetHigh64() const {
+ const ui64* dataPtr = reinterpret_cast<const ui64*>(Data);
+ return *dataPtr;
+}
+
+ui64 TAddress::GetLow64() const {
+ const ui64* dataPtr = reinterpret_cast<const ui64*>(Data);
+ return *(dataPtr + 1);
+}
+
+ui64 TAddress::GetHigh64LE() const {
+ return InetToHost(GetHigh64());
+}
+
+ui64 TAddress::GetLow64LE() const {
+ return InetToHost(GetLow64());
+}
+
+bool TAddress::IsNet64Broadcast() const {
+ static const auto NET64_HOSTS_MASK = TAddress::ParseAny("::ffff:ffff:ffff:ffff").GetLow64();
+ const auto ownHostsBits = GetLow64();
+ return ownHostsBits == NET64_HOSTS_MASK;
+}
+
+bool TAddress::IsNet64Host() const {
+ const auto isSomeOwnHostsBitsOn = GetLow64() > 0;
+ return isSomeOwnHostsBitsOn && !IsNet64Broadcast();
+}
+
+TString TAddress::Format(EAddressFormat format) const {
+ switch (format) {
+ case EAddressFormat::IPV6:
+ return AsIPv6();
+ case EAddressFormat::LONG_IP:
+ return AsLongIP();
+ case EAddressFormat::SHORT_IP:
+ return AsShortIP();
+ case EAddressFormat::NUMERIC_IPV4:
+ return AsIPv4Num();
+ case EAddressFormat::NUMERIC_IPV6:
+ return AsIPv6Num();
+ case EAddressFormat::NTOA:
+ return GetTextFromNetOrder();
+ case EAddressFormat::SHORT_IPV6:
+ return AsShortIPv6();
+ }
+}
+
+bool TAddress::IsIPv4() const {
+ static const unsigned char mask[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff };
+ return memcmp(Data, mask, sizeof(mask)) == 0;
+}
+
+TAddress TAddress::Next() const {
+ if (Highest() == *this) {
+ return Highest();
+ }
+
+ TAddress addr;
+ bool carry = 1;
+ for (ssize_t octet = 15; octet >= 0; octet--) {
+ addr.Data[octet] = Data[octet] + carry;
+ carry = carry && !addr.Data[octet];
+ }
+
+ return addr;
+}
+
+TAddress TAddress::Prev() const {
+ if (Lowest() == *this) {
+ return Lowest();
+ }
+
+ TAddress addr{};
+ bool carry = 1;
+ for (ssize_t octet = 15; octet >= 0; octet--) {
+ addr.Data[octet] = Data[octet] - carry;
+ carry = carry && !Data[octet];
+ }
+
+ return addr;
+}
+
+double TAddress::operator-(const TAddress& rhs) const {
+ double diff = 0.0;
+ for (ssize_t octet = 0; octet < 16; octet++) {
+ diff = diff * 256.0 + (static_cast<int>(Data[octet]) - static_cast<int>(rhs.Data[octet]));
+ }
+ return diff;
+}
+
+ui128 TAddress::Distance(const TAddress& a, const TAddress& b) {
+ const auto& intA = a.AsUint128();
+ const auto& intB = b.AsUint128();
+ return (a > b) ? (intA - intB) : (intB - intA);
+}
+
+namespace {
+ constexpr size_t MAX_IPV6_MASK_LEN = 16 * 8;
+ constexpr size_t MAX_IPV4_MASK_LEN = 4 * 8;
+ constexpr size_t IPV4_IN6_MASK_BASE = MAX_IPV6_MASK_LEN - MAX_IPV4_MASK_LEN;
+
+ TAddress SetMaskBits(const TAddress& addr, const size_t wantedMaskLen) {
+ auto maskLen = wantedMaskLen;
+ if (addr.IsIPv4() && maskLen && maskLen <= MAX_IPV4_MASK_LEN) {
+ maskLen += IPV4_IN6_MASK_BASE;
+ }
+
+ if (maskLen == 0 || maskLen > MAX_IPV6_MASK_LEN || (addr.IsIPv4() && maskLen < IPV4_IN6_MASK_BASE)) {
+ ythrow yexception() << "strange mask (calc/wanted) " << maskLen << "/" << wantedMaskLen << "; " << addr;
+ }
+
+ const int octetsForUpdate = (MAX_IPV6_MASK_LEN - maskLen) / 8;
+ const int bitsForUpdate = (MAX_IPV6_MASK_LEN - maskLen) % 8;
+
+ size_t currOctet = 15;
+ TAddress addrWithMask = addr;
+
+ for (int octetNum = 0; octetNum != octetsForUpdate; ++octetNum) {
+ addrWithMask.Data[currOctet--] = 0xff;
+ }
+
+ for (int bitNum = 0; bitNum != bitsForUpdate; ++bitNum) {
+ addrWithMask.Data[currOctet] ^= 1 << bitNum;
+ }
+
+ return addrWithMask;
+ }
+} // anon-ns
+
+TNetwork::TNetwork(const TString& str)
+ : TNetwork(static_cast<TVector<TString>>(StringSplitter(str).Split('/').SkipEmpty()))
+{}
+
+TNetwork::TNetwork(const TVector<TString>& data)
+ : TNetwork(data.size() ? data[0] : "",
+ data.size() > 1 ? FromStringWithDefault<size_t>(data[1]) : 0)
+{}
+
+TNetwork::TNetwork(const TString& net, size_t maskLen)
+ : begin(TAddress::ParseAny(net))
+ , end(SetMaskBits(begin, maskLen))
+{}
+
+}
+
+IOutputStream& operator<<(IOutputStream& output, const NIPREG::TAddress& addr) {
+ output << addr.AsShortIPv6();
+ return output;
+}
diff --git a/library/cpp/ipreg/address.h b/library/cpp/ipreg/address.h
new file mode 100644
index 0000000000..9071418d5b
--- /dev/null
+++ b/library/cpp/ipreg/address.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include <library/cpp/int128/int128.h>
+
+#include <util/generic/string.h>
+#include <util/digest/murmur.h>
+#include <util/string/cast.h>
+
+namespace NIPREG {
+
+struct TAddress {
+ enum class EAddressFormat {
+ IPV6 = 0x00 /* "ipv6" */,
+ LONG_IP = 0x01 /* "long" */,
+ SHORT_IP = 0x02 /* "short" */,
+ NUMERIC_IPV4 = 0x03 /* "num4" */,
+ NTOA = 0x04 /* "n2a" */,
+ SHORT_IPV6 = 0x05 /* "short-ipv6" */,
+ NUMERIC_IPV6 = 0x06 /* "num" */,
+ };
+
+ unsigned char Data[16] = {0}; // NOTA BENE: network byte order (Big-Endian)
+
+ // Comparison
+ bool operator==(const TAddress& other) const {
+ return memcmp(Data, other.Data, sizeof(Data)) == 0;
+ }
+
+ bool operator<(const TAddress& other) const {
+ return memcmp(Data, other.Data, sizeof(Data)) < 0;
+ }
+
+ bool operator>(const TAddress& other) const {
+ return memcmp(Data, other.Data, sizeof(Data)) > 0;
+ }
+
+ bool operator!=(const TAddress& other) const {
+ return !(*this == other);
+ }
+
+ bool operator<=(const TAddress& other) const {
+ return !(*this > other);
+ }
+
+ bool operator>=(const TAddress& other) const {
+ return !(*this < other);
+ }
+
+ double operator-(const TAddress& rhs) const;
+
+ // Parsing
+ static TAddress ParseAny(TStringBuf str);
+
+ static TAddress ParseIPv6(TStringBuf str);
+ static TAddress ParseIPv4(TStringBuf str);
+ static TAddress ParseIPv4Num(TStringBuf str);
+ static TAddress ParseIPv6Num(TStringBuf str);
+
+ static TAddress FromIPv4Num(ui32 num);
+ static TAddress FromUint128(ui128 addr);
+ static TAddress FromBinary(unsigned char const * data);
+ static TAddress FromBinaryIPv4(unsigned char const * const data);
+
+ static TAddress MakeNet64Broadcast(TAddress base);
+ static TAddress MakeNet64Prefix(TAddress base);
+
+ static const TAddress& Lowest();
+ static const TAddress& Highest();
+
+ // Inspecting
+ TString AsIPv4() const;
+ TString AsIPv4Num() const;
+ TString AsIPv6() const;
+ TString AsIPv6Num() const;
+ TString GetTextFromNetOrder() const;
+ TString GetHexString(bool deepView = false) const;
+
+ TString AsShortIP() const;
+ TString AsShortIPv6() const;
+ TString AsLongIP() const;
+
+ ui128 AsUint128() const;
+ ui64 GetHigh64() const;
+ ui64 GetLow64() const;
+ ui64 GetHigh64LE() const;
+ ui64 GetLow64LE() const;
+
+ bool IsNet64Broadcast() const;
+ bool IsNet64Host() const;
+
+ TAddress GetNet64() const {
+ return TAddress::FromUint128(ui128{GetHigh64LE()} << 64);
+ }
+
+ TAddress GetPrevNet64() const {
+ return TAddress::FromUint128(ui128{GetHigh64LE() - 1} << 64);
+ }
+
+ TAddress GetNextNet64() const {
+ return TAddress::FromUint128(ui128{GetHigh64LE() + 1} << 64);
+ }
+
+ TString Format(EAddressFormat format) const;
+
+ int GetType() const { return IsIPv4() ? 4 : 6; }
+ bool IsIPv4() const;
+
+ // Mutating
+ TAddress Next() const;
+ TAddress Prev() const;
+
+ static ui128 Distance(const TAddress& a, const TAddress& b);
+};
+
+using EAddressFormat = TAddress::EAddressFormat;
+
+struct TNetwork {
+ TAddress begin;
+ TAddress end;
+
+ TNetwork(const TString& str = "0.0.0.0/32");
+
+private:
+ TNetwork(const TVector<TString>& data);
+ TNetwork(const TString& net, size_t mask);
+};
+
+} // NIPREG
+
+template <>
+struct THash<NIPREG::TAddress> {
+ inline size_t operator()(const NIPREG::TAddress& address) const {
+ return MurmurHash<size_t>((const void*)address.Data, 16);
+ }
+};
+
+IOutputStream& operator<<(IOutputStream& output, const NIPREG::TAddress& addr);
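
A minimal usage sketch of the TAddress interface above (illustration only; the main() wrapper and the literal addresses are not part of the library):

#include <library/cpp/ipreg/address.h>

#include <util/stream/output.h>

int main() {
    using namespace NIPREG;

    // IPv4 input is stored as an IPv4-mapped IPv6 address (::ffff:a.b.c.d).
    const TAddress v4 = TAddress::ParseAny("192.0.2.1");
    Cout << v4.AsShortIPv6() << Endl;                        // ::ffff:c000:201
    Cout << v4.Format(EAddressFormat::NUMERIC_IPV4) << Endl; // decimal ui32 form

    // Plain IPv6 input; Next()/Prev() step through the address space.
    const TAddress v6 = TAddress::ParseAny("2001:db8::1");
    Cout << v6.Next().AsIPv6() << Endl;                      // full, zero-padded form
    return 0;
}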
diff --git a/library/cpp/ipreg/checker.cpp b/library/cpp/ipreg/checker.cpp
new file mode 100644
index 0000000000..9c41d27dc0
--- /dev/null
+++ b/library/cpp/ipreg/checker.cpp
@@ -0,0 +1,47 @@
+#include "checker.h"
+
+namespace NIPREG {
+
+void TChecker::CheckNextFatal(const TAddress& first, const TAddress& last) {
+ if (!CheckNext(first, last))
+ ythrow yexception() << "IPREG format error: " << first.AsIPv6() << " - " << last.AsIPv6();
+}
+
+TFlatChecker::TFlatChecker() : HasState(false) {
+}
+
+bool TFlatChecker::CheckNext(const TAddress& first, const TAddress& last) {
+ bool result = true;
+
+ if (first > last)
+ result = false;
+
+ if (HasState && first <= PrevLast)
+ result = false;
+
+ PrevLast = last;
+ HasState = true;
+
+ return result;
+}
+
+TIntersectingChecker::TIntersectingChecker() : HasState(false) {
+}
+
+bool TIntersectingChecker::CheckNext(const TAddress& first, const TAddress& last) {
+ bool result = true;
+
+ if (first > last)
+ result = false;
+
+ if (HasState && (first < PrevFirst || (first == PrevFirst && last < PrevLast)))
+ result = false;
+
+ PrevFirst = first;
+ PrevLast = last;
+ HasState = true;
+
+ return result;
+}
+
+}
diff --git a/library/cpp/ipreg/checker.h b/library/cpp/ipreg/checker.h
new file mode 100644
index 0000000000..1a04e62e77
--- /dev/null
+++ b/library/cpp/ipreg/checker.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "address.h"
+
+namespace NIPREG {
+
+class TChecker {
+public:
+ virtual ~TChecker() {}
+
+ virtual bool CheckNext(const TAddress& first, const TAddress& last) = 0;
+
+ void CheckNextFatal(const TAddress& first, const TAddress& last);
+};
+
+class TFlatChecker: public TChecker {
+private:
+ TAddress PrevLast;
+ bool HasState;
+
+public:
+ TFlatChecker();
+ virtual bool CheckNext(const TAddress& first, const TAddress& last);
+};
+
+class TIntersectingChecker: public TChecker {
+private:
+ TAddress PrevFirst;
+ TAddress PrevLast;
+ bool HasState;
+
+public:
+ TIntersectingChecker();
+ virtual bool CheckNext(const TAddress& first, const TAddress& last);
+};
+
+}
diff --git a/library/cpp/ipreg/merge.cpp b/library/cpp/ipreg/merge.cpp
new file mode 100644
index 0000000000..d31e9dce5d
--- /dev/null
+++ b/library/cpp/ipreg/merge.cpp
@@ -0,0 +1,69 @@
+#include "merge.h"
+
+namespace NIPREG {
+
+void MergeIPREGS(TReader &a, TReader& b, std::function<void(const TAddress& first, const TAddress& last, const TString *a, const TString *b)>&& proc) {
+ bool hasA = a.Next();
+ bool hasB = b.Next();
+
+ TAddress top = TAddress::Lowest();
+ TAddress bottom;
+
+ do {
+ // tweak ranges we've passed
+ if (hasA && top > a.Get().Last)
+ hasA = a.Next();
+ if (hasB && top > b.Get().Last)
+ hasB = b.Next();
+
+ if (!hasA && !hasB) {
+ // both rangesets have ended
+ bottom = TAddress::Highest();
+ proc(top, bottom, nullptr, nullptr);
+ break;
+ }
+
+ const bool inA = hasA && a.Get().First <= top;
+ const bool inB = hasB && b.Get().First <= top;
+
+ if (!hasA) {
+ // rangeset a has ended
+ if (inB) {
+ bottom = b.Get().Last;
+ proc(top, bottom, nullptr, &b.Get().Data);
+ } else {
+ bottom = b.Get().First.Prev();
+ proc(top, bottom, nullptr, nullptr);
+ }
+ } else if (!hasB) {
+ // rangeset b has ended
+ if (inA) {
+ bottom = a.Get().Last;
+ proc(top, bottom, &a.Get().Data, nullptr);
+ } else {
+ bottom = a.Get().First.Prev();
+ proc(top, bottom, nullptr, nullptr);
+ }
+ } else if (inA && inB) {
+ // inside both ranges
+ bottom = Min(a.Get().Last, b.Get().Last);
+ proc(top, bottom, &a.Get().Data, &b.Get().Data);
+ } else if (inA) {
+ // only in range a
+ bottom = Min(a.Get().Last, b.Get().First.Prev());
+ proc(top, bottom, &a.Get().Data, nullptr);
+ } else if (inB) {
+ // only in range b
+ bottom = Min(b.Get().Last, a.Get().First.Prev());
+ proc(top, bottom, nullptr, &b.Get().Data);
+ } else {
+ // outside both ranges
+ bottom = Min(a.Get().First.Prev(), b.Get().First.Prev());
+ proc(top, bottom, nullptr, nullptr);
+ }
+
+ top = bottom.Next();
+ } while (bottom != TAddress::Highest());
+}
+
+}
diff --git a/library/cpp/ipreg/merge.h b/library/cpp/ipreg/merge.h
new file mode 100644
index 0000000000..123b88276c
--- /dev/null
+++ b/library/cpp/ipreg/merge.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "reader.h"
+
+#include <functional>
+
+namespace NIPREG {
+
+void MergeIPREGS(TReader &a, TReader& b, std::function<void(const TAddress& first, const TAddress& last, const TString *a, const TString *b)>&& proc);
+
+}
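
A sketch of how MergeIPREGS might be driven (the input file names are placeholders): the callback is invoked for every maximal sub-range on which the data from both readers stays constant, with nullptr passed for a side that does not cover the range.

#include <library/cpp/ipreg/merge.h>
#include <library/cpp/ipreg/reader.h>

#include <util/stream/output.h>

int main() {
    NIPREG::TReader a("a.ipreg"); // placeholder paths: one "first-last<TAB>data" range per line
    NIPREG::TReader b("b.ipreg");

    NIPREG::MergeIPREGS(a, b, [](const NIPREG::TAddress& first, const NIPREG::TAddress& last,
                                 const TString* dataA, const TString* dataB) {
        // Print the merged partition of the whole address space.
        Cout << first << "-" << last << "\t"
             << (dataA ? *dataA : "-") << "\t"
             << (dataB ? *dataB : "-") << Endl;
    });
    return 0;
}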
diff --git a/library/cpp/ipreg/range.cpp b/library/cpp/ipreg/range.cpp
new file mode 100644
index 0000000000..1b90022482
--- /dev/null
+++ b/library/cpp/ipreg/range.cpp
@@ -0,0 +1,198 @@
+#include "range.h"
+
+#include "util_helpers.h"
+
+#include <library/cpp/int128/int128.h>
+#include <util/generic/maybe.h>
+#include <util/string/split.h>
+#include <util/string/vector.h>
+
+#include <stdexcept>
+
+namespace NIPREG {
+
+namespace {
+ EAddressFormat CurrentFormat = EAddressFormat::SHORT_IPV6;
+
+ void throwExceptionWithFormat(const TString& line) {
+ throw yexception() << "wanted format: ${ip-begin}-${ip-end}[\t${data}]; $input := '" << line << "'";
+ }
+
+ void throwIfReverseOrder(TAddress first, TAddress last) {
+ if (first > last) {
+ const TString err_msg = "reverse order of addresses (first / last) => " + first.AsIPv6() + " / " + last.AsIPv6();
+ throw std::runtime_error(err_msg.data());
+ }
+ }
+} // anon-ns
+
+TRange::TRange(TAddress first, TAddress last, const TString& data)
+ : First(first)
+ , Last(last)
+ , Data(data)
+{
+ throwIfReverseOrder(First, Last);
+}
+
+TRange::TRange(const TNetwork& net, const TString& data)
+ : TRange(net.begin, net.end, data)
+{
+}
+
+ui128 TRange::GetAddrsQty() const {
+ return TAddress::Distance(First, Last) + 1;
+}
+
+TRange TRange::BuildRange(const TString& line, bool isEmptyData, const TString& dataDelim) {
+ const TVector<TString> parts = StringSplitter(line).SplitBySet(dataDelim.data()).SkipEmpty();
+ if (parts.empty()) {
+ throwExceptionWithFormat(line);
+ }
+
+ if (TString::npos != parts[0].find('/')) {
+ const auto data = (2 == parts.size()) ? parts[1] : "";
+ return TRange(TNetwork(parts[0]), data);
+ }
+
+ const TVector<TString> range_parts = StringSplitter(parts[0]).SplitBySet(" -\t").SkipEmpty();
+ if (2 != range_parts.size() || range_parts[0].empty() || range_parts[1].empty()) {
+ throwExceptionWithFormat(line);
+ }
+
+ if (!isEmptyData && (2 != parts.size() || parts[1].empty())) {
+ throwExceptionWithFormat(line);
+ }
+
+ const auto& data = (2 == parts.size()) ? parts[1] : "";
+ return TRange(TAddress::ParseAny(range_parts[0]), TAddress::ParseAny(range_parts[1]), data);
+}
+
+bool TRange::Contains(const TRange& range) const {
+ return First <= range.First && range.Last <= Last;
+}
+
+bool TRange::Contains(const TAddress& ip) const {
+ return First <= ip && ip <= Last;
+}
+
+void SetIpFullOutFormat() {
+ CurrentFormat = EAddressFormat::IPV6;
+}
+
+void SetIpShortOutFormat() {
+ CurrentFormat = EAddressFormat::SHORT_IPV6;
+}
+
+void TRange::DumpTo(IOutputStream& output, bool withData, EAddressFormat format) const {
+ output << First.Format(format) << '-' << Last.Format(format);
+ if (withData) {
+ output << '\t' << Data;
+ }
+}
+
+bool TRange::IsIpv6Only() const {
+ return 6 == First.GetType() && 6 == Last.GetType();
+}
+
+bool TRange::IsIpv4Only() const {
+ return 4 == First.GetType() && 4 == Last.GetType();
+}
+
+bool TRange::IsRangeInSingleNet64() const {
+ return First.GetHigh64() == Last.GetHigh64();
+}
+
+TRange TRange::BuildRangeByFirst(const TRange& range, int prefix) {
+ Y_UNUSED(prefix);
+ return TRange(TAddress::MakeNet64Prefix(range.First),
+ TAddress::MakeNet64Broadcast(range.IsRangeInSingleNet64() ? range.Last : range.Last.GetPrevNet64()),
+ range.Data
+ );
+}
+
+TRange TRange::BuildRangeByLast(const TRange& range, int prefix) {
+ Y_UNUSED(prefix);
+ const auto prevLast = TAddress::MakeNet64Broadcast(range.Last.GetPrevNet64());
+ return TRange(range.First, prevLast, range.Data);
+// const auto prevLast = TAddress::MakeNet64Broadcast(range.Last);
+// return TRange(TAddress::MakeNet64Prefix(range.First), prevLast, range.Data);
+}
+
+TVector<TRange> SplitRangeNets(const TRange& origRange, bool addOrigSize, int maskLen) {
+ Y_UNUSED(maskLen);
+
+ static const auto firstCheckedIpv6Prefix = TAddress::ParseAny("2000::");
+
+ const auto& CalcNetSize = [&](const TRange& range) {
+ static const auto MAX_FOR_DIGITS_ANSWER = ui128{1 << 30};
+ const auto netSize = range.GetAddrsQty();
+ return (netSize < MAX_FOR_DIGITS_ANSWER) ? ToString(netSize) : "huge";
+ };
+
+ const auto& AddSizeField = [&](TRange& changedRange, const TRange& origAddrRange) {
+ if (addOrigSize) {
+ changedRange.Data = AddJsonAttrs({"orig_net_size"}, changedRange.Data, TMaybe<TString>(CalcNetSize(origAddrRange)));
+ }
+ };
+
+ if (origRange.Last <= firstCheckedIpv6Prefix) {
+ return {origRange};
+ }
+
+ if (origRange.IsRangeInSingleNet64()) {
+ TRange theOne{
+ TAddress::MakeNet64Prefix(origRange.First),
+ TAddress::MakeNet64Broadcast(origRange.Last),
+ origRange.Data
+ };
+ AddSizeField(theOne, origRange);
+ return {theOne};
+ }
+
+ TRange range{origRange};
+ TVector<TRange> result; {
+ // 1st
+ TRange byFirst{TAddress::MakeNet64Prefix(range.First), TAddress::MakeNet64Broadcast(range.First), range.Data};
+ AddSizeField(byFirst, {range.First, byFirst.Last, ""});
+ result.push_back(byFirst);
+
+ // maybe 2nd
+ range.First = byFirst.Last.Next();
+ if (!range.IsRangeInSingleNet64()) {
+ const TAddress lastPrefix = TAddress::MakeNet64Prefix(range.Last);
+
+ TRange inTheMiddle{TAddress::MakeNet64Prefix(range.First), lastPrefix.Prev(), range.Data};
+ AddSizeField(inTheMiddle, inTheMiddle);
+ result.push_back(inTheMiddle);
+
+ range.First = lastPrefix;
+ }
+
+ // the last
+ TRange byLast{range.First, TAddress::MakeNet64Broadcast(range.Last), range.Data};
+ AddSizeField(byLast, {byLast.First, range.Last, ""});
+ result.push_back(byLast);
+ }
+ return result;
+}
+
+bool operator==(const TRange& lhs, const TRange& rhs) {
+ return lhs.First == rhs.First && lhs.Last == rhs.Last;
+}
+
+} // ns NIPREG
+
+IInputStream& operator>>(IInputStream& input, NIPREG::TRange& range) {
+ TString line;
+ if (!input.ReadLine(line)) {
+ throw std::runtime_error("unable to load data from stream");
+ }
+ range = NIPREG::TRange::BuildRange(line);
+ return input;
+}
+
+IOutputStream& operator<<(IOutputStream& output, const NIPREG::TRange& range) {
+ range.DumpTo(output, true, NIPREG::CurrentFormat);
+ output << "\n";
+ return output;
+}
diff --git a/library/cpp/ipreg/range.h b/library/cpp/ipreg/range.h
new file mode 100644
index 0000000000..15b2c693b0
--- /dev/null
+++ b/library/cpp/ipreg/range.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "address.h"
+
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/input.h>
+#include <util/stream/output.h>
+
+#include <stdexcept>
+
+namespace NIPREG {
+
+struct TRange {
+ TAddress First;
+ TAddress Last;
+ TString Data;
+
+ TRange() = default;
+ TRange(TAddress first, TAddress last, const TString& data);
+ TRange(const TNetwork& net, const TString& data);
+
+ ui128 GetAddrsQty() const;
+ void DumpTo(IOutputStream& output, bool withData = true, EAddressFormat format = EAddressFormat::SHORT_IP) const;
+
+ static TRange BuildRange(const TString& line, bool isEmptyData = false, const TString& dataDelim = "\t");
+ bool Contains(const TRange& range) const;
+ bool Contains(const TAddress& ip) const;
+
+ static TRange BuildRangeByFirst(const TRange& range, int prefix = 64);
+ static TRange BuildRangeByLast(const TRange& range, int prefix = 64);
+
+ bool IsIpv6Only() const;
+ bool IsIpv4Only() const;
+
+ bool IsRangeInSingleNet64() const;
+};
+using TGenericEntry = TRange;
+
+void SetIpFullOutFormat();
+void SetIpShortOutFormat();
+
+TVector<TRange> SplitRangeNets(const TRange& range, bool addOrigSize = false, int maskLen = 64);
+
+bool operator==(const TRange& lhs, const TRange& rhs);
+inline bool operator!=(const TRange& lhs, const TRange& rhs) { return !(lhs == rhs); }
+} // ns NIPREG
+
+IInputStream& operator>>(IInputStream& input, NIPREG::TRange& range);
+IOutputStream& operator<<(IOutputStream& output, const NIPREG::TRange& range);
diff --git a/library/cpp/ipreg/reader.cpp b/library/cpp/ipreg/reader.cpp
new file mode 100644
index 0000000000..2e4ae1b178
--- /dev/null
+++ b/library/cpp/ipreg/reader.cpp
@@ -0,0 +1,82 @@
+#include "reader.h"
+
+#include <util/stream/file.h>
+
+namespace NIPREG {
+
+namespace {
+ const TString DASH_FNAME = "-";
+}
+
+TReader::TReader(const TString& filename, bool isEmptyData, const TString& dataDelim)
+ : OwnedStreamPtr((filename.empty() || filename == DASH_FNAME) ? nullptr : new TFileInput(filename))
+ , Stream(OwnedStreamPtr ? *OwnedStreamPtr.Get() : Cin)
+ , IsEmptyData(isEmptyData)
+ , DataDelim(dataDelim)
+{
+}
+
+TReader::TReader(IInputStream& stream, bool isEmptyData, const TString& dataDelim)
+ : Stream(stream)
+ , IsEmptyData(isEmptyData)
+ , DataDelim(dataDelim)
+{
+}
+
+bool TReader::Next() {
+ TString line;
+ if (!Stream.ReadLine(line))
+ return false;
+
+ CurrentEntry = TRange::BuildRange(line, IsEmptyData, DataDelim);
+ if (CurrentEntry.Data.empty()) {
+ if (!IsEmptyData) {
+ throw yexception() << "empty data part detected for [" << line << "]";
+ }
+ CurrentEntry.Data = "";
+ }
+ return true;
+}
+
+TReverseByLastIpReader::TReverseByLastIpReader(const TString& filename, bool isEmptyData, const TString& dataDelim)
+ : TParent(filename, isEmptyData, dataDelim)
+{
+ Valid = TParent::Next();
+}
+
+TReverseByLastIpReader::TReverseByLastIpReader(IInputStream& stream, bool isEmptyData, const TString& dataDelim)
+ : TParent(stream, isEmptyData, dataDelim)
+{
+ Valid = TParent::Next();
+}
+
+bool TReverseByLastIpReader::Next() {
+ if (!CurrentEntries.empty()) {
+ CurrentEntries.pop_back();
+ }
+
+ if (CurrentEntries.empty()) {
+ return PrepareNextEntries();
+ } else {
+ return true;
+ }
+}
+
+const TGenericEntry& TReverseByLastIpReader::Get() const {
+ return CurrentEntries.back();
+}
+
+bool TReverseByLastIpReader::PrepareNextEntries() {
+ if (!Valid) {
+ return false;
+ }
+
+ do {
+ CurrentEntries.push_back(TParent::Get());
+ Valid = TParent::Next();
+ } while (Valid && TParent::Get().First == CurrentEntries.back().First);
+
+ return true;
+}
+
+} // NIPREG
diff --git a/library/cpp/ipreg/reader.h b/library/cpp/ipreg/reader.h
new file mode 100644
index 0000000000..b68faedcf9
--- /dev/null
+++ b/library/cpp/ipreg/reader.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "range.h"
+
+#include <util/generic/ptr.h>
+#include <util/generic/string.h>
+#include <util/stream/input.h>
+
+namespace NIPREG {
+
+class TReader {
+public:
+ TReader(const TString& filename = "", bool isEmptyData = false, const TString& dataDelim = "\t");
+ TReader(IInputStream& stream, bool isEmptyData = false, const TString& dataDelim = "\t");
+
+ virtual bool Next();
+
+ virtual const TGenericEntry& Get() const {
+ return CurrentEntry;
+ }
+
+ operator IInputStream&() {
+ return Stream;
+ }
+
+ virtual ~TReader() = default;
+
+private:
+ TAutoPtr<IInputStream> OwnedStreamPtr;
+ IInputStream& Stream;
+
+ bool IsEmptyData = false;
+ const TString DataDelim;
+
+ TGenericEntry CurrentEntry;
+};
+
+class TReverseByLastIpReader : public TReader {
+public:
+ using TParent = TReader;
+
+ explicit TReverseByLastIpReader(const TString& filename = "", bool isEmptyData = false, const TString& dataDelim = "\t");
+ explicit TReverseByLastIpReader(IInputStream& stream, bool isEmptyData = false, const TString& dataDelim = "\t");
+
+ bool Next() override;
+
+ const TGenericEntry& Get() const override;
+
+private:
+ bool PrepareNextEntries();
+
+private:
+ bool Valid = false;
+ TVector<TGenericEntry> CurrentEntries;
+};
+
+} // NIPREG
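
A small sketch of TReader over an in-memory stream, assuming the default tab-separated "first-last<TAB>data" line format parsed by TRange::BuildRange:

#include <library/cpp/ipreg/reader.h>

#include <util/stream/output.h>
#include <util/stream/str.h>

int main() {
    const TString lines =
        "1.0.0.0-1.0.0.255\t{\"region_id\":1}\n"
        "1.0.1.0/24\t{\"region_id\":2}\n"; // CIDR form is accepted as well
    TStringInput in(lines);

    NIPREG::TReader reader(in);
    while (reader.Next()) {
        const auto& range = reader.Get();
        Cout << range.First << " .. " << range.Last << "\t" << range.Data << Endl;
    }
    return 0;
}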
diff --git a/library/cpp/ipreg/sources.cpp b/library/cpp/ipreg/sources.cpp
new file mode 100644
index 0000000000..70e4b2a6da
--- /dev/null
+++ b/library/cpp/ipreg/sources.cpp
@@ -0,0 +1,100 @@
+#include "sources.h"
+
+#include <cstdint>
+#include <stdexcept>
+
+namespace NIPREG {
+
+const ui32 ML_COEFF_DEFAULT = 50000;
+ui32 ML_COEFFICIENT = ML_COEFF_DEFAULT;
+
+void SetCoefficient(ui32 type, ui32 value) {
+ switch (type) {
+ case SOURCE_ML:
+ ML_COEFFICIENT = value;
+ break;
+ default:
+ throw std::runtime_error("unsupported setcoeff-type");
+ }
+}
+
+double GetSourceCoefficient(ui32 type) {
+ switch (type) {
+ case SOURCE_MAIL: return 1;
+ case SOURCE_PHONE: return 3;
+ case SOURCE_GEO: return 4;
+ case SOURCE_COUNTRY: return 100;
+ case SOURCE_DOMAIN_NAME: return 1;
+ case SOURCE_MANUAL: return 1;
+ case SOURCE_YANDEX_NETWORK: return 1000; // NB: in yandex_noc source weight := 10K
+ case SOURCE_SPECIAL_NETWORK: return 1000000;
+ case SOURCE_PROVIDERS: return 50;
+ case SOURCE_MAXMIND: return 4;
+ case SOURCE_UNITED_UID_YANDEX_MAPS: return 0.7;
+ case SOURCE_RELIABILITY_AROUND: return 1;
+ case SOURCE_UNITED_UID_WEATHER: return 0.9;
+ case SOURCE_UNITED_UID_YANDEX_GID: return 1;
+ case SOURCE_UNITED_UID_SEARCH_QUERY: return 1.5;
+ case SOURCE_UNITED_UID_SEARCH_IN_REG: return 2;
+ case SOURCE_BGP_ASPATH_COMMUNITY: return 10;
+ case SOURCE_ML: return ML_COEFFICIENT;
+ }
+ return 0;
+}
+
+bool SourceWantApplyDepthCoeff(ui32 source_type) {
+ switch (source_type) {
+ case SOURCE_MAIL:
+ case SOURCE_PHONE:
+ case SOURCE_GEO:
+ case SOURCE_COUNTRY:
+ case SOURCE_DOMAIN_NAME:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SourceWantApplyNetsizeCoeff(ui32 source_type) {
+ return SourceWantApplyDepthCoeff(source_type);
+}
+
+bool SourceIsHuman(ui32 source_type) {
+ switch (source_type) {
+ case SOURCE_UNITED_UID_SEARCH_QUERY:
+ case SOURCE_UNITED_UID_SEARCH_IN_REG:
+ case SOURCE_UNITED_UID_WEATHER:
+ case SOURCE_UNITED_UID_YANDEX_GID:
+ case SOURCE_UNITED_UID_YANDEX_MAPS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SourceIsForRegionNormalize(ui32 source_type) {
+ return SourceIsHuman(source_type);
+}
+
+bool SourceIsForEnoughHumanData(ui32 source_type) {
+ switch (source_type) {
+ case SOURCE_COUNTRY:
+ case SOURCE_MANUAL:
+ case SOURCE_PROVIDERS:
+ case SOURCE_YANDEX_NETWORK:
+ case SOURCE_SPECIAL_NETWORK:
+ return true;
+ default:
+ return SourceIsHuman(source_type);
+ }
+}
+
+bool SourceIsForFewHumanData(ui32 source_type) {
+ return !SourceIsHuman(source_type);
+}
+
+bool SourceIsForReliability(ui32 source_type) {
+ return SourceIsHuman(source_type) || SOURCE_YANDEX_NETWORK == source_type;
+}
+
+} // NIPREG
diff --git a/library/cpp/ipreg/sources.h b/library/cpp/ipreg/sources.h
new file mode 100644
index 0000000000..a517e57cb8
--- /dev/null
+++ b/library/cpp/ipreg/sources.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <util/system/types.h>
+
+namespace NIPREG {
+
+// TODO(dieash@) make some automation/specification via enabled sources (with full list)
+enum ESourceType {
+ // TODO(dieash@) full list of known src-types in choice-region-data:
+ // https://yql.yandex-team.ru/Operations/XEo-amim9Z2_PCkcZgQ0Wu-sqXAm1K8NMPesswuPzbk=
+ SOURCE_UNKNOWN = 0, // stub
+ SOURCE_MAIL = 1 /* "MAIL" */, // ripe src
+ SOURCE_PHONE = 2 /* "PHONE" */, // ripe src
+ SOURCE_GEO = 3 /* "GEO" */, // ripe src
+ SOURCE_COUNTRY = 4 /* "COUNTRY" */, // ripe, delegated, maxmind src
+ SOURCE_DOMAIN_NAME = 5 /* "DOMAIN_NAME" */, // ripe src
+ SOURCE_MANUAL = 6 /* "MANUAL" */, // manual src
+ SOURCE_YANDEX_NETWORK = 9 /* "YANDEX_NETWORK" */, // yandex-noc src
+ SOURCE_SPECIAL_NETWORK = 10 /* "SPECIAL_NETWORK" */, // spec-net src
+ SOURCE_PROVIDERS = 15 /* "PROVIDERS" */, // ripe src
+ SOURCE_MAXMIND = 17 /* "MAXMIND" */, // maxmind src
+ SOURCE_UNITED_UID_YANDEX_MAPS = 19 /* "UNITED_UID_YANDEX_MAPS" */, // uuid src
+ SOURCE_RELIABILITY_AROUND = 20 /* "RELIABILITY_AROUND" */, // rel-around src
+ SOURCE_UNITED_UID_WEATHER = 21 /* "UNITED_UID_WEATHER" */, // uuid src
+ SOURCE_UNITED_UID_YANDEX_GID = 22 /* "UNITED_UID_YANDEX_GID" */, // uuid src
+ SOURCE_UNITED_UID_SEARCH_QUERY = 23 /* "UNITED_UID_SEARCH_QUERY" */, // uuid src
+ SOURCE_UNITED_UID_SEARCH_IN_REG = 24 /* "UNITED_UID_SEARCH_IN_REG" */, // uuid src
+ SOURCE_BGP_ASPATH_COMMUNITY = 25 /* "BGP_ASPATH_COMMUNITY" */, // bgp src // NOTA BENE: clash with https://st.yandex-team.ru/IPREG-3722#5b367ec214778c001a5a3f7c
+ SOURCE_ML_INT_26 = 26 /* "ML_INT_26" */,
+ SOURCE_ML_INT_27 = 27 /* "ML_INT_27" */,
+ SOURCE_ML_INT_28 = 28 /* "ML_INT_28" */,
+ SOURCE_ML_INT_29 = 29 /* "ML_INT_29" */,
+ SOURCE_ML_INT_30 = 30 /* "ML_INT_30" */,
+ SOURCE_ML_INT_31 = 31 /* "ML_INT_31" */,
+ SOURCE_ML_INT_32 = 32 /* "ML_INT_32" */,
+ SOURCE_ML_INT_33 = 33 /* "ML_INT_33" */,
+ SOURCE_ML_INT_34 = 34 /* "ML_INT_34" */,
+ SOURCE_PRECISE_GEO_ML = 35 /* "ML_INT_35" */,
+ SOURCE_ML = 36 /* "ML" */, // ml src
+};
+
+double GetSourceCoefficient(ui32 type);
+bool SourceWantApplyDepthCoeff(ui32 source_type);
+bool SourceWantApplyNetsizeCoeff(ui32 source_type);
+bool SourceIsHuman(ui32 source_type);
+bool SourceExcludeFromReliability(ui32 source_type);
+bool SourceIsForRegionNormalize(ui32 source_type);
+bool SourceIsForEnoughHumanData(ui32 source_type);
+bool SourceIsForFewHumanData(ui32 source_type);
+bool SourceIsForReliability(ui32 source_type);
+
+void SetCoefficient(ui32 type, ui32 value);
+} // namespace NIPREG
diff --git a/library/cpp/ipreg/split.cpp b/library/cpp/ipreg/split.cpp
new file mode 100644
index 0000000000..19b7b85d51
--- /dev/null
+++ b/library/cpp/ipreg/split.cpp
@@ -0,0 +1,54 @@
+#include "split.h"
+
+#include <util/generic/list.h>
+#include <util/generic/vector.h>
+
+namespace NIPREG {
+
+void SplitIPREG(TReader &reader, std::function<void(const TAddress& first, const TAddress& last, const TVector<TString>& data)>&& proc) {
+ TList<TGenericEntry> prevEntries;
+
+ bool end;
+ do {
+ end = !reader.Next();
+
+ while (!prevEntries.empty() && (end || prevEntries.front().First < reader.Get().First)) {
+ // find smallest common range to process
+ TAddress first = prevEntries.front().First;
+ TAddress last = end ? TAddress::Highest() : reader.Get().First.Prev();
+
+ for (const auto& entry: prevEntries)
+ last = Min(last, entry.Last);
+
+ // extract data for the range
+ TVector<TString> strings;
+ auto item = prevEntries.begin();
+ while (item != prevEntries.end()) {
+ Y_ASSERT(item->First == first);
+ strings.push_back(item->Data);
+
+ if (item->Last == last) {
+ // item completely processed, remove
+ auto victim = item;
+ item++;
+ prevEntries.erase(victim);
+ } else {
+ // item still has part of range left, update it
+ item->First = last.Next();
+ item++;
+ }
+ }
+
+ proc(first, last, strings);
+ }
+
+ if (!end) {
+ if (!prevEntries.empty()) {
+ Y_ASSERT(prevEntries.front().First == reader.Get().First);
+ }
+ prevEntries.push_back(reader.Get());
+ }
+ } while (!end);
+}
+
+}
diff --git a/library/cpp/ipreg/split.h b/library/cpp/ipreg/split.h
new file mode 100644
index 0000000000..9710ff5f6d
--- /dev/null
+++ b/library/cpp/ipreg/split.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "reader.h"
+
+#include <util/generic/vector.h>
+
+#include <functional>
+
+namespace NIPREG {
+
+void SplitIPREG(TReader &reader, std::function<void(const TAddress& first, const TAddress& last, const TVector<TString>& data)>&& proc);
+
+}
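
A sketch of SplitIPREG on two overlapping ranges (the data strings "A" and "B" are made up): the callback receives disjoint pieces together with the data of every source range covering each piece, so the overlap 1.0.0.128-1.0.0.255 below is reported once with both values.

#include <library/cpp/ipreg/split.h>

#include <util/stream/output.h>
#include <util/stream/str.h>
#include <util/string/vector.h>

int main() {
    const TString lines =
        "1.0.0.0-1.0.0.255\tA\n"
        "1.0.0.128-1.0.1.0\tB\n"; // input must be sorted by the first address
    TStringInput in(lines);
    NIPREG::TReader reader(in);

    NIPREG::SplitIPREG(reader, [](const NIPREG::TAddress& first, const NIPREG::TAddress& last,
                                  const TVector<TString>& data) {
        // e.g. the overlapping piece is emitted with data == {"A", "B"}
        Cout << first << "-" << last << "\t" << JoinStrings(data, ",") << Endl;
    });
    return 0;
}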
diff --git a/library/cpp/ipreg/stopwatch.cpp b/library/cpp/ipreg/stopwatch.cpp
new file mode 100644
index 0000000000..31d99d2758
--- /dev/null
+++ b/library/cpp/ipreg/stopwatch.cpp
@@ -0,0 +1,53 @@
+#include "stopwatch.h"
+
+#include <util/stream/str.h>
+
+namespace NIPREG {
+
+TStopWatch::TStopWatch() {
+ Start = TInstant::Now();
+}
+
+TStopWatch::~TStopWatch() {
+ try {
+ if (TaskRunning)
+ StopTask();
+
+ Cerr << "Everything done in " << FormatTime(TInstant::Now() - Start) << Endl;
+ } catch (...) {
+ // not much problem if we can't write the summary
+ }
+}
+
+void TStopWatch::StartTask(const TString& message) {
+ StopTask();
+
+ ++TaskOrdNum;
+ TaskStart = TInstant::Now();
+ TaskRunning = true;
+ Cerr << TaskOrdNum << ". " << message << "...\n";
+}
+
+void TStopWatch::StopTask() {
+ if (TaskRunning) {
+ Cerr << "Done in " << FormatTime(TInstant::Now() - TaskStart) << Endl;
+ TaskRunning = false;
+ }
+}
+
+TString TStopWatch::FormatTime(const TDuration& dur) {
+ auto sec = dur.Seconds();
+
+ TStringStream ss;
+
+ if (sec < 60)
+ ss << sec << "s";
+ else if (sec < 3600)
+ ss << sec / 60 << "m " << sec % 60 << "s";
+ else
+ ss << sec / 3600 << "h " << (sec / 60) % 60 << "m";
+
+ return ss.Str();
+}
+
+}
diff --git a/library/cpp/ipreg/stopwatch.h b/library/cpp/ipreg/stopwatch.h
new file mode 100644
index 0000000000..0873a638f6
--- /dev/null
+++ b/library/cpp/ipreg/stopwatch.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <util/datetime/base.h>
+
+namespace NIPREG {
+
+class TStopWatch {
+private:
+ TInstant Start;
+ TInstant TaskStart;
+ bool TaskRunning = false;
+ ui32 TaskOrdNum = 0;
+
+private:
+ TString FormatTime(const TDuration& dur);
+
+public:
+ TStopWatch();
+ ~TStopWatch();
+
+ void StartTask(const TString& message);
+ void StopTask();
+};
+
+}
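
A sketch of the intended TStopWatch usage (task names are arbitrary): progress lines go to stderr, and the destructor reports the total elapsed time.

#include <library/cpp/ipreg/stopwatch.h>

int main() {
    NIPREG::TStopWatch watch;          // "Everything done in ..." is printed on destruction

    watch.StartTask("loading ranges"); // prints "1. loading ranges..."
    // ... work ...

    watch.StartTask("merging");        // finishes the previous task, prints "2. merging..."
    // ... work ...

    watch.StopTask();                  // prints "Done in ..."
    return 0;
}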
diff --git a/library/cpp/ipreg/util_helpers.cpp b/library/cpp/ipreg/util_helpers.cpp
new file mode 100644
index 0000000000..1b64baef55
--- /dev/null
+++ b/library/cpp/ipreg/util_helpers.cpp
@@ -0,0 +1,705 @@
+#include "util_helpers.h"
+
+#include <library/cpp/ipreg/reader.h>
+
+#include <library/cpp/json/json_reader.h>
+#include <library/cpp/json/json_value.h>
+#include <library/cpp/json/json_writer.h>
+
+#include <library/cpp/geobase/lookup.hpp>
+
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+#include <util/stream/file.h>
+#include <util/stream/format.h>
+#include <util/string/split.h>
+#include <util/string/vector.h>
+#include <util/stream/str.h>
+
+namespace NIPREG {
+ namespace {
+ double FindNearestCoarsedCoeff(double baseValue) {
+ using ValueStepPair = std::pair<double, double>;
+ static const double fix = 0.01;
+ static const TVector<ValueStepPair> limits = {
+ { 100., 20. + fix },
+ { 500., 50. + fix },
+ { 2500., 100. + fix },
+ { 10000., 1000. + fix },
+ { 50000., 10000. + fix }
+ };
+
+ double last_step{};
+ for (const auto& pair : limits) {
+ last_step = pair.second;
+ if (baseValue <= pair.first) {
+ break;
+ }
+ }
+ return last_step;
+ }
+
+ double CalcCoarsedValue(double baseValue) {
+ if (baseValue < 0.) {
+ ythrow yexception() << "negative value detected: " << baseValue;
+ }
+
+ // TODO(dieash) some "strange" calculation below
+ const auto coarsedCoeff = FindNearestCoarsedCoeff(baseValue);
+ const double fixedValue = coarsedCoeff * static_cast<int>((baseValue + coarsedCoeff / 2) / coarsedCoeff);
+ return fixedValue;
+ }
+
+ const char * const REL_FIELD = "reliability";
+ const char * const REG_FIELD = "region_id";
+
+ void CorrectReliability(NJson::TJsonValue& jsonData, const TString& data) {
+ jsonData = ParseJsonString(data);
+ auto& jsonMap = jsonData.GetMapSafe();
+
+ auto& reliabilityField = jsonMap[REL_FIELD];
+ reliabilityField = CalcCoarsedValue(reliabilityField.GetDouble());
+ }
+
+ TString SortJson(const TString& data) {
+ NJson::TJsonValue json = ParseJsonString(data);
+ return SortJsonData(json);
+ }
+
+ static TString MergeJsonsData(const TString& data1, const TString& data2, bool sortKeys = false, bool countMerge = false) {
+ static const char* MERGE_QTY = "_mrg_qty_";
+
+ auto json1 = ParseJsonString(data1);
+ const auto& json2 = ParseJsonString(data2);
+
+ if (countMerge && !json1.Has(MERGE_QTY)) {
+ json1.InsertValue(MERGE_QTY, 1);
+ }
+
+ for (const auto& item : json2.GetMapSafe()) {
+ json1.InsertValue(item.first, item.second);
+ }
+
+ if (countMerge) {
+ json1.InsertValue(MERGE_QTY, (json1[MERGE_QTY].GetInteger() + 1));
+ }
+
+ const auto NoFormat = false;
+ return NJson::WriteJson(json1, NoFormat, sortKeys);
+ }
+
+ bool IsJsonEquals(const TVector<TString>& excludeFieldsList, const TString& data1, const TString& data2) {
+ if (excludeFieldsList.empty()) {
+ return data1 == data2;
+ }
+
+ auto json1 = ParseJsonString(data1);
+ auto json2 = ParseJsonString(data2);
+
+ for (const auto& excludeField : excludeFieldsList) {
+ json1.EraseValue(excludeField);
+ json2.EraseValue(excludeField);
+ }
+
+ return json1 == json2;
+ }
+
+ class Patcher {
+ public:
+ Patcher(TReader& base, TReader& patch, IOutputStream& output, bool sortData)
+ : BaseStream(base)
+ , PatchStream(patch)
+ , Output(output)
+ , SortData(sortData)
+ {
+ GetNext(BaseStream, BaseRangePtr);
+ GetNext(PatchStream, PatchRangePtr);
+ }
+
+ void Process() {
+ while (BaseRangePtr || PatchRangePtr) {
+ if ( CheckPatch()
+ || OnlySecond(BaseRangePtr, PatchRangePtr, PatchStream)
+ || OnlySecond(PatchRangePtr, BaseRangePtr, BaseStream)
+ || Range1BeforeRange2(BaseRangePtr, PatchRangePtr, BaseStream)
+ || Range1BeforeRange2(PatchRangePtr, BaseRangePtr, PatchStream)
+ || FirstEndInSecond(BaseRangePtr, PatchRangePtr)
+ || FirstEndInSecond(PatchRangePtr, BaseRangePtr)
+ || FirstStartInSecond(BaseRangePtr, PatchRangePtr, BaseStream, PatchStream))
+ {
+ continue;
+ }
+ }
+ }
+
+ private:
+ void GetNext(TReader& stream, TAutoPtr<TRange>& rangePtr) {
+ if (stream.Next()) {
+ if (rangePtr) {
+ *rangePtr = stream.Get();
+ } else {
+ rangePtr.Reset(new TRange(stream.Get()));
+ }
+ }
+ else {
+ rangePtr.Reset();
+ }
+ }
+
+ void Print(const TRange& range) const {
+ Output << range;
+ }
+
+ void PrintSorted(const TRange& range) const {
+ const TRange sortedCopy{range.First, range.Last, SortJson(range.Data)};
+ Output << sortedCopy;
+ }
+
+ bool CheckPatch() {
+ if (PatchRangePtr && PatchRangePtr->First > PatchRangePtr->Last) {
+ GetNext(PatchStream, PatchRangePtr);
+ return true;
+ }
+ return false;
+ }
+
+ bool OnlySecond(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second, TReader& stream) {
+ if (!first && second) {
+ Print(*second);
+ GetNext(stream, second);
+ return true;
+ }
+ return false;
+ }
+
+ bool Range1BeforeRange2(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second, TReader& stream) {
+ if (first->Last < second->First) {
+ Print(*first);
+ GetNext(stream, first);
+ return true;
+ }
+ return false;
+ }
+
+ bool FirstEndInSecond(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second) {
+ if (first->First < second->First) {
+ auto leftBaseRange = *first;
+ leftBaseRange.Last = second->First.Prev();
+ Print(leftBaseRange);
+
+ first->First = second->First;
+ return true;
+ }
+ return false;
+ }
+
+ bool FirstStartInSecond(TAutoPtr<TRange>& first, TAutoPtr<TRange>& second, TReader& stream1, TReader& stream2) {
+ if (first->First >= second->First) {
+ auto leftBaseRange = *first;
+ leftBaseRange.Data = MergeJsonsData(first->Data, second->Data);
+
+ if (first->Last <= second->Last) {
+ second->First = first->Last.Next();
+ GetNext(stream1, first);
+ if (second->First == TAddress::Highest()) {
+ GetNext(stream2, second);
+ }
+ } else {
+ leftBaseRange.Last = second->Last;
+ first->First = second->Last.Next();
+ GetNext(stream2, second);
+ }
+
+ SortData ? PrintSorted(leftBaseRange) : Print(leftBaseRange);
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ TAutoPtr<TRange> BaseRangePtr;
+ TAutoPtr<TRange> PatchRangePtr;
+
+ TReader& BaseStream;
+ TReader& PatchStream;
+ IOutputStream& Output;
+ const bool SortData = false;
+ };
+
+ struct IpChecker {
+ static void LessOrEqual(const size_t row, const TAddress& lastIp, const TAddress& checkedIp) {
+ if (lastIp <= checkedIp) {
+ return;
+ }
+ GenErr(row, " <= ", lastIp, checkedIp);
+ }
+
+ static void Less(const size_t row, const TAddress& lastIp, const TAddress& checkedIp) {
+ if (lastIp < checkedIp) {
+ return;
+ }
+ GenErr(row, " < ", lastIp, checkedIp);
+ }
+
+ static void GenErr(const size_t row, const char* msg, const TAddress& lastIp, const TAddress& checkedIp) {
+ const TString& errMsg = ">>> row#" + ToString(row) + "; " + lastIp.AsIPv6() + msg + checkedIp.AsIPv6();
+ throw std::runtime_error(errMsg.data());
+ }
+ };
+
+ class MergerBy3 {
+ public:
+ MergerBy3(const TString& geodataPath, IOutputStream& output)
+ : Geobase(geodataPath)
+ , Out(output)
+ {}
+
+ void Process(TReader& input, bool ByRegsOnly, bool silentMode) {
+ while (input.Next()) {
+ Trio.push_back(input.Get());
+ if (3 > Trio.size()) {
+ continue;
+ }
+
+ auto& range2Data = (++Trio.begin())->Data;
+ if (range2Data.npos != range2Data.find("\"is_placeholder\":1")) {
+ PrintAndDrop1stRange();
+ PrintAndDrop1stRange();
+ continue;
+ }
+
+ const auto range1RegId = GetRegionId(Trio.begin()->Data);
+ const auto range3RegId = GetRegionId(Trio.rbegin()->Data);
+ if (range1RegId != range3RegId) {
+ PrintAndDrop1stRange();
+ continue;
+ }
+
+ const auto range2RegId = GetRegionId(range2Data);
+ const auto& parentsIds = Geobase.GetParentsIds(range1RegId);
+ if (parentsIds.end() == std::find(parentsIds.begin() + 1, parentsIds.end(), range2RegId)) {
+ PrintAndDrop1stRange();
+ continue;
+ }
+
+ if (!ByRegsOnly) {
+ const auto range1Size = Trio.begin()->GetAddrsQty();
+ const auto range2Size = (++Trio.begin())->GetAddrsQty();
+ const auto range3Size = Trio.rbegin()->GetAddrsQty();
+
+ if (range2Size > (range1Size + range3Size)) {
+ PrintAndDrop1stRange();
+ continue;
+ }
+ }
+
+ range2Data = SubstRegionId(range2Data, range1RegId);
+ if (!silentMode) {
+ PrintSubstNote(range2RegId, range1RegId);
+ }
+
+ PrintAndDrop1stRange(); // 1st
+ PrintAndDrop1stRange(); // 2nd
+ }
+
+ while (Trio.end() != Trio.begin()) {
+ PrintAndDrop1stRange();
+ }
+ }
+ private:
+ void PrintAndDrop1stRange() {
+ Out << *Trio.begin();
+ Trio.erase(Trio.begin());
+ }
+
+ void PrintSubstNote(const int oldId, const int newId) {
+ const bool NoData = false;
+ Cerr << "s/" << oldId << "/" << newId << "/: [";
+
+ Trio.begin()->DumpTo(Cerr, NoData);
+ Cerr << "/" << Trio.begin()->GetAddrsQty() << " | ";
+
+ const auto& range2nd = *(++Trio.begin());
+ range2nd.DumpTo(Cerr, NoData);
+ Cerr << "/" << range2nd.GetAddrsQty() << " | ";
+
+ Trio.rbegin()->DumpTo(Cerr, NoData);
+ Cerr << "/" << Trio.rbegin()->GetAddrsQty() << "]\n";
+ }
+
+
+ static int GetRegionId(const TString& data) {
+ const auto& json = ParseJsonString(data);
+ auto reg_id = json["region_id"].GetIntegerSafe(0);
+ return 99999 == reg_id ? 10000 : reg_id;
+ }
+
+ static TString SubstRegionId(const TString& data, const int newId) {
+ auto json = ParseJsonString(data);
+ json.InsertValue("region_id", newId);
+ return SortJsonData(json);
+ }
+
+ const NGeobase::TLookup Geobase;
+ IOutputStream& Out;
+ TList<TRange> Trio;
+ };
+ } // anon-ns
+
+ void DoCoarsening(IInputStream& input, IOutputStream& output) {
+ TString line;
+ while (input.ReadLine(line)) {
+ TVector<TString> parts;
+ StringSplitter(line).Split('\t').AddTo(&parts);
+
+ NJson::TJsonValue jsonData;
+ CorrectReliability(jsonData, parts[1]);
+ output << parts[0] << "\t" << "{\""
+ << REG_FIELD << "\":" << jsonData[REG_FIELD] << ",\""
+ << REL_FIELD << "\":" << Prec(jsonData[REL_FIELD].GetDouble(), PREC_POINT_DIGITS_STRIP_ZEROES, 2)
+ << "}\n";
+ }
+ }
+
+ void DoMergeEqualsRange(TReader& input, IOutputStream& output) {
+ // TODO(dieash@) maybe check region for parent/child relation
+ // , const TString& geodataPath
+ // NGeobase::TLookup geoLookup(geodataPath);
+
+ TVector<TString> rangeDataList;
+ TRange lastRange{};
+
+ const char* REG_ID_ATTR = "region_id";
+ const char* ORG_NET_ATTR = "orig_net_size";
+ const char* HUGE_SIZE_VALUE = "huge";
+
+ const int HUGE_SIZE_COEFF = 100;
+
+ const auto CalcRegionBinding = [&]() {
+ if (rangeDataList.empty()) {
+ throw std::runtime_error("empty data list");
+ }
+
+ if (1 == rangeDataList.size()) {
+ return rangeDataList[0];
+ }
+
+ size_t maxAmount{};
+ NJson::TJsonValue maxData;
+
+ THashMap<NGeobase::TId, size_t> reg2amount;
+ for (const auto& data : rangeDataList) {
+ const auto& json = ParseJsonString(data);
+
+ const auto id = json[REG_ID_ATTR].GetInteger();
+ const auto amount = (json.Has(ORG_NET_ATTR) && HUGE_SIZE_VALUE == json[ORG_NET_ATTR].GetString()) ? HUGE_SIZE_COEFF : FromString<int>(json[ORG_NET_ATTR].GetString());
+ reg2amount[id] += amount;
+
+ if (reg2amount[id] > maxAmount) {
+ maxAmount = reg2amount[id];
+ maxData = json;
+ }
+ }
+
+ maxData.EraseValue(ORG_NET_ATTR);
+ return SortJsonData(maxData);
+ };
+
+ const auto PrintRow = [&]() {
+ if (rangeDataList.empty()) {
+ return;
+ }
+ lastRange.Data = CalcRegionBinding();
+ output << lastRange;
+ };
+
+ while (input.Next()) {
+ auto currRange = input.Get();
+ if (currRange != lastRange) {
+ PrintRow();
+
+ lastRange = currRange;
+ rangeDataList = {};
+ }
+
+ rangeDataList.push_back(currRange.Data);
+ }
+ PrintRow();
+ }
+
+ void DoMerging(TReader& input, IOutputStream& output, const MergeTraits& traits) {
+ if (!input.Next()) {
+ return; // empty file here
+ }
+
+ const bool IsJsonData = traits.ConcatSep.empty();
+
+ TRange joinedRange = input.Get();
+ if (traits.SortData) {
+ joinedRange.Data = SortJson(joinedRange.Data);
+ }
+
+ while (input.Next()) {
+ auto currRange = input.Get();
+ if (traits.SortData) {
+ currRange.Data = SortJson(currRange.Data);
+ }
+
+ if (currRange.Contains(joinedRange) && joinedRange.Data == currRange.Data) {
+ joinedRange = currRange;
+ continue;
+ }
+
+ if (traits.JoinNestedRanges && joinedRange.Contains(currRange) && joinedRange.Data == currRange.Data) {
+ continue;
+ }
+
+ if ( currRange.First != joinedRange.Last.Next()
+ || ( IsJsonData && !IsJsonEquals(traits.ExcludeFieldsList, currRange.Data, joinedRange.Data))
+ || (!IsJsonData && currRange.Data != joinedRange.Data))
+ {
+ output << joinedRange;
+ joinedRange = currRange;
+ } else {
+ if (IsJsonData) {
+ joinedRange.Data = MergeJsonsData(currRange.Data, joinedRange.Data, traits.SortData, traits.CountMerges);
+ } else {
+ joinedRange.Data = (joinedRange.Data == currRange.Data) ? joinedRange.Data : (joinedRange.Data + traits.ConcatSep + currRange.Data);
+ }
+ joinedRange.Last = currRange.Last;
+ }
+ }
+
+ output << joinedRange;
+ }
+
+ void DoMerging3(TReader& input, IOutputStream& output, const TString& geodata, bool ByRegsOnly, bool silentMode) {
+ MergerBy3 merger(geodata, output);
+ merger.Process(input, ByRegsOnly, silentMode);
+ }
+
+ void DoPatching(TReader& base, TReader& patch, IOutputStream& output, bool sortData) {
+ Patcher(base, patch, output, sortData).Process();
+ }
+
+ const TString STUB_DATA{"{\"is_placeholder\":1,\"region_id\":10000,\"reliability\":0}"};
+
+ void AddStubRanges(TReader& input, IOutputStream& output) {
+ TRange stub{
+ TAddress::Lowest(),
+ TAddress::Lowest(),
+ STUB_DATA
+ };
+
+ while (input.Next()) {
+ const auto& currRange = input.Get();
+
+ if (stub.First > currRange.First) {
+ const TString& errMsg = ">>> bad ranges ($stub.begin > $next.begin) // " + stub.First.AsShortIPv6() + " | " + currRange.First.AsShortIPv6();
+ throw std::runtime_error(errMsg.data());
+ }
+
+ if (stub.First < currRange.First) {
+ stub.Last = currRange.First.Prev();
+ output << stub;
+ }
+
+ output << currRange;
+ stub.First = currRange.Last.Next();
+ }
+
+ if (stub.First != TAddress::Highest()) {
+ stub.Last = TAddress::Highest();
+ output << stub;
+ }
+ }
+
+ void CheckAddressSpaceForCompleteness(IInputStream& input, IOutputStream& output) {
+ TAddress lastIp = TAddress::Lowest();
+ size_t row_number = 0;
+
+ TString line;
+ while (input.ReadLine(line)) {
+ ++row_number;
+ output << line << "\n";
+
+ const auto& currRange = TRange::BuildRange(line);
+ if (row_number == 1) {
+ if (currRange.First != TAddress::Lowest()) {
+ const TString err_msg = "bad first addr (ip / wanted_ip) => " + currRange.First.AsIPv6() + " / " + TAddress::Lowest().AsIPv6();
+ throw std::runtime_error(err_msg);
+ }
+ lastIp = currRange.Last;
+ continue;
+ }
+
+ if (lastIp == currRange.First || lastIp.Next() != currRange.First) {
+ const TString err_msg = ">>> row#" + ToString(row_number) + " bad pair (last_ip / next_ip) => " + lastIp.AsIPv6() + " / " + currRange.First.AsIPv6();
+ throw std::runtime_error(err_msg);
+ }
+
+ lastIp = currRange.Last;
+ }
+
+ if (lastIp != TAddress::Highest()) {
+ const TString err_msg = "bad last addr (last_ip / wanted_ip) => " + lastIp.AsIPv6() + " / " + TAddress::Highest().AsIPv6();
+ throw std::runtime_error(err_msg);
+ }
+ }
+
+ void CheckRangesForMonotonicSequence(IInputStream& input, IOutputStream& output, bool IsStrict) {
+ TAddress lastIp = TAddress::Lowest();
+
+ size_t row = 0;
+ TString line;
+ while (input.ReadLine(line)) {
+ ++row;
+ output << line << "\n";
+
+ const auto& currRange = TRange::BuildRange(line);
+ if (row == 1) {
+ lastIp = currRange.Last;
+ continue;
+ }
+
+ if (IsStrict) {
+ IpChecker::Less(row, lastIp, currRange.First);
+ } else {
+ IpChecker::LessOrEqual(row, lastIp, currRange.First);
+ }
+ lastIp = currRange.Last;
+ }
+ }
+
+ NJson::TJsonValue ParseJsonString(const TString& data) {
+ const auto throwIfError = true;
+
+ NJson::TJsonValue json;
+ NJson::ReadJsonFastTree(data, &json, throwIfError);
+ return json;
+ }
+
+ TString SortJsonData(const NJson::TJsonValue& json) {
+ const auto NoFormat = false;
+ const auto SortKeys = true;
+
+ return NJson::WriteJson(json, NoFormat, SortKeys);
+ }
+
+ TString SortJsonData(const TString& jsonStr) {
+ return SortJsonData(ParseJsonString(jsonStr));
+ }
+
+ TString AddJsonAttrs(const TVector<TString>& addFieldsList, const TString& jsonStr, const TMaybe<TString>& attrValue) {
+ if (addFieldsList.empty()) {
+ return jsonStr;
+ }
+
+ auto json = ParseJsonString(jsonStr);
+ for (const auto& newField : addFieldsList) {
+ if (!newField.empty()) {
+ if (attrValue) {
+ json.InsertValue(newField, *attrValue);
+ } else {
+ json.InsertValue(newField, 1);
+ }
+ }
+ }
+ return json.GetStringRobust();
+ }
+
+ TString ExcludeJsonAttrs(const TVector<TString>& excludeFieldsList, const TString& jsonStr) {
+ if (excludeFieldsList.empty()) {
+ return jsonStr;
+ }
+
+ auto json = ParseJsonString(jsonStr);
+ for (const auto& excludeField : excludeFieldsList) {
+ if (!excludeField.empty()) {
+ json.EraseValue(excludeField);
+ }
+ }
+ return json.GetStringRobust();
+ }
+
+ TString ExtractJsonAttrs(const TVector<TString>& extractFieldsList, const TString& jsonStr) {
+ if (extractFieldsList.empty()) {
+ return jsonStr;
+ }
+
+ auto json = ParseJsonString(jsonStr);
+ NJson::TJsonValue newJson;
+ for (const auto& field : extractFieldsList) {
+ if (json.Has(field)) {
+ newJson.InsertValue(field, json[field]);
+ }
+ }
+ if (!newJson.IsDefined()) {
+ return {};
+ }
+ return newJson.GetStringRobust();
+ }
+
+ namespace CliParamsDesc {
+ const TString InputFnameParam = "input-data";
+ const TString OutputFnameParam = "output-data";
+ const TString OutputFullIpParam = "show-full-ip";
+ const TString PrintStatsParam = "print-stats";
+ const TString PrintYtStatsParam = "yt-stats";
+
+        const TString InputFnameParamDesc = "path to input IPREG data; leave empty or use '-' for stdin";
+        const TString OutputFnameParamDesc = "path to output file; leave empty for stdout";
+        const TString OutputFullIpParamDesc = "print full IPv6 addresses (short form by default)";
+        const TString PrintStatsParamDesc = "print internal statistics to stderr";
+        const TString PrintYtStatsParamDesc = "print YT stats (to file descriptor 5 by default)";
+ } // ns CliParamsDesc
+
+ DefaultCliParams::DefaultCliParams() {
+ using namespace CliParamsDesc;
+
+ Opts.SetFreeArgsMax(0);
+ Opts.AddHelpOption('h');
+
+ Opts.AddLongOption('i', InputFnameParam)
+ .RequiredArgument("filename")
+ .DefaultValue(InputFname)
+ .StoreResult(&InputFname).Help(InputFnameParamDesc);
+
+ Opts.AddLongOption('o', OutputFnameParam)
+ .RequiredArgument("filename")
+ .DefaultValue(OutputFname)
+ .StoreResult(&OutputFname).Help(OutputFnameParamDesc);
+
+ Opts.AddLongOption('f', OutputFullIpParam)
+ .Optional()
+ .NoArgument()
+ .DefaultValue("0")
+ .OptionalValue("1")
+ .StoreResult(&OutputFullIp).Help(OutputFullIpParamDesc);
+
+ Opts.AddLongOption(PrintStatsParam)
+ .Optional()
+ .NoArgument()
+ .DefaultValue("0")
+ .OptionalValue("1")
+ .StoreResult(&PrintStats).Help(PrintStatsParamDesc);
+
+ Opts.AddLongOption(PrintYtStatsParam)
+ .Optional()
+ .NoArgument()
+ .DefaultValue("0")
+ .OptionalValue("1")
+ .StoreResult(&PrintYtStats).Help(PrintYtStatsParamDesc);
+ }
+
+ void DefaultCliParams::ApplyFlags() const {
+ if (OutputFullIp) {
+ SetIpFullOutFormat();
+ }
+ }
+
+ void DefaultCliParams::Parse(int argc, const char **argv) {
+ NLastGetopt::TOptsParseResult optRes(&GetOpts(), argc, argv);
+ ApplyFlags();
+ }
+
+} // NIPREG
diff --git a/library/cpp/ipreg/util_helpers.h b/library/cpp/ipreg/util_helpers.h
new file mode 100644
index 0000000000..eab2dfb320
--- /dev/null
+++ b/library/cpp/ipreg/util_helpers.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <library/cpp/getopt/opt.h>
+#include <util/generic/string.h>
+#include <util/generic/maybe.h>
+
+class IInputStream;
+class IOutputStream;
+
+namespace NJson {
+ class TJsonValue;
+}
+
+namespace NIPREG {
+ class TReader;
+
+ // @input any form of range+payload
+ // @output $ip.begin-$ip.end \t {"region_id":$reg,"reliability":$rel}
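+    //
+    // Illustrative sketch of the intended transformation (assumed from the format notes
+    // above, not taken from the implementation; addresses and payload values are made up):
+    //   in:  192.168.0.0-192.168.0.255 \t {"region_id":213,"reliability":0.9,"isp":"example"}
+    //   out: 192.168.0.0-192.168.0.255 \t {"region_id":213,"reliability":0.9}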
+ void DoCoarsening(IInputStream& input, IOutputStream& output);
+
+ struct MergeTraits {
+ const TVector<TString> ExcludeFieldsList;
+ TString ConcatSep;
+ bool SortData{};
+ bool CountMerges{};
+ bool JoinNestedRanges{};
+ };
+
+ void DoMerging(TReader& input, IOutputStream& output, const MergeTraits& traits);
+ void DoMerging3(TReader& input, IOutputStream& output, const TString& geodata, bool ByRegsOnly = false, bool silentMode = false);
+ void DoMergeEqualsRange(TReader& input, IOutputStream& output);
+
+ void DoPatching(TReader& base, TReader& patch, IOutputStream& output, bool sortData = false);
+
+ void AddStubRanges(TReader& input, IOutputStream& output);
+
+ void CheckAddressSpaceForCompleteness(IInputStream& input, IOutputStream& output);
+ void CheckRangesForMonotonicSequence(IInputStream& input, IOutputStream& output, bool IsStrict = false);
+
+ NJson::TJsonValue ParseJsonString(const TString& data);
+ TString SortJsonData(const NJson::TJsonValue& json);
+ TString SortJsonData(const TString& json);
+
+ TString AddJsonAttrs(const TVector<TString>& addFieldsList, const TString& jsonStr, const TMaybe<TString>& attrValue);
+ TString ExcludeJsonAttrs(const TVector<TString>& excludeFieldsList, const TString& jsonStr);
+    TString ExtractJsonAttrs(const TVector<TString>& extractFieldsList, const TString& jsonStr);
+
+ extern const TString STUB_DATA;
+
+ struct DefaultCliParams {
+ DefaultCliParams();
+
+ NLastGetopt::TOpts& GetOpts() { return Opts; }
+ void Parse(int argc, const char **argv);
+ void ApplyFlags() const;
+
+ TString InputFname = "-";
+ TString OutputFname = "";
+ bool OutputFullIp = false;
+ bool PrintStats = false;
+ bool PrintYtStats = false;
+
+ NLastGetopt::TOpts Opts;
+ };
+} // NIPREG
diff --git a/library/cpp/ipreg/writer.cpp b/library/cpp/ipreg/writer.cpp
new file mode 100644
index 0000000000..89f8c8b629
--- /dev/null
+++ b/library/cpp/ipreg/writer.cpp
@@ -0,0 +1,91 @@
+#include "writer.h"
+
+#include <util/stream/file.h>
+
+namespace NIPREG {
+
+TWriter::TWriter(const TString& fname)
+ : OwnedStreamPtr(fname.empty() ? nullptr : new TFileOutput(fname))
+ , Stream(OwnedStreamPtr ? *OwnedStreamPtr.Get() : Cout)
+ , AddrSeparator(ADDR_SEP)
+ , DataSeparator(DATA_SEP)
+ , SplitMixed(false)
+{
+}
+
+TWriter::TWriter(IOutputStream& stream, EAddressFormat addressFormat, const TString& addrSep, const TString& dataSep, const bool splitMixed)
+ : Stream(stream)
+ , AddressFormat(addressFormat)
+ , AddrSeparator(addrSep)
+ , DataSeparator(dataSep)
+ , SplitMixed(splitMixed)
+{
+}
+
+namespace {
+ const TAddress IPv4Start = TAddress::ParseIPv4("0.0.0.0");
+ const TAddress IPv4End = TAddress::ParseIPv4("255.255.255.255");
+
+ const TAddress IPv6BeforeV4 = IPv4Start.Prev();
+ const TAddress IPv6AfterV4 = IPv4End.Next();
+}
+
+void TWriter::Write(const TAddress& first, const TAddress& last, const TString& data, bool printRange) {
+ if (SplitMixed) {
+ if (first < IPv4Start && IPv4Start < last) {
+ Write(first, IPv6BeforeV4, data, printRange);
+ Write(IPv4Start, last, data, printRange);
+ return;
+ }
+
+ if (first < IPv4End && IPv4End < last) {
+ Write(first, IPv4End, data, printRange);
+ Write(IPv6AfterV4, last, data, printRange);
+ return;
+ }
+ }
+ WriteImpl(first, last, data, printRange);
+}
+
+void TWriter::WriteImpl(const TAddress& first, const TAddress& last, const TString& data, bool printRange) {
+ if (printRange) {
+ Stream << first.Format(AddressFormat) << AddrSeparator << last.Format(AddressFormat);
+ }
+ if (!data.empty()) {
+ if (printRange) {
+ Stream << DataSeparator;
+ }
+ Stream << data;
+ }
+ if (!data.empty() || printRange) {
+ Stream << "\n";
+ }
+}
+
+void TWriter::Finalize() {
+}
+
+TMergingWriter::TMergingWriter(IOutputStream& stream, EAddressFormat addressFormat, const TString& addrSep, const TString& dataSep, const bool splitMixed)
+ : TWriter(stream, addressFormat, addrSep, dataSep, splitMixed) {
+}
+
+void TMergingWriter::Write(const TAddress& first, const TAddress& last, const TString& data, bool) {
+ if (Initialized && data == StoredData && first == StoredLast.Next()) {
+ StoredLast = last;
+ } else {
+ if (Initialized)
+ TWriter::Write(StoredFirst, StoredLast, StoredData);
+ StoredFirst = first;
+ StoredLast = last;
+ StoredData = data;
+ Initialized = true;
+ }
+}
+
+void TMergingWriter::Finalize() {
+ if (Initialized)
+ TWriter::Write(StoredFirst, StoredLast, StoredData);
+ Initialized = false;
+}
+
+} // NIPREG
diff --git a/library/cpp/ipreg/writer.h b/library/cpp/ipreg/writer.h
new file mode 100644
index 0000000000..a4232a89a6
--- /dev/null
+++ b/library/cpp/ipreg/writer.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include "range.h"
+
+#include <util/generic/ptr.h>
+#include <util/generic/string.h>
+#include <util/stream/output.h>
+
+namespace NIPREG {
+
+class TWriter {
+public:
+ static constexpr char const * const ADDR_SEP = "-";
+ static constexpr char const * const DATA_SEP = "\t";
+
+public:
+ TWriter(const TString& filename = "");
+ TWriter(IOutputStream& stream, EAddressFormat addressFormat = EAddressFormat::IPV6, const TString& addrSep = ADDR_SEP, const TString& dataSep = DATA_SEP, const bool splitMixed = false);
+ TWriter(IOutputStream& stream, const TString& addrSep, EAddressFormat addressFormat)
+ : TWriter(stream, addressFormat, addrSep, addrSep)
+ {}
+ virtual ~TWriter() {}
+
+ void Write(const TGenericEntry& entry, bool printRange = true) {
+ Write(entry.First, entry.Last, entry.Data, printRange);
+ }
+ virtual void Write(const TAddress& first, const TAddress& last, const TString& data, bool printRange = true);
+ virtual void Finalize();
+
+ operator IOutputStream&() {
+ return Stream;
+ }
+
+private:
+ void WriteImpl(const TAddress& first, const TAddress& last, const TString& data, bool printRange);
+
+ TAutoPtr<IOutputStream> OwnedStreamPtr;
+ IOutputStream& Stream;
+
+ EAddressFormat AddressFormat = EAddressFormat::IPV6;
+ const TString AddrSeparator = ADDR_SEP;
+ const TString DataSeparator = DATA_SEP;
+ const bool SplitMixed;
+};
+
+class TMergingWriter : public TWriter {
+public:
+ TMergingWriter(IOutputStream& stream, EAddressFormat addressFormat = EAddressFormat::IPV6, const TString& addrSep = ADDR_SEP, const TString& dataSep = DATA_SEP, const bool splitMixed = false);
+ TMergingWriter(IOutputStream& stream, const TString& addrSep, EAddressFormat addressFormat)
+ : TWriter(stream, addressFormat, addrSep, addrSep)
+ {}
+ void Write(const TAddress& first, const TAddress& last, const TString& data, bool printRange = true) final override;
+ void Finalize() final;
+
+private:
+ TAddress StoredFirst;
+ TAddress StoredLast;
+ TString StoredData;
+ bool Initialized = false;
+};
+
+} // NIPREG
diff --git a/library/cpp/ipreg/ya.make b/library/cpp/ipreg/ya.make
new file mode 100644
index 0000000000..b03720f761
--- /dev/null
+++ b/library/cpp/ipreg/ya.make
@@ -0,0 +1,26 @@
+LIBRARY()
+
+SRCS(
+ address.cpp
+ checker.cpp
+ merge.cpp
+ range.cpp
+ reader.cpp
+ sources.cpp
+ split.cpp
+ stopwatch.cpp
+ writer.cpp
+ util_helpers.cpp
+)
+
+PEERDIR(
+ library/cpp/getopt/small
+ library/cpp/json
+ library/cpp/geobase
+ library/cpp/int128
+)
+
+GENERATE_ENUM_SERIALIZATION(address.h)
+GENERATE_ENUM_SERIALIZATION(sources.h)
+
+END()
diff --git a/library/cpp/langmask/CMakeLists.txt b/library/cpp/langmask/CMakeLists.txt
new file mode 100644
index 0000000000..499930c4b0
--- /dev/null
+++ b/library/cpp/langmask/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+add_subdirectory(proto)
diff --git a/library/cpp/langmask/proto/CMakeLists.darwin-x86_64.txt b/library/cpp/langmask/proto/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..e9f692d0f2
--- /dev/null
+++ b/library/cpp/langmask/proto/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,43 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-langmask-proto)
+target_link_libraries(cpp-langmask-proto PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-langmask-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto
+)
+target_proto_addincls(cpp-langmask-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-langmask-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/langmask/proto/CMakeLists.linux-aarch64.txt b/library/cpp/langmask/proto/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..61f975983e
--- /dev/null
+++ b/library/cpp/langmask/proto/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,44 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-langmask-proto)
+target_link_libraries(cpp-langmask-proto PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-langmask-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto
+)
+target_proto_addincls(cpp-langmask-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-langmask-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/langmask/proto/CMakeLists.linux-x86_64.txt b/library/cpp/langmask/proto/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..61f975983e
--- /dev/null
+++ b/library/cpp/langmask/proto/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,44 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-langmask-proto)
+target_link_libraries(cpp-langmask-proto PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-langmask-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto
+)
+target_proto_addincls(cpp-langmask-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-langmask-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/langmask/proto/CMakeLists.txt b/library/cpp/langmask/proto/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/langmask/proto/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/langmask/proto/CMakeLists.windows-x86_64.txt b/library/cpp/langmask/proto/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..e9f692d0f2
--- /dev/null
+++ b/library/cpp/langmask/proto/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,43 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-langmask-proto)
+target_link_libraries(cpp-langmask-proto PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-langmask-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/langmask/proto/langmask.proto
+)
+target_proto_addincls(cpp-langmask-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-langmask-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/langmask/proto/langmask.proto b/library/cpp/langmask/proto/langmask.proto
new file mode 100644
index 0000000000..be23ecfbba
--- /dev/null
+++ b/library/cpp/langmask/proto/langmask.proto
@@ -0,0 +1,6 @@
+package NProto;
+
+message TLangMask {
+ repeated uint32 Bits = 1; // binary
+ optional string Names = 2; // human readable
+}
diff --git a/library/cpp/langmask/proto/ya.make b/library/cpp/langmask/proto/ya.make
new file mode 100644
index 0000000000..823a0ad261
--- /dev/null
+++ b/library/cpp/langmask/proto/ya.make
@@ -0,0 +1,11 @@
+PROTO_LIBRARY()
+
+SRCS(
+ langmask.proto
+)
+
+IF (NOT PY_PROTOS_FOR)
+ EXCLUDE_TAGS(GO_PROTO)
+ENDIF()
+
+END()
diff --git a/library/cpp/microbdb/CMakeLists.darwin-x86_64.txt b/library/cpp/microbdb/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..c4d2e9d3a4
--- /dev/null
+++ b/library/cpp/microbdb/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,56 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+find_package(ZLIB REQUIRED)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(library-cpp-microbdb)
+target_link_libraries(library-cpp-microbdb PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-fastlz
+ contrib-libs-libc_compat
+ contrib-libs-protobuf
+ contrib-libs-snappy
+ ZLIB::ZLIB
+ cpp-deprecated-fgood
+ cpp-on_disk-st_hash
+ library-cpp-packedtypes
+)
+target_proto_messages(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto
+)
+target_sources(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp
+)
+target_proto_addincls(library-cpp-microbdb
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(library-cpp-microbdb
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/microbdb/CMakeLists.linux-aarch64.txt b/library/cpp/microbdb/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..302dbd03cd
--- /dev/null
+++ b/library/cpp/microbdb/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,57 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+find_package(ZLIB REQUIRED)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(library-cpp-microbdb)
+target_link_libraries(library-cpp-microbdb PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-fastlz
+ contrib-libs-libc_compat
+ contrib-libs-protobuf
+ contrib-libs-snappy
+ ZLIB::ZLIB
+ cpp-deprecated-fgood
+ cpp-on_disk-st_hash
+ library-cpp-packedtypes
+)
+target_proto_messages(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto
+)
+target_sources(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp
+)
+target_proto_addincls(library-cpp-microbdb
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(library-cpp-microbdb
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/microbdb/CMakeLists.linux-x86_64.txt b/library/cpp/microbdb/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..302dbd03cd
--- /dev/null
+++ b/library/cpp/microbdb/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,57 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+find_package(ZLIB REQUIRED)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(library-cpp-microbdb)
+target_link_libraries(library-cpp-microbdb PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-fastlz
+ contrib-libs-libc_compat
+ contrib-libs-protobuf
+ contrib-libs-snappy
+ ZLIB::ZLIB
+ cpp-deprecated-fgood
+ cpp-on_disk-st_hash
+ library-cpp-packedtypes
+)
+target_proto_messages(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto
+)
+target_sources(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp
+)
+target_proto_addincls(library-cpp-microbdb
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(library-cpp-microbdb
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/microbdb/CMakeLists.txt b/library/cpp/microbdb/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/microbdb/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/microbdb/CMakeLists.windows-x86_64.txt b/library/cpp/microbdb/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..c4d2e9d3a4
--- /dev/null
+++ b/library/cpp/microbdb/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,56 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+find_package(ZLIB REQUIRED)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(library-cpp-microbdb)
+target_link_libraries(library-cpp-microbdb PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-fastlz
+ contrib-libs-libc_compat
+ contrib-libs-protobuf
+ contrib-libs-snappy
+ ZLIB::ZLIB
+ cpp-deprecated-fgood
+ cpp-on_disk-st_hash
+ library-cpp-packedtypes
+)
+target_proto_messages(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/noextinfo.proto
+)
+target_sources(library-cpp-microbdb PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/file.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/header.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/microbdb/microbdb.cpp
+)
+target_proto_addincls(library-cpp-microbdb
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(library-cpp-microbdb
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/microbdb/align.h b/library/cpp/microbdb/align.h
new file mode 100644
index 0000000000..2f8567f134
--- /dev/null
+++ b/library/cpp/microbdb/align.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <util/system/defaults.h>
+
+using TDatAlign = int;
+
+static inline size_t DatFloor(size_t size) {
+ return (size - 1) & ~(sizeof(TDatAlign) - 1);
+}
+
+static inline size_t DatCeil(size_t size) {
+ return DatFloor(size) + sizeof(TDatAlign);
+}
+
+static inline void DatSet(void* ptr, size_t size) {
+ *(TDatAlign*)((char*)ptr + DatFloor(size)) = 0;
+}
diff --git a/library/cpp/microbdb/compressed.h b/library/cpp/microbdb/compressed.h
new file mode 100644
index 0000000000..f0c9edfa92
--- /dev/null
+++ b/library/cpp/microbdb/compressed.h
@@ -0,0 +1,520 @@
+#pragma once
+
+#include <util/stream/zlib.h>
+
+#include "microbdb.h"
+#include "safeopen.h"
+
+class TCompressedInputFileManip: public TInputFileManip {
+public:
+ inline i64 GetLength() const {
+        return -1; // Some microbdb logic relies on the size of compressed files being unknown
+ }
+
+ inline i64 Seek(i64 offset, int whence) {
+ i64 oldPos = DoGetPosition();
+ i64 newPos = offset;
+ switch (whence) {
+ case SEEK_CUR:
+ newPos += oldPos;
+                [[fallthrough]]; // Keeps the compiler happy. Please fix it!
+ case SEEK_SET:
+ break;
+ default:
+ return -1L;
+ }
+ if (oldPos > newPos) {
+ VerifyRandomAccess();
+ DoSeek(0, SEEK_SET, IsStreamOpen());
+ oldPos = 0;
+ }
+ const size_t bufsize = 1 << 12;
+ char buf[bufsize];
+ for (i64 i = oldPos; i < newPos; i += bufsize)
+ InputStream->Read(buf, (i + (i64)bufsize < newPos) ? bufsize : (size_t)(newPos - i));
+ return newPos;
+ }
+
+ i64 RealSeek(i64 offset, int whence) {
+ InputStream.Destroy();
+ i64 ret = DoSeek(offset, whence, !!CompressedInput);
+ if (ret != -1)
+ DoStreamOpen(DoCreateStream(), true);
+ return ret;
+ }
+
+protected:
+ IInputStream* CreateStream(const TFile& file) override {
+ CompressedInput.Reset(new TUnbufferedFileInput(file));
+ return DoCreateStream();
+ }
+ inline IInputStream* DoCreateStream() {
+ return new TZLibDecompress(CompressedInput.Get(), ZLib::GZip);
+ //return new TLzqDecompress(CompressedInput.Get());
+ }
+ THolder<IInputStream> CompressedInput;
+};
+
+class TCompressedBufferedInputFileManip: public TCompressedInputFileManip {
+protected:
+ IInputStream* CreateStream(const TFile& file) override {
+ CompressedInput.Reset(new TFileInput(file, 0x100000));
+ return DoCreateStream();
+ }
+};
+
+using TCompressedInputPageFile = TInputPageFileImpl<TCompressedInputFileManip>;
+using TCompressedBufferedInputPageFile = TInputPageFileImpl<TCompressedBufferedInputFileManip>;
+
+template <class TVal>
+struct TGzKey {
+ ui64 Offset;
+ TVal Key;
+
+ static const ui32 RecordSig = TVal::RecordSig + 0x50495a47;
+
+ TGzKey() {
+ }
+
+ TGzKey(ui64 offset, const TVal& key)
+ : Offset(offset)
+ , Key(key)
+ {
+ }
+
+ size_t SizeOf() const {
+ if (this)
+ return sizeof(Offset) + ::SizeOf(&Key);
+ else {
+ size_t sizeOfKey = ::SizeOf((TVal*)NULL);
+ return sizeOfKey ? (sizeof(Offset) + sizeOfKey) : 0;
+ }
+ }
+};
+
+template <class TVal>
+class TInZIndexFile: protected TInDatFileImpl<TGzKey<TVal>> {
+ typedef TInDatFileImpl<TGzKey<TVal>> TDatFile;
+ typedef TGzKey<TVal> TGzVal;
+ typedef typename TDatFile::TRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+
+public:
+ TInZIndexFile()
+ : Index0(nullptr)
+ {
+ }
+
+ int Open(const char* fname, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) {
+ int ret = TDatFile::Open(fname, pages, pagesOrBytes, gotRecordSig);
+ if (ret)
+ return ret;
+ if (!(Index0 = (TDatPage*)malloc(TPageIter::GetPageSize()))) {
+ TDatFile::Close();
+ return MBDB_NO_MEMORY;
+ }
+ if (SizeOf((TGzVal*)NULL))
+ RecsOnPage = (TPageIter::GetPageSize() - sizeof(TDatPage)) / DatCeil(SizeOf((TGzVal*)NULL));
+ TDatFile::Next();
+ memcpy(Index0, TPageIter::Current(), TPageIter::GetPageSize());
+ return 0;
+ }
+
+ int Close() {
+ free(Index0);
+ Index0 = NULL;
+ return TDatFile::Close();
+ }
+
+ inline int GetError() const {
+ return TDatFile::GetError();
+ }
+
+ int FindKey(const TVal* akey, const typename TExtInfoType<TVal>::TResult* = NULL) {
+ assert(IsOpen());
+ if (!SizeOf((TVal*)NULL))
+ return FindVszKey(akey);
+ int pageno;
+ i64 offset;
+ FindKeyOnPage(pageno, offset, Index0, akey);
+ TDatPage* page = TPageIter::GotoPage(pageno + 1);
+ int num_add = (int)offset;
+ FindKeyOnPage(pageno, offset, page, akey);
+ return pageno + num_add;
+ }
+
+ using TDatFile::IsOpen;
+
+ int FindVszKey(const TVal* akey, const typename TExtInfoType<TVal>::TResult* = NULL) {
+ int pageno;
+ i64 offset;
+ FindVszKeyOnPage(pageno, offset, Index0, akey);
+ TDatPage* page = TPageIter::GotoPage(pageno + 1);
+ int num_add = (int)offset;
+ FindVszKeyOnPage(pageno, offset, page, akey);
+ return pageno + num_add;
+ }
+
+ i64 FindPage(int pageno) {
+ if (!SizeOf((TVal*)NULL))
+ return FindVszPage(pageno);
+ int recsize = DatCeil(SizeOf((TGzVal*)NULL));
+ TDatPage* page = TPageIter::GotoPage(1 + pageno / RecsOnPage);
+ if (!page) // can happen if pageno is beyond EOF
+ return -1;
+ unsigned int localpageno = pageno % RecsOnPage;
+ if (localpageno >= page->RecNum) // can happen if pageno is beyond EOF
+ return -1;
+ TGzVal* v = (TGzVal*)((char*)page + sizeof(TDatPage) + localpageno * recsize);
+ return v->Offset;
+ }
+
+ i64 FindVszPage(int pageno) {
+ TGzVal* cur = (TGzVal*)((char*)Index0 + sizeof(TDatPage));
+ TGzVal* prev = cur;
+ unsigned int n = 0;
+ while (n < Index0->RecNum && cur->Offset <= (unsigned int)pageno) {
+ prev = cur;
+ cur = (TGzVal*)((char*)cur + DatCeil(SizeOf(cur)));
+ n++;
+ }
+ TDatPage* page = TPageIter::GotoPage(n);
+ unsigned int num_add = (unsigned int)(prev->Offset);
+ n = 0;
+ cur = (TGzVal*)((char*)page + sizeof(TDatPage));
+ while (n < page->RecNum && n + num_add < (unsigned int)pageno) {
+ cur = (TGzVal*)((char*)cur + DatCeil(SizeOf(cur)));
+ n++;
+ }
+ if (n == page->RecNum) // can happen if pageno is beyond EOF
+ return -1;
+ return cur->Offset;
+ }
+
+protected:
+ void FindKeyOnPage(int& pageno, i64& offset, TDatPage* page, const TVal* Key) {
+ int left = 0;
+ int right = page->RecNum - 1;
+ int recsize = DatCeil(SizeOf((TGzVal*)NULL));
+ while (left < right) {
+ int middle = (left + right) >> 1;
+ if (((TGzVal*)((char*)page + sizeof(TDatPage) + middle * recsize))->Key < *Key)
+ left = middle + 1;
+ else
+ right = middle;
+ }
+ //borders check (left and right)
+ pageno = (left == 0 || ((TGzVal*)((char*)page + sizeof(TDatPage) + left * recsize))->Key < *Key) ? left : left - 1;
+ offset = ((TGzVal*)((char*)page + sizeof(TDatPage) + pageno * recsize))->Offset;
+ }
+
+ void FindVszKeyOnPage(int& pageno, i64& offset, TDatPage* page, const TVal* key) {
+ TGzVal* cur = (TGzVal*)((char*)page + sizeof(TDatPage));
+ ui32 RecordSig = page->RecNum;
+ i64 tmpoffset = cur->Offset;
+ for (; RecordSig > 0 && cur->Key < *key; --RecordSig) {
+ tmpoffset = cur->Offset;
+ cur = (TGzVal*)((char*)cur + DatCeil(SizeOf(cur)));
+ }
+ int idx = page->RecNum - RecordSig - 1;
+ pageno = (idx >= 0) ? idx : 0;
+ offset = tmpoffset;
+ }
+
+ TDatPage* Index0;
+ int RecsOnPage;
+};
+
+template <class TKey>
+class TCompressedIndexedInputPageFile: public TCompressedInputPageFile {
+public:
+ int GotoPage(int pageno);
+
+protected:
+ TInZIndexFile<TKey> KeyFile;
+};
+
+template <class TVal, class TKey>
+class TDirectCompressedInDatFile: public TDirectInDatFile<TVal, TKey,
+ TInDatFileImpl<TVal, TInputRecordIterator<TVal,
+ TInputPageIterator<TCompressedIndexedInputPageFile<TKey>>>>> {
+};
+
+class TCompressedOutputFileManip: public TOutputFileManip {
+public:
+ inline i64 GetLength() const {
+        return -1; // Some microbdb logic relies on the size of compressed files being unknown
+ }
+
+ inline i64 Seek(i64 offset, int whence) {
+ i64 oldPos = DoGetPosition();
+ i64 newPos = offset;
+ switch (whence) {
+ case SEEK_CUR:
+ newPos += oldPos;
+                [[fallthrough]]; // Keeps the compiler happy. Please fix it!
+ case SEEK_SET:
+ break;
+ default:
+ return -1L;
+ }
+ if (oldPos > newPos)
+ return -1L;
+
+ const size_t bufsize = 1 << 12;
+ char buf[bufsize] = {0};
+ for (i64 i = oldPos; i < newPos; i += bufsize)
+ OutputStream->Write(buf, (i + (i64)bufsize < newPos) ? bufsize : (size_t)(newPos - i));
+ return newPos;
+ }
+
+ i64 RealSeek(i64 offset, int whence) {
+ OutputStream.Destroy();
+ i64 ret = DoSeek(offset, whence, !!CompressedOutput);
+ if (ret != -1)
+ DoStreamOpen(DoCreateStream(), true);
+ return ret;
+ }
+
+protected:
+ IOutputStream* CreateStream(const TFile& file) override {
+ CompressedOutput.Reset(new TUnbufferedFileOutput(file));
+ return DoCreateStream();
+ }
+ inline IOutputStream* DoCreateStream() {
+ return new TZLibCompress(CompressedOutput.Get(), ZLib::GZip, 1);
+ }
+ THolder<IOutputStream> CompressedOutput;
+};
+
+class TCompressedBufferedOutputFileManip: public TCompressedOutputFileManip {
+protected:
+ IOutputStream* CreateStream(const TFile& file) override {
+ CompressedOutput.Reset(new TUnbufferedFileOutput(file));
+ return DoCreateStream();
+ }
+ inline IOutputStream* DoCreateStream() {
+ return new TZLibCompress(CompressedOutput.Get(), ZLib::GZip, 1, 0x100000);
+ }
+};
+
+using TCompressedOutputPageFile = TOutputPageFileImpl<TCompressedOutputFileManip>;
+using TCompressedBufferedOutputPageFile = TOutputPageFileImpl<TCompressedBufferedOutputFileManip>;
+
+template <class TVal>
+class TOutZIndexFile: public TOutDatFileImpl<
+ TGzKey<TVal>,
+ TOutputRecordIterator<TGzKey<TVal>, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer>> {
+ typedef TOutDatFileImpl<
+ TGzKey<TVal>,
+ TOutputRecordIterator<TGzKey<TVal>, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer>>
+ TDatFile;
+ typedef TOutZIndexFile<TVal> TMyType;
+ typedef TGzKey<TVal> TGzVal;
+ typedef typename TDatFile::TRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+ typedef typename TRecIter::TIndexer TIndexer;
+
+public:
+ TOutZIndexFile() {
+ TotalRecNum = 0;
+ TIndexer::SetCallback(this, DispatchCallback);
+ }
+
+ int Open(const char* fname, size_t pagesize, size_t pages, int pagesOrBytes = 1) {
+ int ret = TDatFile::Open(fname, pagesize, pages, pagesOrBytes);
+ if (ret)
+ return ret;
+ if ((ret = TRecIter::GotoPage(1)))
+ TDatFile::Close();
+ return ret;
+ }
+
+ int Close() {
+ TPageIter::Unfreeze();
+ if (TRecIter::RecNum)
+ NextPage(TPageIter::Current());
+ int ret = 0;
+ if (Index0.size() && !(ret = TRecIter::GotoPage(0))) {
+ typename std::vector<TGzVal>::iterator it, end = Index0.end();
+ for (it = Index0.begin(); it != end; ++it)
+ TRecIter::Push(&*it);
+ ret = (TPageIter::GetPageNum() != 0) ? MBDB_PAGE_OVERFLOW : TPageIter::GetError();
+ }
+ Index0.clear();
+ int ret1 = TDatFile::Close();
+ return ret ? ret : ret1;
+ }
+
+protected:
+ int TotalRecNum; // should be enough because we have GotoPage(int)
+ std::vector<TGzVal> Index0;
+
+ void NextPage(const TDatPage* page) {
+ TGzVal* rec = (TGzVal*)((char*)page + sizeof(TDatPage));
+ Index0.push_back(TGzVal(TotalRecNum, rec->Key));
+ TotalRecNum += TRecIter::RecNum;
+ }
+
+ static void DispatchCallback(void* This, const TDatPage* page) {
+ ((TMyType*)This)->NextPage(page);
+ }
+};
+
+template <class TVal, class TKey, class TPageFile = TCompressedOutputPageFile>
+class TOutDirectCompressedFileImpl: public TOutDatFileImpl<
+ TVal,
+ TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer>> {
+ typedef TOutDatFileImpl<
+ TVal,
+ TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer>>
+ TDatFile;
+ typedef TOutDirectCompressedFileImpl<TVal, TKey, TPageFile> TMyType;
+ typedef typename TDatFile::TRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+ typedef typename TRecIter::TIndexer TIndexer;
+ typedef TGzKey<TKey> TMyKey;
+ typedef TOutZIndexFile<TKey> TKeyFile;
+
+protected:
+ using TDatFile::Tell;
+
+public:
+ TOutDirectCompressedFileImpl() {
+ TIndexer::SetCallback(this, DispatchCallback);
+ }
+
+ int Open(const char* fname, size_t pagesize, size_t ipagesize = 0) {
+ char iname[FILENAME_MAX];
+ int ret;
+ if (ipagesize == 0)
+ ipagesize = pagesize;
+
+ ret = TDatFile::Open(fname, pagesize, 1, 1);
+ ret = ret ? ret : DatNameToIdx(iname, fname);
+ ret = ret ? ret : KeyFile.Open(iname, ipagesize, 1, 1);
+ if (ret)
+ TDatFile::Close();
+ return ret;
+ }
+
+ int Close() {
+ if (TRecIter::RecNum)
+ NextPage(TPageIter::Current());
+ int ret = KeyFile.Close();
+ int ret1 = TDatFile::Close();
+ return ret1 ? ret1 : ret;
+ }
+
+ int GetError() const {
+ return TDatFile::GetError() ? TDatFile::GetError() : KeyFile.GetError();
+ }
+
+protected:
+ TKeyFile KeyFile;
+
+ void NextPage(const TDatPage* page) {
+ size_t sz = SizeOf((TMyKey*)NULL);
+ TMyKey* rec = KeyFile.Reserve(sz ? sz : MaxSizeOf<TMyKey>());
+ if (rec) {
+ rec->Offset = Tell();
+ rec->Key = *(TVal*)((char*)page + sizeof(TDatPage));
+ KeyFile.ResetDat();
+ }
+ }
+
+ static void DispatchCallback(void* This, const TDatPage* page) {
+ ((TMyType*)This)->NextPage(page);
+ }
+};
+
+template <class TKey>
+int TCompressedIndexedInputPageFile<TKey>::GotoPage(int pageno) {
+ if (Error)
+ return Error;
+
+ Eof = 0;
+
+ i64 offset = KeyFile.FindPage(pageno);
+ if (!offset)
+ return Error = MBDB_BAD_FILE_SIZE;
+
+ if (offset != FileManip.RealSeek(offset, SEEK_SET))
+ Error = MBDB_BAD_FILE_SIZE;
+
+ return Error;
+}
+
+template <typename TVal>
+class TCompressedInDatFile: public TInDatFile<TVal, TCompressedInputPageFile> {
+public:
+ TCompressedInDatFile(const char* name, size_t pages, int pagesOrBytes = 1)
+ : TInDatFile<TVal, TCompressedInputPageFile>(name, pages, pagesOrBytes)
+ {
+ }
+};
+
+template <typename TVal>
+class TCompressedOutDatFile: public TOutDatFile<TVal, TFakeCompression, TCompressedOutputPageFile> {
+public:
+ TCompressedOutDatFile(const char* name, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : TOutDatFile<TVal, TFakeCompression, TCompressedOutputPageFile>(name, pagesize, pages, pagesOrBytes)
+ {
+ }
+};
+
+template <typename TVal, typename TKey, typename TPageFile = TCompressedOutputPageFile>
+class TOutDirectCompressedFile: protected TOutDirectCompressedFileImpl<TVal, TKey, TPageFile> {
+ typedef TOutDirectCompressedFileImpl<TVal, TKey, TPageFile> TBase;
+
+public:
+ TOutDirectCompressedFile(const char* name, size_t pagesize, size_t ipagesize = 0)
+ : Name(strdup(name))
+ , PageSize(pagesize)
+ , IdxPageSize(ipagesize)
+ {
+ }
+
+ ~TOutDirectCompressedFile() {
+ Close();
+ free(Name);
+ Name = NULL;
+ }
+
+ void Open(const char* fname) {
+ int ret = TBase::Open(fname, PageSize, IdxPageSize);
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, "Failed to open output file", fname);
+ free(Name);
+ Name = strdup(fname);
+ }
+
+ void Close() {
+ int ret;
+ if ((ret = TBase::GetError()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error before closing output file", Name);
+ if ((ret = TBase::Close()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error while closing output file", Name);
+ }
+
+ const char* GetName() const {
+ return Name;
+ }
+
+ using TBase::Freeze;
+ using TBase::Push;
+ using TBase::Reserve;
+ using TBase::Unfreeze;
+
+protected:
+ char* Name;
+ size_t PageSize, IdxPageSize;
+};
+
+class TCompressedInterFileTypes {
+public:
+ typedef TCompressedBufferedOutputPageFile TOutPageFile;
+ typedef TCompressedBufferedInputPageFile TInPageFile;
+};
diff --git a/library/cpp/microbdb/extinfo.h b/library/cpp/microbdb/extinfo.h
new file mode 100644
index 0000000000..c8389e783c
--- /dev/null
+++ b/library/cpp/microbdb/extinfo.h
@@ -0,0 +1,127 @@
+#pragma once
+
+#include "header.h"
+
+#include <library/cpp/packedtypes/longs.h>
+
+#include <util/generic/typetraits.h>
+
+#include <library/cpp/microbdb/noextinfo.pb.h>
+
+inline bool operator<(const TNoExtInfo&, const TNoExtInfo&) {
+ return false;
+}
+
+namespace NMicroBDB {
+ Y_HAS_MEMBER(TExtInfo);
+
+ template <class, bool>
+ struct TSelectExtInfo;
+
+ template <class T>
+ struct TSelectExtInfo<T, false> {
+ typedef TNoExtInfo TExtInfo;
+ };
+
+ template <class T>
+ struct TSelectExtInfo<T, true> {
+ typedef typename T::TExtInfo TExtInfo;
+ };
+
+ template <class T>
+ class TExtInfoType {
+ public:
+ static const bool Exists = THasTExtInfo<T>::value;
+ typedef typename TSelectExtInfo<T, Exists>::TExtInfo TResult;
+ };
+
+ Y_HAS_MEMBER(MakeExtKey);
+
+ template <class, class, bool>
+ struct TSelectMakeExtKey;
+
+ template <class TVal, class TKey>
+ struct TSelectMakeExtKey<TVal, TKey, false> {
+ static inline void Make(TKey* to, typename TExtInfoType<TKey>::TResult*, const TVal* from, const typename TExtInfoType<TVal>::TResult*) {
+ *to = *from;
+ }
+ };
+
+ template <class TVal, class TKey>
+ struct TSelectMakeExtKey<TVal, TKey, true> {
+ static inline void Make(TKey* to, typename TExtInfoType<TKey>::TResult* toExt, const TVal* from, const typename TExtInfoType<TVal>::TResult* fromExt) {
+ TVal::MakeExtKey(to, toExt, from, fromExt);
+ }
+ };
+
+ template <typename T>
+ inline size_t SizeOfExt(const T* rec, size_t* /*out*/ extLenSize = nullptr, size_t* /*out*/ extSize = nullptr) {
+ if (!TExtInfoType<T>::Exists) {
+ if (extLenSize)
+ *extLenSize = 0;
+ if (extSize)
+ *extSize = 0;
+ return SizeOf(rec);
+ } else {
+ size_t sz = SizeOf(rec);
+ i64 l;
+ int els = in_long(l, (const char*)rec + sz);
+ if (extLenSize)
+ *extLenSize = static_cast<size_t>(els);
+ if (extSize)
+ *extSize = static_cast<size_t>(l);
+ return sz;
+ }
+ }
+
+ template <class T>
+ bool GetExtInfo(const T* rec, typename TExtInfoType<T>::TResult* extInfo) {
+ Y_VERIFY(TExtInfoType<T>::Exists, "GetExtInfo should only be used with extended records");
+ if (!rec)
+ return false;
+ size_t els;
+ size_t es;
+ size_t s = SizeOfExt(rec, &els, &es);
+ const ui8* raw = (const ui8*)rec + s + els;
+ return extInfo->ParseFromArray(raw, es);
+ }
+
+ template <class T>
+ const ui8* GetExtInfoRaw(const T* rec, size_t* len) {
+ Y_VERIFY(TExtInfoType<T>::Exists, "GetExtInfo should only be used with extended records");
+ if (!rec) {
+ *len = 0;
+ return nullptr;
+ }
+ size_t els;
+ size_t es;
+ size_t s = SizeOfExt(rec, &els, &es);
+ *len = els + es;
+ return (const ui8*)rec + s;
+ }
+
+ // Compares serialized extInfo (e.g. for stable sort)
+ template <class T>
+ int CompareExtInfo(const T* a, const T* b) {
+ Y_VERIFY(TExtInfoType<T>::Exists, "CompareExtInfo should only be used with extended records");
+ size_t elsA, esA;
+ size_t elsB, esB;
+        const size_t sA = SizeOfExt(a, &elsA, &esA);
+        const size_t sB = SizeOfExt(b, &elsB, &esB);
+        if (esA != esB)
+            return esA < esB ? -1 : 1;
+        else
+            return memcmp((const ui8*)a + sA + elsA, (const ui8*)b + sB + elsB, esA);
+ }
+
+}
+
+using NMicroBDB::TExtInfoType;
+
+template <class TVal, class TKey>
+struct TMakeExtKey {
+ static const bool Exists = NMicroBDB::THasMakeExtKey<TVal>::value;
+ static inline void Make(TKey* to, typename TExtInfoType<TKey>::TResult* toExt, const TVal* from, const typename TExtInfoType<TVal>::TResult* fromExt) {
+ NMicroBDB::TSelectMakeExtKey<TVal, TKey, Exists>::Make(to, toExt, from, fromExt);
+ }
+};
diff --git a/library/cpp/microbdb/file.cpp b/library/cpp/microbdb/file.cpp
new file mode 100644
index 0000000000..599a7301a0
--- /dev/null
+++ b/library/cpp/microbdb/file.cpp
@@ -0,0 +1,220 @@
+#include "file.h"
+
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#ifdef _win32_
+#define S_ISREG(x) !!(x & S_IFREG)
+#endif
+
+TFileManipBase::TFileManipBase()
+ : FileBased(true)
+{
+}
+
+i64 TFileManipBase::DoSeek(i64 offset, int whence, bool isStreamOpen) {
+ if (!isStreamOpen)
+ return -1;
+ VerifyRandomAccess();
+ return File.Seek(offset, (SeekDir)whence);
+}
+
+int TFileManipBase::DoFileOpen(const TFile& file) {
+ File = file;
+ SetFileBased(IsFileBased());
+ return (File.IsOpen()) ? 0 : MBDB_OPEN_ERROR;
+}
+
+int TFileManipBase::DoFileClose() {
+ if (File.IsOpen()) {
+ File.Close();
+ return MBDB_ALREADY_INITIALIZED;
+ }
+ return 0;
+}
+
+int TFileManipBase::IsFileBased() const {
+ bool fileBased = true;
+#if defined(_win_)
+#elif defined(_unix_)
+ FHANDLE h = File.GetHandle();
+ struct stat sb;
+ fileBased = false;
+ if (h != INVALID_FHANDLE && !::fstat(h, &sb) && S_ISREG(sb.st_mode)) {
+ fileBased = true;
+ }
+#else
+#error
+#endif
+ return fileBased;
+}
+
+TInputFileManip::TInputFileManip()
+ : InputStream(nullptr)
+{
+}
+
+int TInputFileManip::Open(const char* fname, bool direct) {
+ int ret;
+ return (ret = DoClose()) ? ret : DoStreamOpen(TFile(fname, RdOnly | (direct ? DirectAligned : EOpenMode())));
+}
+
+int TInputFileManip::Open(IInputStream& input) {
+ int ret;
+ return (ret = DoClose()) ? ret : DoStreamOpen(&input);
+}
+
+int TInputFileManip::Open(TAutoPtr<IInputStream> input) {
+ int ret;
+ return (ret = DoClose()) ? ret : DoStreamOpen(input.Release());
+}
+
+int TInputFileManip::Init(const TFile& file) {
+ int ret;
+    if ((ret = DoClose()))
+ return ret;
+ DoStreamOpen(file);
+ return 0;
+}
+
+int TInputFileManip::Close() {
+ DoClose();
+ return 0;
+}
+
+ssize_t TInputFileManip::Read(void* buf, unsigned len) {
+ if (!IsStreamOpen())
+ return -1;
+ return InputStream->Load(buf, len);
+}
+
+IInputStream* TInputFileManip::CreateStream(const TFile& file) {
+ return new TUnbufferedFileInput(file);
+}
+
+TMappedInputPageFile::TMappedInputPageFile()
+ : Pagesize(0)
+ , Error(0)
+ , Pagenum(0)
+ , Recordsig(0)
+ , Open(false)
+{
+ Term();
+}
+
+TMappedInputPageFile::~TMappedInputPageFile() {
+ Term();
+}
+
+int TMappedInputPageFile::Init(const char* fname, ui32 recsig, ui32* gotRecordSig, bool) {
+ Mappedfile.init(fname);
+ Open = true;
+
+ TDatMetaPage* meta = (TDatMetaPage*)Mappedfile.getData();
+ if (gotRecordSig)
+ *gotRecordSig = meta->RecordSig;
+
+ if (meta->MetaSig != METASIG)
+ Error = MBDB_BAD_METAPAGE;
+ else if (meta->RecordSig != recsig)
+ Error = MBDB_BAD_RECORDSIG;
+
+ if (Error) {
+ Mappedfile.term();
+ return Error;
+ }
+
+ size_t fsize = Mappedfile.getSize();
+ if (fsize < METASIZE)
+ return Error = MBDB_BAD_FILE_SIZE;
+ fsize -= METASIZE;
+ if (fsize % meta->PageSize)
+ return Error = MBDB_BAD_FILE_SIZE;
+ Pagenum = (int)(fsize / meta->PageSize);
+ Pagesize = meta->PageSize;
+ Recordsig = meta->RecordSig;
+ Error = 0;
+ return Error;
+}
+
+int TMappedInputPageFile::Term() {
+ Mappedfile.term();
+ Open = false;
+ return 0;
+}
+
+TOutputFileManip::TOutputFileManip()
+ : OutputStream(nullptr)
+{
+}
+
+int TOutputFileManip::Open(const char* fname, EOpenMode mode) {
+ if (IsStreamOpen()) {
+        return MBDB_ALREADY_INITIALIZED; // should it be closed here, as TInputFileManip does?
+ }
+
+ try {
+ if (unlink(fname) && errno != ENOENT) {
+ if (strncmp(fname, "/dev/std", 8))
+ return MBDB_OPEN_ERROR;
+ }
+ TFile file(fname, mode);
+ DoStreamOpen(file);
+ } catch (const TFileError&) {
+ return MBDB_OPEN_ERROR;
+ }
+ return 0;
+}
+
+int TOutputFileManip::Open(IOutputStream& output) {
+ if (IsStreamOpen())
+ return MBDB_ALREADY_INITIALIZED;
+ DoStreamOpen(&output);
+ return 0;
+}
+
+int TOutputFileManip::Open(TAutoPtr<IOutputStream> output) {
+ if (IsStreamOpen())
+ return MBDB_ALREADY_INITIALIZED;
+ DoStreamOpen(output.Release());
+ return 0;
+}
+
+int TOutputFileManip::Init(const TFile& file) {
+ if (IsStreamOpen())
+        return MBDB_ALREADY_INITIALIZED; // should it be closed here, as TInputFileManip does?
+ DoStreamOpen(file);
+ return 0;
+}
+
+int TOutputFileManip::Rotate(const char* newfname) {
+ if (!IsStreamOpen()) {
+ return MBDB_NOT_INITIALIZED;
+ }
+
+ try {
+ TFile file(newfname, WrOnly | OpenAlways | TruncExisting | ARW | AWOther);
+ DoClose();
+ DoStreamOpen(file);
+ } catch (const TFileError&) {
+ return MBDB_OPEN_ERROR;
+ }
+ return 0;
+}
+
+int TOutputFileManip::Close() {
+ DoClose();
+ return 0;
+}
+
+int TOutputFileManip::Write(const void* buf, unsigned len) {
+ if (!IsStreamOpen())
+ return -1;
+ OutputStream->Write(buf, len);
+ return len;
+}
+
+IOutputStream* TOutputFileManip::CreateStream(const TFile& file) {
+ return new TUnbufferedFileOutput(file);
+}
diff --git a/library/cpp/microbdb/file.h b/library/cpp/microbdb/file.h
new file mode 100644
index 0000000000..f7c7818375
--- /dev/null
+++ b/library/cpp/microbdb/file.h
@@ -0,0 +1,225 @@
+#pragma once
+
+#include "header.h"
+
+#include <library/cpp/deprecated/mapped_file/mapped_file.h>
+
+#include <util/generic/noncopyable.h>
+#include <util/stream/file.h>
+#include <util/system/filemap.h>
+
+#define FS_BLOCK_SIZE 512
+
+class TFileManipBase {
+protected:
+ TFileManipBase();
+
+ virtual ~TFileManipBase() {
+ }
+
+ i64 DoSeek(i64 offset, int whence, bool isStreamOpen);
+
+ int DoFileOpen(const TFile& file);
+
+ int DoFileClose();
+
+ int IsFileBased() const;
+
+ inline void SetFileBased(bool fileBased) {
+ FileBased = fileBased;
+ }
+
+ inline i64 DoGetPosition() const {
+ Y_ASSERT(FileBased);
+ return File.GetPosition();
+ }
+
+ inline i64 DoGetLength() const {
+ return (FileBased) ? File.GetLength() : -1;
+ }
+
+ inline void VerifyRandomAccess() const {
+ Y_VERIFY(FileBased, "non-file stream can not be accessed randomly");
+ }
+
+ inline i64 GetPosition() const {
+ return (i64)File.GetPosition();
+ }
+
+private:
+ TFile File;
+ bool FileBased;
+};
+
+class TInputFileManip: public TFileManipBase {
+public:
+ using TFileManipBase::GetPosition;
+
+ TInputFileManip();
+
+ int Open(const char* fname, bool direct = false);
+
+ int Open(IInputStream& input);
+
+ int Open(TAutoPtr<IInputStream> input);
+
+ int Init(const TFile& file);
+
+ int Close();
+
+ ssize_t Read(void* buf, unsigned len);
+
+ inline bool IsOpen() const {
+ return IsStreamOpen();
+ }
+
+ inline i64 GetLength() const {
+ return DoGetLength();
+ }
+
+ inline i64 Seek(i64 offset, int whence) {
+ return DoSeek(offset, whence, IsStreamOpen());
+ }
+
+ inline i64 RealSeek(i64 offset, int whence) {
+ return Seek(offset, whence);
+ }
+
+protected:
+ inline bool IsStreamOpen() const {
+ return !!InputStream;
+ }
+
+ inline int DoStreamOpen(IInputStream* input, bool fileBased = false) {
+ InputStream.Reset(input);
+ SetFileBased(fileBased);
+ return 0;
+ }
+
+ inline int DoStreamOpen(const TFile& file) {
+ int ret;
+ return (ret = DoFileOpen(file)) ? ret : DoStreamOpen(CreateStream(file), IsFileBased());
+ }
+
+ virtual IInputStream* CreateStream(const TFile& file);
+
+ inline bool DoClose() {
+ if (IsStreamOpen()) {
+ InputStream.Destroy();
+ return DoFileClose();
+ }
+ return 0;
+ }
+
+ THolder<IInputStream> InputStream;
+};
+
+class TMappedInputPageFile: private TNonCopyable {
+public:
+ TMappedInputPageFile();
+
+ ~TMappedInputPageFile();
+
+ inline int GetError() const {
+ return Error;
+ }
+
+ inline size_t GetPageSize() const {
+ return Pagesize;
+ }
+
+ inline int GetLastPage() const {
+ return Pagenum;
+ }
+
+ inline ui32 GetRecordSig() const {
+ return Recordsig;
+ }
+
+ inline bool IsOpen() const {
+ return Open;
+ }
+
+ inline char* GetData() const {
+ return Open ? (char*)Mappedfile.getData() : nullptr;
+ }
+
+ inline size_t GetSize() const {
+ return Open ? Mappedfile.getSize() : 0;
+ }
+
+protected:
+ int Init(const char* fname, ui32 recsig, ui32* gotRecordSig = nullptr, bool direct = false);
+
+ int Term();
+
+ TMappedFile Mappedfile;
+ size_t Pagesize;
+ int Error;
+ int Pagenum;
+ ui32 Recordsig;
+ bool Open;
+};
+
+class TOutputFileManip: public TFileManipBase {
+public:
+ TOutputFileManip();
+
+ int Open(const char* fname, EOpenMode mode = WrOnly | CreateAlways | ARW | AWOther);
+
+ int Open(IOutputStream& output);
+
+ int Open(TAutoPtr<IOutputStream> output);
+
+ int Init(const TFile& file);
+
+ int Rotate(const char* newfname);
+
+ int Write(const void* buf, unsigned len);
+
+ int Close();
+
+ inline bool IsOpen() const {
+ return IsStreamOpen();
+ }
+
+ inline i64 GetLength() const {
+ return DoGetLength();
+ }
+
+ inline i64 Seek(i64 offset, int whence) {
+ return DoSeek(offset, whence, IsStreamOpen());
+ }
+
+ inline i64 RealSeek(i64 offset, int whence) {
+ return Seek(offset, whence);
+ }
+
+protected:
+ inline bool IsStreamOpen() const {
+ return !!OutputStream;
+ }
+
+ inline int DoStreamOpen(IOutputStream* output, bool fileBased = false) {
+ OutputStream.Reset(output);
+ SetFileBased(fileBased);
+ return 0;
+ }
+
+ inline int DoStreamOpen(const TFile& file) {
+ int ret;
+ return (ret = DoFileOpen(file)) ? ret : DoStreamOpen(CreateStream(file), true);
+ }
+
+ virtual IOutputStream* CreateStream(const TFile& file);
+
+ inline bool DoClose() {
+ if (IsStreamOpen()) {
+ OutputStream.Destroy();
+ return DoFileClose();
+ }
+ return 0;
+ }
+
+ THolder<IOutputStream> OutputStream;
+};
diff --git a/library/cpp/microbdb/hashes.h b/library/cpp/microbdb/hashes.h
new file mode 100644
index 0000000000..bfd113c3ba
--- /dev/null
+++ b/library/cpp/microbdb/hashes.h
@@ -0,0 +1,250 @@
+#pragma once
+
+#include <library/cpp/on_disk/st_hash/static_hash.h>
+#include <util/system/sysstat.h>
+#include <util/stream/mem.h>
+#include <util/string/printf.h>
+#include <library/cpp/deprecated/fgood/fgood.h>
+
+#include "safeopen.h"
+
+/** This file currently implements creation of mappable read-only hash file.
+ Basic usage of these "static hashes" is defined in util/static_hash.h (see docs there).
+ Additional useful wrappers are available in util/static_hash_map.h
+
+ There are two ways to create mappable hash file:
+
+ A) Fill an THashMap/set structure in RAM, then dump it to disk.
+ This is usually done by save_hash_to_file* functions defined in static_hash.h
+ (see description in static_hash.h).
+
+ B) Prepare all data using external sorter, then create hash file straight on disk.
+ This approach is necessary when there isn't enough RAM to hold entire original THashMap.
+ Implemented in this file as TStaticHashBuilder class.
+
+ Current implementation's major drawback is that the size of the hash must be estimated
+ before the hash is built (bucketCount), which is not always possible.
+ Separate implementation with two sort passes is yet to be done.
+
+ Another problem is that maximum stored size of the element (maxRecSize) must also be
+ known in advance, because we use TDatSorterMemo, etc.
+ */
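+
+/* A minimal usage sketch for approach (B) above; the hash type, sizes and file names are
+   illustrative assumptions rather than a tested example:
+
+       using TDict = THashMap<ui32, ui32>;
+       TStaticHashBuilder<TDict, ui32> builder(128 << 20, 64);        // 128 MB sorter memory, records <= 64 bytes
+       builder.Open("dict.sthash", HashBucketCount(estimatedCount));  // see Open()'s comment below on bucketCount
+       for (const auto& kv : items)                                   // items: any range of std::pair<ui32, ui32>
+           builder.Push(kv);
+       builder.Close();                                               // finalizes the mappable hash file
+*/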
+
+template <class SizeType>
+struct TSthashTmpRec {
+ SizeType HashVal;
+ SizeType RecSize;
+ char Buf[1];
+ size_t SizeOf() const {
+ return &Buf[RecSize] - (char*)this;
+ }
+ bool operator<(const TSthashTmpRec& than) const {
+ return HashVal < than.HashVal;
+ }
+ static const ui32 RecordSig = 20100124 + sizeof(SizeType) - 4;
+};
+
+template <typename T>
+struct TReplaceMerger {
+ T operator()(const T& oldRecord, const T& newRecord) const {
+ Y_UNUSED(oldRecord);
+ return newRecord;
+ }
+};
+
+/** TStaticHashBuilder template parameters:
+ HashType - THashMap map/set type for which we construct the corresponding mappable hash;
+ SizeType - type used to store offsets and lengths in the resulting hash;
+ MergerType - type of object used to merge records with equal keys (see TReplaceMerger for an example);
+ */
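+/* Usage sketch (illustrative only; THash, source, estimatedElementCount and the
+   file name are hypothetical and not part of this library):
+
+       using THash = THashMap<ui64, ui32>;
+       TStaticHashBuilder<THash, ui32> builder(512 << 20, 64); // 512 MB sorter memory, records up to 64 bytes
+       builder.Open("data.sthash", HashBucketCount(estimatedElementCount));
+       for (const auto& kv : source) // kv is THash::value_type
+           builder.Push(kv);
+       builder.Finish(); // writes the hash file
+       builder.Close();  // removes the temporary sorter directory, if one was created
+ */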
+
+template <class HashType, class SizeType, class MergerType = TReplaceMerger<typename HashType::mapped_type>>
+struct TStaticHashBuilder {
+ const size_t SrtIOPageSz;
+ const size_t WrBufSz;
+ typedef TSthashTmpRec<SizeType> TIoRec;
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, SizeType> TKeySaver;
+ typedef typename HashType::value_type TValueType;
+ typedef typename HashType::mapped_type TMappedType;
+ typedef typename HashType::key_type TKeyType;
+
+ TDatSorterMemo<TIoRec, TCompareByLess> Srt;
+ TBuffer IoRec, CurrentBlockRecs;
+ TKeySaver KeySaver;
+ typename HashType::hasher Hasher;
+ typename HashType::key_equal Equals;
+ MergerType merger;
+ TString HashFileName;
+ TString OurTmpDir;
+ size_t BucketCount;
+ int FreeBits;
+
+ // memSz is the Sorter buffer size;
+ // maxRecSize is the maximum size (as reported by size_for_st) of our record(s)
+ TStaticHashBuilder(size_t memSz, size_t maxRecSize)
+ : SrtIOPageSz((maxRecSize * 16 + 65535) & ~size_t(65535))
+ , WrBufSz(memSz / 16 >= SrtIOPageSz ? memSz / 16 : SrtIOPageSz)
+ , Srt("unused", memSz, SrtIOPageSz, WrBufSz, 0)
+ , IoRec(sizeof(TIoRec) + maxRecSize)
+ , CurrentBlockRecs(sizeof(TIoRec) + maxRecSize)
+ , BucketCount(0)
+ , FreeBits(0)
+ {
+ }
+
+ ~TStaticHashBuilder() {
+ Close();
+ }
+
+ // if tmpDir is supplied, it must exist;
+ // bucketCount should be HashBucketCount() of the (estimated) element count
+ void Open(const char* fname, size_t bucketCount, const char* tmpDir = nullptr) {
+ if (!tmpDir)
+ tmpDir = ~(OurTmpDir = Sprintf("%s.temp", fname));
+ Mkdir(tmpDir, MODE0775);
+ Srt.Open(tmpDir);
+ HashFileName = fname;
+ BucketCount = bucketCount;
+ int bitCount = 0;
+ while (((size_t)1 << bitCount) <= BucketCount && bitCount < int(8 * sizeof(size_t)))
+ ++bitCount;
+ FreeBits = 8 * sizeof(size_t) - bitCount;
+ }
+
+ void Push(const TValueType& rec) {
+ TIoRec* ioRec = MakeIoRec(rec);
+ Srt.Push(ioRec);
+ }
+ TIoRec* MakeIoRec(const TValueType& rec) {
+ TIoRec* ioRec = (TIoRec*)IoRec.Data();
+ size_t mask = (1 << FreeBits) - 1;
+ size_t hash = Hasher(rec.first);
+ ioRec->HashVal = ((hash % BucketCount) << FreeBits) + ((hash / BucketCount) & mask);
+
+ TMemoryOutput output(ioRec->Buf, IoRec.Capacity() - offsetof(TIoRec, Buf));
+ KeySaver.SaveRecord(&output, rec);
+ ioRec->RecSize = output.Buf() - ioRec->Buf;
+ return ioRec;
+ }
+
+ bool Merge(TVector<std::pair<TKeyType, TMappedType>>& records, size_t newRecordSize) {
+ TSthashIterator<const TKeyType, const TMappedType, typename HashType::hasher,
+ typename HashType::key_equal>
+ newPtr(CurrentBlockRecs.End() - newRecordSize);
+ for (size_t i = 0; i < records.size(); ++i) {
+ if (newPtr.KeyEquals(Equals, records[i].first)) {
+ TMappedType oldValue = records[i].second;
+ TMappedType newValue = newPtr.Value();
+ newValue = merger(oldValue, newValue);
+ records[i].second = newValue;
+ return true;
+ }
+ }
+ records.push_back(std::make_pair(newPtr.Key(), newPtr.Value()));
+ return false;
+ }
+
+ void PutRecord(const char* buf, size_t rec_size, TFILEPtr& f, SizeType& cur_off) {
+ f.fsput(buf, rec_size);
+ cur_off += rec_size;
+ }
+
+ void Finish() {
+ Srt.Sort();
+ // We use variant 1.
+ // Variant 1: read the sorter once, write the records, then fseek back to write the buckets
+ // (this doesn't allow fname to be stdout).
+ // Variant 2: read the sorter (probably a temp file) twice: write the buckets, then the records
+ // (this allows fname to be stdout but seems to take longer).
+ TFILEPtr f(HashFileName, "wb");
+ setvbuf(f, nullptr, _IOFBF, WrBufSz);
+ TVector<SizeType> bucketsBuf(WrBufSz, 0);
+ // prepare header (note: this code must be unified with save_stl.h)
+ typedef sthashtable_nvm_sv<typename HashType::hasher, typename HashType::key_equal, SizeType> sv_type;
+ sv_type sv = {Hasher, Equals, BucketCount, 0, 0};
+ // TODO: maybe use just the size of the corresponding object?
+ SizeType cur_off = sizeof(sv_type) +
+ (sv.num_buckets + 1) * sizeof(SizeType);
+ SizeType bkt_wroff = sizeof(sv_type), bkt_bufpos = 0, prev_bkt = 0, prev_hash = (SizeType)-1;
+ bucketsBuf[bkt_bufpos++] = cur_off;
+ // it might be better to write many zeroes here
+ f.seek(cur_off, SEEK_SET);
+ TVector<std::pair<TKeyType, TMappedType>> currentBlock;
+ bool emptyFile = true;
+ size_t prevRecSize = 0;
+ // seek forward
+ while (true) {
+ const TIoRec* rec = Srt.Next();
+ if (currentBlock.empty() && !emptyFile) {
+ if (rec && prev_hash == rec->HashVal) {
+ Merge(currentBlock, prevRecSize);
+ } else {
+ // if there is only one record with this hash, don't recode it, just write
+ PutRecord(CurrentBlockRecs.Data(), prevRecSize, f, cur_off);
+ sv.num_elements++;
+ }
+ }
+ if (!rec || prev_hash != rec->HashVal) {
+ // flush the merged records accumulated for the previous hash value
+ for (size_t i = 0; i < currentBlock.size(); ++i) {
+ TIoRec* ioRec = MakeIoRec(TValueType(currentBlock[i]));
+ PutRecord(ioRec->Buf, ioRec->RecSize, f, cur_off);
+ }
+ sv.num_elements += currentBlock.size();
+ currentBlock.clear();
+ CurrentBlockRecs.Clear();
+ if (rec) {
+ prev_hash = rec->HashVal;
+ }
+ }
+ // note: prev_bkt's semantics here are 'cur_bkt - 1', so we actually cycle
+ // until cur_bkt == (rec->HashVal >> FreeBits), *inclusive*
+ while (!rec || prev_bkt != (rec->HashVal >> FreeBits)) {
+ bucketsBuf[bkt_bufpos++] = cur_off;
+ if (bkt_bufpos == bucketsBuf.size()) {
+ f.seek(bkt_wroff, SEEK_SET);
+ size_t sz = bkt_bufpos * sizeof(bucketsBuf[0]);
+ if (f.write(bucketsBuf.begin(), 1, sz) != sz)
+ throw yexception() << "could not write " << sz << " bytes to " << ~HashFileName;
+ bkt_wroff += sz;
+ bkt_bufpos = 0;
+ f.seek(cur_off, SEEK_SET);
+ }
+ prev_bkt++;
+ if (!rec) {
+ break;
+ }
+ assert(prev_bkt < BucketCount);
+ }
+ if (!rec) {
+ break;
+ }
+ emptyFile = false;
+ CurrentBlockRecs.Append(rec->Buf, rec->RecSize);
+ if (!currentBlock.empty()) {
+ Merge(currentBlock, rec->RecSize);
+ } else {
+ prevRecSize = rec->RecSize;
+ }
+ }
+ // finish buckets table
+ f.seek(bkt_wroff, SEEK_SET);
+ size_t sz = bkt_bufpos * sizeof(bucketsBuf[0]);
+ if (sz && f.write(bucketsBuf.begin(), 1, sz) != sz)
+ throw yexception() << "could not write " << sz << " bytes to " << ~HashFileName;
+ bkt_wroff += sz;
+ for (; prev_bkt < BucketCount; prev_bkt++)
+ f.fput(cur_off);
+ // finally write header
+ sv.data_end_off = cur_off;
+ f.seek(0, SEEK_SET);
+ f.fput(sv);
+ f.close();
+ }
+
+ void Close() {
+ Srt.Close();
+ if (+OurTmpDir)
+ rmdir(~OurTmpDir);
+ }
+};
diff --git a/library/cpp/microbdb/header.cpp b/library/cpp/microbdb/header.cpp
new file mode 100644
index 0000000000..f4511d6fb6
--- /dev/null
+++ b/library/cpp/microbdb/header.cpp
@@ -0,0 +1,91 @@
+#include "header.h"
+
+#include <util/stream/output.h>
+#include <util/stream/format.h>
+
+TString ToString(EMbdbErrors error) {
+ TString ret;
+ switch (error) {
+ case MBDB_ALREADY_INITIALIZED:
+ ret = "already initialized";
+ break;
+ case MBDB_NOT_INITIALIZED:
+ ret = "not initialized";
+ break;
+ case MBDB_BAD_DESCRIPTOR:
+ ret = "bad descriptor";
+ break;
+ case MBDB_OPEN_ERROR:
+ ret = "open error";
+ break;
+ case MBDB_READ_ERROR:
+ ret = "read error";
+ break;
+ case MBDB_WRITE_ERROR:
+ ret = "write error";
+ break;
+ case MBDB_CLOSE_ERROR:
+ ret = "close error";
+ break;
+ case MBDB_EXPECTED_EOF:
+ ret = "expected eof";
+ break;
+ case MBDB_UNEXPECTED_EOF:
+ ret = "unxepected eof";
+ break;
+ case MBDB_BAD_FILENAME:
+ ret = "bad filename";
+ break;
+ case MBDB_BAD_METAPAGE:
+ ret = "bad metapage";
+ break;
+ case MBDB_BAD_RECORDSIG:
+ ret = "bad recordsig";
+ break;
+ case MBDB_BAD_FILE_SIZE:
+ ret = "bad file size";
+ break;
+ case MBDB_BAD_PAGESIG:
+ ret = "bad pagesig";
+ break;
+ case MBDB_BAD_PAGESIZE:
+ ret = "bad pagesize";
+ break;
+ case MBDB_BAD_PARM:
+ ret = "bad parm";
+ break;
+ case MBDB_BAD_SYNC:
+ ret = "bad sync";
+ break;
+ case MBDB_PAGE_OVERFLOW:
+ ret = "page overflow";
+ break;
+ case MBDB_NO_MEMORY:
+ ret = "no memory";
+ break;
+ case MBDB_MEMORY_LEAK:
+ ret = "memory leak";
+ break;
+ case MBDB_NOT_SUPPORTED:
+ ret = "not supported";
+ break;
+ default:
+ ret = "unknown";
+ break;
+ }
+ return ret;
+}
+
+TString ErrorMessage(int error, const TString& text, const TString& path, ui32 recordSig, ui32 gotRecordSig) {
+ TStringStream str;
+ str << text;
+ if (path.size())
+ str << " '" << path << "'";
+ str << ": " << ToString(static_cast<EMbdbErrors>(error));
+ if (recordSig && (!gotRecordSig || recordSig != gotRecordSig))
+ str << ". Expected RecordSig: " << Hex(recordSig, HF_ADDX);
+ if (recordSig && gotRecordSig && recordSig != gotRecordSig)
+ str << ", got: " << Hex(gotRecordSig, HF_ADDX);
+ str << ". Last system error text: " << LastSystemErrorText();
+ return str.Str();
+}
diff --git a/library/cpp/microbdb/header.h b/library/cpp/microbdb/header.h
new file mode 100644
index 0000000000..0951d610ea
--- /dev/null
+++ b/library/cpp/microbdb/header.h
@@ -0,0 +1,159 @@
+#pragma once
+
+#include <util/system/defaults.h>
+#include <util/generic/typetraits.h>
+#include <util/generic/string.h>
+#include <util/str_stl.h>
+
+#include <stdio.h>
+
+#define METASIZE (1u << 12)
+#define METASIG 0x12345678u
+#define PAGESIG 0x87654321u
+
+enum EMbdbErrors {
+ MBDB_ALREADY_INITIALIZED = 200,
+ MBDB_NOT_INITIALIZED = 201,
+ MBDB_BAD_DESCRIPTOR = 202,
+ MBDB_OPEN_ERROR = 203,
+ MBDB_READ_ERROR = 204,
+ MBDB_WRITE_ERROR = 205,
+ MBDB_CLOSE_ERROR = 206,
+ MBDB_EXPECTED_EOF = 207,
+ MBDB_UNEXPECTED_EOF = 208,
+ MBDB_BAD_FILENAME = 209,
+ MBDB_BAD_METAPAGE = 210,
+ MBDB_BAD_RECORDSIG = 211,
+ MBDB_BAD_FILE_SIZE = 212,
+ MBDB_BAD_PAGESIG = 213,
+ MBDB_BAD_PAGESIZE = 214,
+ MBDB_BAD_PARM = 215,
+ MBDB_BAD_SYNC = 216,
+ MBDB_PAGE_OVERFLOW = 217,
+ MBDB_NO_MEMORY = 218,
+ MBDB_MEMORY_LEAK = 219,
+ MBDB_NOT_SUPPORTED = 220
+};
+
+TString ToString(EMbdbErrors error);
+TString ErrorMessage(int error, const TString& text, const TString& path = TString(), ui32 recordSig = 0, ui32 gotRecordSig = 0);
+
+enum EPageFormat {
+ MBDB_FORMAT_RAW = 0,
+ MBDB_FORMAT_COMPRESSED = 1,
+ MBDB_FORMAT_NULL = 255
+};
+
+enum ECompressionAlgorithm {
+ MBDB_COMPRESSION_ZLIB = 1,
+ MBDB_COMPRESSION_FASTLZ = 2,
+ MBDB_COMPRESSION_SNAPPY = 3
+};
+
+struct TDatMetaPage {
+ ui32 MetaSig;
+ ui32 RecordSig;
+ ui32 PageSize;
+};
+
+struct TDatPage {
+ ui32 RecNum; //!< number of records on this page
+ ui32 PageSig;
+ ui32 Format : 2; //!< one of EPageFormat
+ ui32 Reserved : 30;
+};
+
+/// Additional page header with compression info
+struct TCompressedPage {
+ ui32 BlockCount;
+ ui32 Algorithm : 4;
+ ui32 Version : 4;
+ ui32 Reserved : 24;
+};
+
+namespace NMicroBDB {
+ /// Header of compressed block
+ struct TCompressedHeader {
+ ui32 Compressed;
+ ui32 Original; /// original size of block
+ ui32 Count; /// number of records in block
+ ui32 Reserved;
+ };
+
+ Y_HAS_MEMBER(AssertValid);
+
+ template <typename T, bool TVal>
+ struct TAssertValid {
+ void operator()(const T*) {
+ }
+ };
+
+ template <typename T>
+ struct TAssertValid<T, true> {
+ void operator()(const T* rec) {
+ return rec->AssertValid();
+ }
+ };
+
+ template <typename T>
+ void AssertValid(const T* rec) {
+ return NMicroBDB::TAssertValid<T, NMicroBDB::THasAssertValid<T>::value>()(rec);
+ }
+
+ Y_HAS_MEMBER(SizeOf);
+
+ template <typename T, bool TVal>
+ struct TGetSizeOf;
+
+ template <typename T>
+ struct TGetSizeOf<T, true> {
+ size_t operator()(const T* rec) {
+ return rec->SizeOf();
+ }
+ };
+
+ template <typename T>
+ struct TGetSizeOf<T, false> {
+ size_t operator()(const T*) {
+ return sizeof(T);
+ }
+ };
+
+ inline char* GetFirstRecord(const TDatPage* page) {
+ switch (page->Format) {
+ case MBDB_FORMAT_RAW:
+ return (char*)page + sizeof(TDatPage);
+ case MBDB_FORMAT_COMPRESSED:
+ // The first record of a compressed page is stored uncompressed,
+ // immediately after all the headers.
+ // See TOutputRecordIterator::FlushBuffer for the layout used when writing.
+ return (char*)page + sizeof(TDatPage) + sizeof(TCompressedPage) + sizeof(NMicroBDB::TCompressedHeader);
+ }
+ return (char*)nullptr;
+ }
+}
+
+template <typename T>
+size_t SizeOf(const T* rec) {
+ return NMicroBDB::TGetSizeOf<T, NMicroBDB::THasSizeOf<T>::value>()(rec);
+}
+
+template <typename T>
+size_t MaxSizeOf() {
+ return sizeof(T);
+}
+
+static inline int DatNameToIdx(char iname[/*FILENAME_MAX*/], const char* dname) {
+ if (!dname || !*dname)
+ return MBDB_BAD_FILENAME;
+ const char* ptr;
+ if (!(ptr = strrchr(dname, '/')))
+ ptr = dname;
+ if (!(ptr = strrchr(ptr, '.')))
+ ptr = strchr(dname, 0);
+ if (ptr - dname > FILENAME_MAX - 5)
+ return MBDB_BAD_FILENAME;
+ memcpy(iname, dname, ptr - dname);
+ strcpy(iname + (ptr - dname), ".idx");
+ return 0;
+}
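+// Example (illustrative): for dname == "path/to/file.dat" this fills iname with
+// "path/to/file.idx"; if dname has no extension, ".idx" is simply appended.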
diff --git a/library/cpp/microbdb/heap.h b/library/cpp/microbdb/heap.h
new file mode 100644
index 0000000000..ef5a53534c
--- /dev/null
+++ b/library/cpp/microbdb/heap.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "header.h"
+#include "extinfo.h"
+
+#include <util/generic/vector.h>
+
+#include <errno.h>
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Default comparator
+template <class TVal>
+struct TCompareByLess {
+ inline bool operator()(const TVal* a, const TVal* b) const {
+ return TLess<TVal>()(*a, *b);
+ }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <class TVal, class TIterator, class TCompare = TCompareByLess<TVal>>
+class THeapIter {
+public:
+ int Init(TIterator** iters, int count) {
+ Term();
+ if (!count)
+ return 0;
+ if (!(Heap = (TIterator**)malloc(count * sizeof(TIterator*))))
+ return ENOMEM;
+
+ Count = count;
+ count = 0;
+ while (count < Count)
+ if (count && !(*iters)->Next()) { //the first TIterator is NOT initialized here!
+ Count--;
+ iters++;
+ } else {
+ Heap[count++] = *iters++;
+ }
+ count = Count / 2;
+ while (--count > 0) //Heap[0] is not changed!
+ Sift(count, Count); //do not try to replace this code with make_heap
+ return 0;
+ }
+
+ int Init(TIterator* iters, int count) {
+ TVector<TIterator*> a(count);
+ for (int i = 0; i < count; ++i)
+ a[i] = &iters[i];
+ return Init(&a[0], count);
+ }
+
+ THeapIter()
+ : Heap(nullptr)
+ , Count(0)
+ {
+ }
+
+ THeapIter(TIterator* a, TIterator* b)
+ : Heap(nullptr)
+ , Count(0)
+ {
+ TIterator* arr[] = {a, b};
+ if (Init(arr, 2))
+ ythrow yexception() << "can't Init THeapIter";
+ }
+
+ THeapIter(TVector<TIterator>& v)
+ : Heap(nullptr)
+ , Count(0)
+ {
+ if (Init(&v[0], v.size())) {
+ ythrow yexception() << "can't Init THeapIter";
+ }
+ }
+
+ ~THeapIter() {
+ Term();
+ }
+
+ inline const TVal* Current() const {
+ if (!Count)
+ return nullptr;
+ return (*Heap)->Current();
+ }
+
+ inline const TIterator* CurrentIter() const {
+ return *Heap;
+ }
+
+ // at the end of the last file this effectively does Heap[0] = Heap[0] and
+ // returns Current() of an exhausted iterator, so Current() at EOF MUST return NULL;
+ // this is possibly a bug and may need fixing
+ const TVal* Next() {
+ if (!Count)
+ return nullptr;
+ if (!(*Heap)->Next()) //on the first call the uninitialized first TIterator
+ *Heap = Heap[--Count]; //will be correctly initialized
+
+ if (Count == 2) {
+ if (TCompare()(Heap[1]->Current(), Heap[0]->Current()))
+ DoSwap(Heap[1], Heap[0]);
+ } else
+ Sift(0, Count);
+
+ return Current();
+ }
+
+ inline bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const {
+ return (*Heap)->GetExtInfo(extInfo);
+ }
+
+ inline const ui8* GetExtInfoRaw(size_t* len) const {
+ return (*Heap)->GetExtInfoRaw(len);
+ }
+
+ void Term() {
+ ::free(Heap);
+ Heap = nullptr;
+ Count = 0;
+ }
+
+protected:
+ void Sift(int node, int end) {
+ TIterator* x = Heap[node];
+ int son;
+ for (son = 2 * node + 1; son < end; node = son, son = 2 * node + 1) {
+ if (son < (end - 1) && TCompare()(Heap[son + 1]->Current(), Heap[son]->Current()))
+ son++;
+ if (TCompare()(Heap[son]->Current(), x->Current()))
+ Heap[node] = Heap[son];
+ else
+ break;
+ }
+ Heap[node] = x;
+ }
+
+ TIterator** Heap;
+ int Count;
+};
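+
+/* Usage sketch (illustrative only; TRec and the iterators a and b are hypothetical,
+   e.g. two open TInDatFileImpl<TRec> readers): THeapIter merges several sorted
+   iterators into one sorted stream; an exhausted iterator must return nullptr
+   from Current().
+
+       THeapIter<TRec, TInDatFileImpl<TRec>> heap(&a, &b);
+       for (const TRec* rec = heap.Next(); rec; rec = heap.Next()) {
+           // records arrive merged in TCompareByLess<TRec> order
+       }
+ */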
+
+///////////////////////////////////////////////////////////////////////////////
diff --git a/library/cpp/microbdb/input.h b/library/cpp/microbdb/input.h
new file mode 100644
index 0000000000..a214ba6e8a
--- /dev/null
+++ b/library/cpp/microbdb/input.h
@@ -0,0 +1,1027 @@
+#pragma once
+
+#include "header.h"
+#include "file.h"
+#include "reader.h"
+
+#include <util/system/maxlen.h>
+#include <util/system/event.h>
+#include <util/system/thread.h>
+
+#include <thread>
+
+#include <sys/uio.h>
+
+#include <errno.h>
+
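+// Emulates readv(2) on top of a TFileManip: reads into each iovec buffer in turn
+// and returns the total number of bytes read, or -1 as soon as a read fails.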
+template <class TFileManip>
+inline ssize_t Readv(TFileManip& fileManip, const struct iovec* iov, int iovcnt) {
+ ssize_t read_count = 0;
+ for (int n = 0; n < iovcnt; n++) {
+ ssize_t last_read = fileManip.Read(iov[n].iov_base, iov[n].iov_len);
+ if (last_read < 0)
+ return -1;
+ read_count += last_read;
+ }
+ return read_count;
+}
+
+template <class TVal, typename TBasePageIter>
+class TInputRecordIterator: public TBasePageIter {
+ typedef THolder<NMicroBDB::IBasePageReader<TVal>> TReaderHolder;
+
+public:
+ typedef TBasePageIter TPageIter;
+
+ TInputRecordIterator() {
+ Init();
+ }
+
+ ~TInputRecordIterator() {
+ Term();
+ }
+
+ const TVal* Current() const {
+ return Rec;
+ }
+
+ bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const {
+ if (!Rec)
+ return false;
+ return Reader->GetExtInfo(extInfo);
+ }
+
+ const ui8* GetExtInfoRaw(size_t* len) const {
+ if (!Rec)
+ return nullptr;
+ return Reader->GetExtInfoRaw(len);
+ }
+
+ size_t GetRecSize() const {
+ return Reader->GetRecSize();
+ }
+
+ size_t GetExtSize() const {
+ return Reader->GetExtSize();
+ }
+
+ const TVal* Next() {
+ if (RecNum)
+ --RecNum;
+ else {
+ TDatPage* page = TPageIter::Next();
+ if (!page) {
+ if (TPageIter::IsFrozen() && Reader.Get())
+ Reader->SetClearFlag();
+ return Rec = nullptr;
+ } else if (!!SelectReader())
+ return Rec = nullptr;
+ RecNum = TPageIter::Current()->RecNum - 1;
+ }
+ return Rec = Reader->Next();
+ }
+
+ // Skip(0) == Current(); Skip(1) == Next()
+ const TVal* Skip(int& num) {
+ // Y_ASSERT(num >= 0)? Otherwise this gets into an infinite loop
+ while (num > RecNum) {
+ num -= RecNum + 1;
+ if (!TPageIter::Next() || !!SelectReader()) {
+ RecNum = 0;
+ return Rec = nullptr;
+ }
+ RecNum = TPageIter::Current()->RecNum - 1;
+ Rec = Reader->Next();
+ }
+ ++num;
+ while (--num)
+ Next();
+ return Rec;
+ }
+
+ // begin reading from next page
+ void Reset() {
+ Rec = NULL;
+ RecNum = 0;
+ if (Reader.Get())
+ Reader->Reset();
+ }
+
+protected:
+ int Init() {
+ Rec = nullptr;
+ RecNum = 0;
+ Format = MBDB_FORMAT_NULL;
+ return 0;
+ }
+
+ int Term() {
+ Reader.Reset(nullptr);
+ Format = MBDB_FORMAT_NULL;
+ Rec = nullptr;
+ RecNum = 0;
+ return 0;
+ }
+
+ const TVal* GotoPage(int pageno) {
+ if (!TPageIter::GotoPage(pageno) || !!SelectReader())
+ return Rec = nullptr;
+ RecNum = TPageIter::Current()->RecNum - 1;
+ return Rec = Reader->Next();
+ }
+
+ int SelectReader() {
+ if (!TPageIter::Current())
+ return MBDB_UNEXPECTED_EOF;
+ if (ui32(Format) != TPageIter::Current()->Format) {
+ switch (TPageIter::Current()->Format) {
+ case MBDB_FORMAT_RAW:
+ Reader.Reset(new NMicroBDB::TRawPageReader<TVal, TPageIter>(this));
+ break;
+ case MBDB_FORMAT_COMPRESSED:
+ Reader.Reset(new NMicroBDB::TCompressedReader<TVal, TPageIter>(this));
+ break;
+ default:
+ return MBDB_NOT_SUPPORTED;
+ }
+ Format = EPageFormat(TPageIter::Current()->Format);
+ } else {
+ Y_ASSERT(Reader.Get() != nullptr);
+ Reader->Reset();
+ }
+ return 0;
+ }
+
+ const TVal* Rec;
+ TReaderHolder Reader;
+ int RecNum; //!< number of records on the current page after the current record
+ EPageFormat Format;
+};
+
+template <class TBaseReader>
+class TInputPageIterator: public TBaseReader {
+public:
+ typedef TBaseReader TReader;
+
+ TInputPageIterator()
+ : Buf(nullptr)
+ {
+ Term();
+ }
+
+ ~TInputPageIterator() {
+ Term();
+ }
+
+ TDatPage* Current() {
+ return CurPage;
+ }
+
+ int Freeze() {
+ return (Frozen = (PageNum == -1) ? 0 : PageNum);
+ }
+
+ void Unfreeze() {
+ Frozen = -1;
+ }
+
+ inline int IsFrozen() const {
+ return Frozen + 1;
+ }
+
+ inline size_t GetPageSize() const {
+ return TReader::GetPageSize();
+ }
+
+ inline int GetPageNum() const {
+ return PageNum;
+ }
+
+ inline int IsEof() const {
+ return Eof;
+ }
+
+ TDatPage* Next() {
+ if (PageNum >= Maxpage && ReadBuf()) {
+ Eof = Eof ? Eof : TReader::IsEof();
+ return CurPage = nullptr;
+ }
+ return CurPage = (TDatPage*)(Buf + ((++PageNum) % Bufpages) * GetPageSize());
+ }
+
+ TDatPage* GotoPage(int pageno) {
+ if (pageno <= Maxpage && pageno >= (Maxpage - Pages + 1)) {
+ PageNum = pageno;
+ return CurPage = (TDatPage*)(Buf + (PageNum % Bufpages) * GetPageSize());
+ }
+ if (IsFrozen() || TReader::GotoPage(pageno))
+ return nullptr;
+ Maxpage = PageNum = pageno - 1;
+ Eof = 0;
+ return Next();
+ }
+
+protected:
+ int Init(size_t pages, int pagesOrBytes) {
+ Term();
+ if (pagesOrBytes == -1)
+ Bufpages = TReader::GetLastPage();
+ else if (pagesOrBytes)
+ Bufpages = pages;
+ else
+ Bufpages = pages / GetPageSize();
+ if (!TReader::GetLastPage()) {
+ Bufpages = 0;
+ assert(Eof == 1);
+ return 0;
+ }
+ int lastPage = TReader::GetLastPage();
+ if (lastPage >= 0)
+ Bufpages = (int)Min(lastPage, Bufpages);
+ Bufpages = Max(2, Bufpages);
+ Eof = 0;
+ ABuf.Alloc(Bufpages * GetPageSize());
+ return (Buf = ABuf.Begin()) ? 0 : ENOMEM;
+ // return (Buf = (char*)malloc(Bufpages * GetPageSize())) ? 0 : ENOMEM;
+ }
+
+ int Term() {
+ // free(Buf);
+ ABuf.Dealloc();
+ Buf = nullptr;
+ Maxpage = PageNum = Frozen = -1;
+ Bufpages = 0;
+ Pages = 0;
+ Eof = 1;
+ CurPage = nullptr;
+ return 0;
+ }
+
+ int ReadBuf() {
+ int nvec;
+ iovec vec[2];
+ int maxpage = (Frozen == -1 ? Maxpage + 1 : Frozen) + Bufpages - 1;
+ int minpage = Maxpage + 1;
+ if (maxpage < minpage)
+ return EAGAIN;
+ minpage %= Bufpages;
+ maxpage %= Bufpages;
+ if (maxpage < minpage) {
+ vec[0].iov_base = Buf + GetPageSize() * minpage;
+ vec[0].iov_len = GetPageSize() * (Bufpages - minpage);
+ vec[1].iov_base = Buf;
+ vec[1].iov_len = GetPageSize() * (maxpage + 1);
+ nvec = 2;
+ } else {
+ vec[0].iov_base = Buf + GetPageSize() * minpage;
+ vec[0].iov_len = GetPageSize() * (maxpage - minpage + 1);
+ nvec = 1;
+ }
+ TReader::ReadPages(vec, nvec, &Pages);
+ Maxpage += Pages;
+ return !Pages;
+ }
+
+ int Maxpage, PageNum, Frozen, Bufpages, Eof, Pages;
+ TDatPage* CurPage;
+ // TMappedArray<char> ABuf;
+ TMappedAllocation ABuf;
+ char* Buf;
+};
+
+template <class TBaseReader>
+class TInputPageIteratorMT: public TBaseReader {
+public:
+ typedef TBaseReader TReader;
+
+ TInputPageIteratorMT()
+ : CurBuf(0)
+ , CurReadBuf(0)
+ , Buf(nullptr)
+ {
+ Term();
+ }
+
+ ~TInputPageIteratorMT() {
+ Term();
+ }
+
+ TDatPage* Current() {
+ return CurPage;
+ }
+
+ int Freeze() {
+ return (Frozen = (PageNum == -1) ? 0 : PageNum);
+ }
+
+ void Unfreeze() {
+ Frozen = -1;
+ }
+
+ inline int IsFrozen() const {
+ return Frozen + 1;
+ }
+
+ inline size_t GetPageSize() const {
+ return TReader::GetPageSize();
+ }
+
+ inline int GetPageNum() const {
+ return PageNum;
+ }
+
+ inline int IsEof() const {
+ return Eof;
+ }
+
+ TDatPage* Next() {
+ if (Eof)
+ return CurPage = nullptr;
+ if (PageNum >= Maxpage && ReadBuf()) {
+ Eof = Eof ? Eof : TReader::IsEof();
+ return CurPage = nullptr;
+ }
+ return CurPage = (TDatPage*)(Buf + ((++PageNum) % Bufpages) * GetPageSize());
+ }
+
+ TDatPage* GotoPage(int pageno) {
+ if (pageno <= Maxpage && pageno >= (Maxpage - Pages + 1)) {
+ PageNum = pageno;
+ return CurPage = (TDatPage*)(Buf + (PageNum % Bufpages) * GetPageSize());
+ }
+ if (IsFrozen() || TReader::GotoPage(pageno))
+ return nullptr;
+ Maxpage = PageNum = pageno - 1;
+ Eof = 0;
+ return Next();
+ }
+
+ void ReadPages() {
+ // fprintf(stderr, "ReadPages started\n");
+ bool eof = false;
+ while (!eof) {
+ QEvent[CurBuf].Wait();
+ if (Finish)
+ return;
+ int pages = ReadCurBuf(Bufs[CurBuf]);
+ PagesM[CurBuf] = pages;
+ eof = !pages;
+ AEvent[CurBuf].Signal();
+ CurBuf ^= 1;
+ }
+ }
+
+protected:
+ int Init(size_t pages, int pagesOrBytes) {
+ Term();
+ if (pagesOrBytes == -1)
+ Bufpages = TReader::GetLastPage();
+ else if (pagesOrBytes)
+ Bufpages = pages;
+ else
+ Bufpages = pages / GetPageSize();
+ if (!TReader::GetLastPage()) {
+ Bufpages = 0;
+ assert(Eof == 1);
+ return 0;
+ }
+ int lastPage = TReader::GetLastPage();
+ if (lastPage >= 0)
+ Bufpages = (int)Min(lastPage, Bufpages);
+ Bufpages = Max(2, Bufpages);
+ Eof = 0;
+ ABuf.Alloc(Bufpages * GetPageSize() * 2);
+ Bufs[0] = ABuf.Begin();
+ Bufs[1] = Bufs[0] + Bufpages * GetPageSize();
+ // return (Buf = (char*)malloc(Bufpages * GetPageSize())) ? 0 : ENOMEM;
+ Finish = false;
+ ReadThread = std::thread([this]() {
+ TThread::SetCurrentThreadName("DatReader");
+ ReadPages();
+ });
+ QEvent[0].Signal();
+ return Bufs[0] ? 0 : ENOMEM;
+ }
+
+ void StopThread() {
+ Finish = true;
+ QEvent[0].Signal();
+ QEvent[1].Signal();
+ ReadThread.join();
+ }
+
+ int Term() {
+ // free(Buf);
+ if (ReadThread.joinable())
+ StopThread();
+ ABuf.Dealloc();
+ Buf = nullptr;
+ Bufs[0] = nullptr;
+ Bufs[1] = nullptr;
+ Maxpage = MaxpageR = PageNum = Frozen = -1;
+ Bufpages = 0;
+ Pages = 0;
+ Eof = 1;
+ CurPage = nullptr;
+ return 0;
+ }
+
+ int ReadCurBuf(char* buf) {
+ int nvec;
+ iovec vec[2];
+ int maxpage = (Frozen == -1 ? MaxpageR + 1 : Frozen) + Bufpages - 1;
+ int minpage = MaxpageR + 1;
+ if (maxpage < minpage)
+ return EAGAIN;
+ minpage %= Bufpages;
+ maxpage %= Bufpages;
+ if (maxpage < minpage) {
+ vec[0].iov_base = buf + GetPageSize() * minpage;
+ vec[0].iov_len = GetPageSize() * (Bufpages - minpage);
+ vec[1].iov_base = buf;
+ vec[1].iov_len = GetPageSize() * (maxpage + 1);
+ nvec = 2;
+ } else {
+ vec[0].iov_base = buf + GetPageSize() * minpage;
+ vec[0].iov_len = GetPageSize() * (maxpage - minpage + 1);
+ nvec = 1;
+ }
+ int pages;
+ TReader::ReadPages(vec, nvec, &pages);
+ MaxpageR += pages;
+ return pages;
+ }
+
+ int ReadBuf() {
+ QEvent[CurReadBuf ^ 1].Signal();
+ AEvent[CurReadBuf].Wait();
+ Buf = Bufs[CurReadBuf];
+ Maxpage += (Pages = PagesM[CurReadBuf]);
+ CurReadBuf ^= 1;
+ return !Pages;
+ }
+
+ int Maxpage, MaxpageR, PageNum, Frozen, Bufpages, Eof, Pages;
+ TDatPage* CurPage;
+ // TMappedArray<char> ABuf;
+ ui32 CurBuf;
+ ui32 CurReadBuf;
+ TMappedAllocation ABuf;
+ char* Buf;
+ char* Bufs[2];
+ ui32 PagesM[2];
+ TAutoEvent QEvent[2];
+ TAutoEvent AEvent[2];
+ std::thread ReadThread;
+ bool Finish;
+};
+
+template <typename TFileManip>
+class TInputPageFileImpl: private TNonCopyable {
+protected:
+ TFileManip FileManip;
+
+public:
+ TInputPageFileImpl()
+ : Pagesize(0)
+ , Fd(-1)
+ , Eof(1)
+ , Error(0)
+ , Pagenum(0)
+ , Recordsig(0)
+ {
+ Term();
+ }
+
+ ~TInputPageFileImpl() {
+ Term();
+ }
+
+ inline int IsEof() const {
+ return Eof;
+ }
+
+ inline int GetError() const {
+ return Error;
+ }
+
+ inline size_t GetPageSize() const {
+ return Pagesize;
+ }
+
+ inline int GetLastPage() const {
+ return Pagenum;
+ }
+
+ inline ui32 GetRecordSig() const {
+ return Recordsig;
+ }
+
+ inline bool IsOpen() const {
+ return FileManip.IsOpen();
+ }
+
+protected:
+ int Init(const char* fname, ui32 recsig, ui32* gotrecsig = nullptr, bool direct = false) {
+ Error = FileManip.Open(fname, direct);
+ return Error ? Error : Init(TFile(), recsig, gotrecsig);
+ }
+
+ int Init(const TFile& file, ui32 recsig, ui32* gotrecsig = nullptr) {
+ if (!file.IsOpen() && !FileManip.IsOpen())
+ return MBDB_NOT_INITIALIZED;
+ if (file.IsOpen() && FileManip.IsOpen())
+ return MBDB_ALREADY_INITIALIZED;
+ if (file.IsOpen()) {
+ Error = FileManip.Init(file);
+ if (Error)
+ return Error;
+ }
+
+ // TArrayHolder<ui8> buf(new ui8[METASIZE + FS_BLOCK_SIZE]);
+ // ui8* ptr = (buf.Get() + FS_BLOCK_SIZE - ((ui64)buf.Get() & (FS_BLOCK_SIZE - 1)));
+ TMappedArray<ui8> buf;
+ buf.Create(METASIZE);
+ ui8* ptr = &buf[0];
+ TDatMetaPage* meta = (TDatMetaPage*)ptr;
+ ssize_t size = METASIZE;
+ ssize_t ret;
+ while (size && (ret = FileManip.Read(ptr, (unsigned)size)) > 0) {
+ Y_ASSERT(ret <= size);
+ size -= ret;
+ ptr += ret;
+ }
+ if (size) {
+ FileManip.Close();
+ return Error = MBDB_BAD_METAPAGE;
+ }
+ if (gotrecsig)
+ *gotrecsig = meta->RecordSig;
+ return Init(TFile(), meta, recsig);
+ }
+
+ int Init(TAutoPtr<IInputStream> input, ui32 recsig, ui32* gotrecsig = nullptr) {
+ if (!input && !FileManip.IsOpen())
+ return MBDB_NOT_INITIALIZED;
+ if (FileManip.IsOpen())
+ return MBDB_ALREADY_INITIALIZED;
+
+ Error = FileManip.Open(input);
+ if (Error)
+ return Error;
+
+ TArrayHolder<ui8> buf(new ui8[METASIZE]);
+ ui8* ptr = buf.Get();
+ ssize_t size = METASIZE;
+ ssize_t ret;
+ while (size && (ret = FileManip.Read(ptr, (unsigned)size)) > 0) {
+ Y_ASSERT(ret <= size);
+ size -= ret;
+ ptr += ret;
+ }
+ if (size) {
+ FileManip.Close();
+ return Error = MBDB_BAD_METAPAGE;
+ }
+ TDatMetaPage* meta = (TDatMetaPage*)buf.Get();
+ if (gotrecsig)
+ *gotrecsig = meta->RecordSig;
+ return Init(TFile(), meta, recsig);
+ }
+
+ int Init(const TFile& file, const TDatMetaPage* meta, ui32 recsig) {
+ if (!file.IsOpen() && !FileManip.IsOpen())
+ return MBDB_NOT_INITIALIZED;
+ if (file.IsOpen() && FileManip.IsOpen())
+ return MBDB_ALREADY_INITIALIZED;
+ if (file.IsOpen()) {
+ Error = FileManip.Init(file);
+ if (Error)
+ return Error;
+ }
+
+ if (meta->MetaSig != METASIG)
+ Error = MBDB_BAD_METAPAGE;
+ else if (meta->RecordSig != recsig)
+ Error = MBDB_BAD_RECORDSIG;
+
+ if (Error) {
+ FileManip.Close();
+ return Error;
+ }
+
+ i64 flength = FileManip.GetLength();
+ if (flength >= 0) {
+ i64 fsize = flength;
+ fsize -= METASIZE;
+ if (fsize % meta->PageSize)
+ return Error = MBDB_BAD_FILE_SIZE;
+ Pagenum = (int)(fsize / meta->PageSize);
+ } else {
+ Pagenum = -1;
+ }
+ Pagesize = meta->PageSize;
+ Recordsig = meta->RecordSig;
+ Error = Eof = 0;
+ return Error;
+ }
+
+ int ReadPages(iovec* vec, int nvec, int* pages) {
+ *pages = 0;
+
+ if (Eof || Error)
+ return Error;
+
+ ssize_t size = 0, delta = 0, total = 0;
+ iovec* pvec = vec;
+ int vsize = nvec;
+
+ while (vsize && (size = Readv(FileManip, pvec, (int)Min(vsize, 16))) > 0) {
+ total += size;
+ if (delta) {
+ size += delta;
+ pvec->iov_len += delta;
+ pvec->iov_base = (char*)pvec->iov_base - delta;
+ delta = 0;
+ }
+ while (size) {
+ if ((size_t)size >= pvec->iov_len) {
+ size -= pvec->iov_len;
+ ++pvec;
+ --vsize;
+ } else {
+ delta = size;
+ pvec->iov_len -= size;
+ pvec->iov_base = (char*)pvec->iov_base + size;
+ size = 0;
+ }
+ }
+ }
+ if (delta) {
+ pvec->iov_len += delta;
+ pvec->iov_base = (char*)pvec->iov_base - delta;
+ }
+ if (size < 0)
+ return Error = errno ? errno : MBDB_READ_ERROR;
+ if (total % Pagesize)
+ return Error = MBDB_BAD_FILE_SIZE;
+ if (vsize)
+ Eof = 1;
+ *pages = total / Pagesize; // it would be better to assign it after the for-loops
+ for (; total; ++vec, total -= size)
+ for (size = 0; size < total && (size_t)size < vec->iov_len; size += Pagesize)
+ if (((TDatPage*)((char*)vec->iov_base + size))->PageSig != PAGESIG)
+ return Error = MBDB_BAD_PAGESIG;
+ return Error;
+ }
+
+ int GotoPage(int page) {
+ if (Error)
+ return Error;
+ Eof = 0;
+ i64 offset = (i64)page * Pagesize + METASIZE;
+ if (offset != FileManip.Seek(offset, SEEK_SET))
+ Error = MBDB_BAD_FILE_SIZE;
+ return Error;
+ }
+
+ int Term() {
+ return FileManip.Close();
+ }
+
+ size_t Pagesize;
+ int Fd;
+ int Eof;
+ int Error;
+ int Pagenum; //!< number of pages in this file
+ ui32 Recordsig;
+};
+
+template <class TBaseReader>
+class TMappedInputPageIterator: public TBaseReader {
+public:
+ typedef TBaseReader TReader;
+
+ TMappedInputPageIterator() {
+ Term();
+ }
+
+ ~TMappedInputPageIterator() {
+ Term();
+ }
+
+ TDatPage* Current() {
+ return CurPage;
+ }
+
+ inline size_t GetPageSize() const {
+ return TReader::GetPageSize();
+ }
+
+ inline int GetPageNum() const {
+ return PageNum;
+ }
+
+ inline int IsEof() const {
+ return Eof;
+ }
+
+ inline int IsFrozen() const {
+ return 0;
+ }
+
+ TDatPage* Next() {
+ i64 pos = (i64)(++PageNum) * GetPageSize() + METASIZE;
+ if (pos < 0 || pos >= (i64)TReader::GetSize()) {
+ Eof = 1;
+ return CurPage = nullptr;
+ }
+ return CurPage = (TDatPage*)((char*)TReader::GetData() + pos);
+ }
+
+protected:
+ int Init(size_t /*pages*/, int /*pagesOrBytes*/) {
+ Term();
+ Eof = 0;
+ return 0;
+ }
+
+ int Term() {
+ PageNum = -1;
+ Eof = 1;
+ CurPage = nullptr;
+ return 0;
+ }
+
+ TDatPage* GotoPage(int pageno) {
+ PageNum = pageno - 1;
+ Eof = 0;
+ return Next();
+ }
+
+ int PageNum, Eof, Pages, Pagenum;
+ TDatPage* CurPage;
+};
+
+using TInputPageFile = TInputPageFileImpl<TInputFileManip>;
+
+template <class TVal,
+ typename TBaseRecIter = TInputRecordIterator<TVal, TInputPageIterator<TInputPageFile>>>
+class TInDatFileImpl: public TBaseRecIter {
+public:
+ typedef TBaseRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+ typedef typename TRecIter::TPageIter::TReader TReader;
+ using TRecIter::GotoPage;
+
+ int Open(const char* fname, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr, bool direct = false) {
+ int ret = TReader::Init(fname, TVal::RecordSig, gotRecordSig, direct);
+ return ret ? ret : Open2(pages, pagesOrBytes);
+ }
+
+ int Open(const TFile& file, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) {
+ int ret = TReader::Init(file, TVal::RecordSig, gotRecordSig);
+ return ret ? ret : Open2(pages, pagesOrBytes);
+ }
+
+ int Open(TAutoPtr<IInputStream> input, size_t pages = 1, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) {
+ int ret = TReader::Init(input, TVal::RecordSig, gotRecordSig);
+ return ret ? ret : Open2(pages, pagesOrBytes);
+ }
+
+ int Open(const TFile& file, const TDatMetaPage* meta, size_t pages = 1, int pagesOrBytes = 1) {
+ int ret = TReader::Init(file, meta, TVal::RecordSig);
+ return ret ? ret : Open2(pages, pagesOrBytes);
+ }
+
+ int Close() {
+ int ret1 = TRecIter::Term();
+ int ret2 = TPageIter::Term();
+ int ret3 = TReader::Term();
+ return ret1 ? ret1 : ret2 ? ret2 : ret3;
+ }
+
+ const TVal* GotoLastPage() {
+ return TReader::GetLastPage() <= 0 ? nullptr : TRecIter::GotoPage(TReader::GetLastPage() - 1);
+ }
+
+private:
+ int Open2(size_t pages, int pagesOrBytes) {
+ int ret = TPageIter::Init(pages, pagesOrBytes);
+ if (!ret)
+ ret = TRecIter::Init();
+ if (ret)
+ Close();
+ return ret;
+ }
+};
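+
+/* Usage sketch (illustrative only; TMyRecord is a hypothetical record type that
+   defines a RecordSig constant):
+
+       TInDatFileImpl<TMyRecord> in;
+       if (in.Open("data.dat") == 0) {
+           while (const TMyRecord* rec = in.Next()) {
+               // process *rec
+           }
+           in.Close();
+       }
+ */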
+
+template <class TVal>
+class TInIndexFile: protected TInDatFileImpl<TVal> {
+ typedef TInDatFileImpl<TVal> TDatFile;
+ typedef typename TDatFile::TRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+ typedef typename TExtInfoType<TVal>::TResult TExtInfo;
+
+public:
+ using TDatFile::IsOpen;
+
+ TInIndexFile()
+ : Index0(nullptr)
+ {
+ }
+
+ int Open(const char* fname, size_t pages = 2, int pagesOrBytes = 1, ui32* gotRecordSig = nullptr) {
+ int ret = TDatFile::Open(fname, pages, pagesOrBytes, gotRecordSig);
+ if (ret)
+ return ret;
+ if (!(Index0 = (TDatPage*)malloc(TPageIter::GetPageSize()))) {
+ TDatFile::Close();
+ return MBDB_NO_MEMORY;
+ }
+ if (!TExtInfoType<TVal>::Exists && SizeOf((TVal*)nullptr))
+ RecsOnPage = (TPageIter::GetPageSize() - sizeof(TDatPage)) / DatCeil(SizeOf((TVal*)nullptr));
+ TDatFile::Next();
+ memcpy(Index0, TPageIter::Current(), TPageIter::GetPageSize());
+ return 0;
+ }
+
+ int Close() {
+ free(Index0);
+ Index0 = nullptr;
+ return TDatFile::Close();
+ }
+
+ inline int GetError() const {
+ return TDatFile::GetError();
+ }
+
+ int FindKey(const TVal* akey, const TExtInfo* extInfo = nullptr) {
+ assert(IsOpen());
+ if (TExtInfoType<TVal>::Exists || !SizeOf((TVal*)nullptr))
+ return FindVszKey(akey, extInfo);
+ int num = FindKeyOnPage(Index0, akey);
+ TDatPage* page = TPageIter::GotoPage(num + 1);
+ if (!page)
+ return 0;
+ num = FindKeyOnPage(page, akey);
+ num += (TPageIter::GetPageNum() - 1) * RecsOnPage;
+ return num;
+ }
+
+ int FindVszKey(const TVal* akey, const TExtInfo* extInfo = NULL) {
+ int num = FindVszKeyOnPage(Index0, akey, extInfo);
+ int num_add = 0;
+ for (int p = 0; p < num; p++) {
+ TDatPage* page = TPageIter::GotoPage(p + 1);
+ if (!page)
+ return 0;
+ num_add += page->RecNum;
+ }
+ TDatPage* page = TPageIter::GotoPage(num + 1);
+ if (!page)
+ return 0;
+ num = FindVszKeyOnPage(page, akey, extInfo);
+ num += num_add;
+ return num;
+ }
+
+protected:
+ int FindKeyOnPage(TDatPage* page, const TVal* key) {
+ int left = 0;
+ int right = page->RecNum - 1;
+ int recsize = DatCeil(SizeOf((TVal*)nullptr));
+ while (left < right) {
+ int middle = (left + right) >> 1;
+ if (*((TVal*)((char*)page + sizeof(TDatPage) + middle * recsize)) < *key)
+ left = middle + 1;
+ else
+ right = middle;
+ }
+ //borders check (left and right)
+ return (left == 0 || *((TVal*)((char*)page + sizeof(TDatPage) + left * recsize)) < *key) ? left : left - 1;
+ }
+
+ // deserializes rawExtInfoA into extInfoA only if necessary
+ inline bool KeyLess_(const TVal* a, const TVal* b,
+ TExtInfo* extInfoA, const TExtInfo* extInfoB,
+ const ui8* rawExtInfoA, size_t rawLen) {
+ if (*a < *b) {
+ return true;
+ } else if (!extInfoB || *b < *a) {
+ return false;
+ } else {
+ // *a == *b && extInfoB
+ Y_PROTOBUF_SUPPRESS_NODISCARD extInfoA->ParseFromArray(rawExtInfoA, rawLen);
+ return (*extInfoA < *extInfoB);
+ }
+ }
+
+ int FindVszKeyOnPage(TDatPage* page, const TVal* key, const TExtInfo* extInfo) {
+ TVal* cur = (TVal*)((char*)page + sizeof(TDatPage));
+ ui32 recnum = page->RecNum;
+ if (!TExtInfoType<TVal>::Exists) {
+ for (; recnum > 0 && *cur < *key; --recnum)
+ cur = (TVal*)((char*)cur + DatCeil(SizeOf(cur)));
+ } else {
+ size_t ll;
+ size_t l;
+ size_t sz = NMicroBDB::SizeOfExt(cur, &ll, &l);
+ TExtInfo ei;
+ for (; recnum > 0 && KeyLess_(cur, key, &ei, extInfo, (ui8*)cur + sz + ll, l); --recnum) {
+ cur = (TVal*)((ui8*)cur + DatCeil(sz + ll + l));
+ sz = NMicroBDB::SizeOfExt(cur, &ll, &l);
+ }
+ }
+
+ int idx = page->RecNum - recnum - 1;
+ return (idx >= 0) ? idx : 0;
+ }
+
+ TDatPage* Index0;
+ int RecsOnPage;
+};
+
+template <class TVal, class TKey, class TPageIterator = TInputPageIterator<TInputPageFile>>
+class TKeyFileMixin: public TInDatFileImpl<TVal, TInputRecordIterator<TVal, TPageIterator>> {
+protected:
+ TInIndexFile<TKey> KeyFile;
+};
+
+template <class TVal, class TKey, class TBase = TKeyFileMixin<TVal, TKey>>
+class TDirectInDatFile: public TBase {
+ typedef TBase TDatFile;
+ typedef typename TDatFile::TRecIter TRecIter;
+ typedef typename TDatFile::TPageIter TPageIter;
+
+public:
+ void Open(const char* path, size_t pages = 1, size_t keypages = 1, int pagesOrBytes = 1) {
+ int ret;
+ ui32 gotRecordSig = 0;
+
+ ret = TDatFile::Open(path, pages, pagesOrBytes, &gotRecordSig);
+ if (ret) {
+ ythrow yexception() << ErrorMessage(ret, "Failed to open input file", path, TVal::RecordSig, gotRecordSig);
+ }
+ char KeyName[PATH_MAX + 1];
+ if (DatNameToIdx(KeyName, path)) {
+ ythrow yexception() << ErrorMessage(MBDB_BAD_FILENAME, "Failed to open input file", path);
+ }
+ gotRecordSig = 0;
+ ret = KeyFile.Open(KeyName, keypages, 1, &gotRecordSig);
+ if (ret) {
+ ythrow yexception() << ErrorMessage(ret, "Failed to open input keyfile", KeyName, TKey::RecordSig, gotRecordSig);
+ }
+ }
+
+ void Close() {
+ int ret;
+
+ if (TDatFile::IsOpen() && (ret = TDatFile::GetError()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error before closing input file");
+ if ((ret = TDatFile::Close()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error while closing input file");
+
+ if (KeyFile.IsOpen() && (ret = KeyFile.GetError()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error before closing input keyfile");
+ if ((ret = KeyFile.Close()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error while closing input keyfile");
+ }
+
+ const TVal* FindRecord(const TKey* key, const typename TExtInfoType<TKey>::TResult* extInfo = nullptr) {
+ int page = KeyFile.FindKey(key, extInfo);
+ const TVal* val = TRecIter::GotoPage(page);
+ if (!TExtInfoType<TVal>::Exists || !extInfo) {
+ TKey k;
+ while (val) {
+ TMakeExtKey<TVal, TKey>::Make(&k, nullptr, val, nullptr);
+ if (!(k < *key))
+ break;
+ val = TRecIter::Next();
+ }
+ } else {
+ typename TExtInfoType<TVal>::TResult valExt;
+ TKey k;
+ typename TExtInfoType<TKey>::TResult kExt;
+ while (val) {
+ TRecIter::GetExtInfo(&valExt);
+ TMakeExtKey<TVal, TKey>::Make(&k, &kExt, val, &valExt);
+ if (*key < k || !(k < *key) && !(kExt < *extInfo)) // k > *key || k == *key && kExt >= *extInfo
+ break;
+ val = TRecIter::Next();
+ }
+ }
+ return val;
+ }
+
+ int FindPagesNo(const TKey* key, const typename TExtInfoType<TVal>::TResult* extInfo = NULL) {
+ return KeyFile.FindKey(key, extInfo);
+ }
+
+protected:
+ using TBase::KeyFile;
+};
diff --git a/library/cpp/microbdb/microbdb.cpp b/library/cpp/microbdb/microbdb.cpp
new file mode 100644
index 0000000000..c10dbdf126
--- /dev/null
+++ b/library/cpp/microbdb/microbdb.cpp
@@ -0,0 +1 @@
+#include "microbdb.h"
diff --git a/library/cpp/microbdb/microbdb.h b/library/cpp/microbdb/microbdb.h
new file mode 100644
index 0000000000..7521887337
--- /dev/null
+++ b/library/cpp/microbdb/microbdb.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <util/folder/dirut.h>
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4706) /*assignment within conditional expression*/
+#pragma warning(disable : 4267) /*conversion from 'size_t' to 'type', possible loss of data*/
+#endif
+
+#include "align.h"
+#include "extinfo.h"
+#include "header.h"
+#include "reader.h"
+#include "heap.h"
+#include "file.h"
+#include "sorter.h"
+#include "input.h"
+#include "output.h"
+#include "sorterdef.h"
+
+inline int MakeSorterTempl(char path[/*FILENAME_MAX*/], const char* prefix) {
+ int ret = MakeTempDir(path, prefix);
+ if (!ret && strlcat(path, "%06d", FILENAME_MAX) > FILENAME_MAX - 100)
+ ret = EINVAL;
+ if (ret)
+ path[0] = 0;
+ return ret;
+}
+
+inline int GetMeta(TFile& file, TDatMetaPage* meta) {
+ ui8 buf[METASIZE], *ptr = buf;
+ ssize_t size = sizeof(buf), ret;
+ while (size && (ret = file.Read(ptr, size)) > 0) {
+ size -= ret;
+ ptr += ret;
+ }
+ if (size)
+ return MBDB_BAD_FILE_SIZE;
+ ptr = buf; // gcc 4.4 warning fix
+ *meta = *(TDatMetaPage*)ptr;
+ return (meta->MetaSig == METASIG) ? 0 : MBDB_BAD_METAPAGE;
+}
+
+template <class TRec>
+inline bool IsDatFile(const char* fname) {
+ TDatMetaPage meta;
+ TFile f(fname, RdOnly);
+ return !GetMeta(f, &meta) && meta.RecordSig == TRec::RecordSig;
+}
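+
+// Usage sketch (illustrative; TMyRecord is a hypothetical record type with a
+// RecordSig constant):
+//
+//     if (IsDatFile<TMyRecord>("data.dat")) {
+//         // the file has a valid metapage and a matching RecordSig
+//     }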
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/library/cpp/microbdb/noextinfo.proto b/library/cpp/microbdb/noextinfo.proto
new file mode 100644
index 0000000000..6a78882e07
--- /dev/null
+++ b/library/cpp/microbdb/noextinfo.proto
@@ -0,0 +1,4 @@
+
+message TNoExtInfo {
+}
+
diff --git a/library/cpp/microbdb/output.h b/library/cpp/microbdb/output.h
new file mode 100644
index 0000000000..d0ecab2108
--- /dev/null
+++ b/library/cpp/microbdb/output.h
@@ -0,0 +1,1049 @@
+#pragma once
+
+#include "header.h"
+#include "file.h"
+
+#include <util/generic/buffer.h>
+#include <util/memory/tempbuf.h>
+
+#include <sys/uio.h>
+
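+// Emulates writev(2) on top of a TFileManip: writes each iovec buffer in turn
+// and returns the total number of bytes written, or -1 as soon as a write fails.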
+template <class TFileManip>
+inline ssize_t Writev(TFileManip& fileManip, const struct iovec* iov, int iovcnt) {
+ ssize_t written_count = 0;
+ for (int n = 0; n < iovcnt; n++) {
+ ssize_t last_write = fileManip.Write(iov[n].iov_base, iov[n].iov_len);
+ if (last_write < 0)
+ return -1;
+ written_count += last_write;
+ }
+ return written_count;
+}
+
+//*********************************************************************
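+// Page indexers: TOutputRecordIterator notifies TIndexer::NextPage() about every
+// completed data page. TFakeIndexer ignores the notification, while TCallbackIndexer
+// forwards it to a user-supplied callback (which can, for example, remember the
+// first key of each page in order to build an index file).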
+struct TFakeIndexer {
+ inline void NextPage(TDatPage*) noexcept {
+ }
+};
+
+struct TCallbackIndexer {
+ typedef void (*TCallback)(void* This, const TDatPage* page);
+
+ TCallbackIndexer() {
+ Callback = nullptr;
+ }
+
+ void SetCallback(void* t, TCallback c) {
+ This = t;
+ Callback = c;
+ }
+
+ void NextPage(TDatPage* dat) {
+ Callback(This, dat);
+ }
+
+ TCallback Callback;
+ void* This;
+};
+
+template <class TVal, typename TBasePageIter, typename TBaseIndexer = TFakeIndexer, typename TCompressor = TFakeCompression>
+class TOutputRecordIterator;
+
+template <class TVal, typename TBasePageIter, typename TBaseIndexer>
+class TOutputRecordIterator<TVal, TBasePageIter, TBaseIndexer, TFakeCompression>
+ : public TBasePageIter, public TBaseIndexer {
+public:
+ enum EOffset {
+ WrongOffset = size_t(-1)
+ };
+
+ typedef TBasePageIter TPageIter;
+ typedef TBaseIndexer TIndexer;
+
+ TOutputRecordIterator() {
+ Clear();
+ }
+
+ ~TOutputRecordIterator() {
+ Term();
+ }
+
+ inline const TVal* Current() const {
+ return Rec;
+ }
+
+ const TVal* Push(const TVal* v, const typename TExtInfoType<TVal>::TResult* extInfo = nullptr) {
+ NMicroBDB::AssertValid(v);
+ size_t len = SizeOf(v);
+ if (!TExtInfoType<TVal>::Exists)
+ return (Reserve(len)) ? (TVal*)memcpy(Rec, v, len) : nullptr;
+ else if (extInfo) {
+ size_t extSize = extInfo->ByteSize();
+ size_t extLenSize = len_long((i64)extSize);
+ if (!Reserve(len + extLenSize + extSize))
+ return nullptr;
+ memcpy(Rec, v, len);
+ out_long((i64)extSize, (char*)Rec + len);
+ extInfo->SerializeWithCachedSizesToArray((ui8*)Rec + len + extLenSize);
+ return Rec;
+ } else {
+ size_t extLenSize = len_long((i64)0);
+ if (!Reserve(len + extLenSize))
+ return nullptr;
+ memcpy(Rec, v, len);
+ out_long((i64)0, (char*)Rec + len);
+ return Rec;
+ }
+ }
+
+ const TVal* Push(const TVal* v, const ui8* extInfoRaw, size_t extLen) {
+ NMicroBDB::AssertValid(v);
+ size_t sz = SizeOf(v);
+ if (!Reserve(sz + extLen))
+ return nullptr;
+ memcpy(Rec, v, sz);
+ memcpy((ui8*)Rec + sz, extInfoRaw, extLen);
+ return Rec;
+ }
+
+ // Use only with values stored in the internal buffer of microbdb readers/writers.
+ // The method expects the serialized extInfo right after this record.
+ const TVal* PushWithExtInfo(const TVal* v) {
+ NMicroBDB::AssertValid(v);
+ size_t extSize;
+ size_t extLenSize;
+ size_t sz = NMicroBDB::SizeOfExt(v, &extLenSize, &extSize);
+ sz += extLenSize + extSize;
+ if (!Reserve(sz))
+ return nullptr;
+ memcpy(Rec, v, sz);
+ return Rec;
+ }
+
+ TVal* Reserve(size_t len) {
+ if (CurLen + DatCeil(len) > TPageIter::GetPageSize()) {
+ if (sizeof(TDatPage) + DatCeil(len) > TPageIter::GetPageSize())
+ return Rec = nullptr;
+ if (TPageIter::Current() && RecNum) {
+ TPageIter::Current()->RecNum = RecNum;
+ TPageIter::Current()->Format = MBDB_FORMAT_RAW;
+ memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen);
+ TIndexer::NextPage(TPageIter::Current());
+ RecNum = 0;
+ }
+ if (!TPageIter::Next()) {
+ CurLen = TPageIter::GetPageSize();
+ return Rec = nullptr;
+ }
+ CurLen = sizeof(TDatPage);
+ }
+ LenForOffset = CurLen;
+ Rec = (TVal*)((char*)TPageIter::Current() + CurLen);
+ DatSet(Rec, len);
+
+ CurLen += DatCeil(len);
+
+ ++RecNum;
+ return Rec;
+ }
+
+ void Flush() {
+ TPageIter::Current()->RecNum = RecNum;
+ TPageIter::Current()->Format = MBDB_FORMAT_RAW;
+ }
+
+ size_t Offset() const {
+ return Rec ? TPageIter::Offset() + LenForOffset : WrongOffset;
+ }
+
+ void ResetDat() {
+ CurLen = (char*)Rec - (char*)TPageIter::Current();
+ size_t len;
+ if (!TExtInfoType<TVal>::Exists) {
+ len = SizeOf(Rec);
+ } else {
+ size_t ll;
+ size_t l;
+ len = NMicroBDB::SizeOfExt(Rec, &ll, &l);
+ len += ll + l;
+ }
+ CurLen += DatCeil(len);
+ }
+
+protected:
+ void Clear() {
+ Rec = nullptr;
+ RecNum = 0;
+ CurLen = 0;
+ LenForOffset = 0;
+ }
+
+ int Init() {
+ Clear();
+ CurLen = TPageIter::GetPageSize();
+ return 0;
+ }
+
+ int Term() {
+ if (TPageIter::Current()) {
+ TPageIter::Current()->RecNum = RecNum;
+ TPageIter::Current()->Format = MBDB_FORMAT_RAW;
+ memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen);
+ RecNum = 0;
+ }
+ int ret = !TPageIter::Current() && RecNum;
+ Clear();
+ return ret;
+ }
+
+ int GotoPage(int pageno) {
+ if (TPageIter::Current()) {
+ TPageIter::Current()->RecNum = RecNum;
+ TPageIter::Current()->Format = MBDB_FORMAT_RAW;
+ memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen);
+ }
+ int ret = TPageIter::GotoPage(pageno);
+ if (!ret) {
+ RecNum = 0;
+ CurLen = sizeof(TDatPage);
+ }
+ return ret;
+ }
+
+ TVal* Rec;
+ int RecNum;
+ size_t CurLen;
+ size_t LenForOffset;
+};
+
+template <class TVal, typename TBasePageIter, typename TBaseIndexer, typename TAlgorithm>
+class TOutputRecordIterator
+ : public TBasePageIter,
+ public TBaseIndexer,
+ private TAlgorithm {
+ class TPageBuffer {
+ public:
+ void Init(size_t page) {
+ Pos = 0;
+ RecNum = 0;
+ Size = Min(page / 2, size_t(64 << 10));
+ Data.Reset(new ui8[Size]);
+ }
+
+ void Clear() {
+ Pos = 0;
+ RecNum = 0;
+ }
+
+ inline bool Empty() const {
+ return RecNum == 0;
+ }
+
+ public:
+ size_t Size;
+ size_t Pos;
+ int RecNum;
+ TArrayHolder<ui8> Data;
+ };
+
+public:
+ typedef TBasePageIter TPageIter;
+ typedef TBaseIndexer TIndexer;
+
+ TOutputRecordIterator()
+ : Rec(nullptr)
+ , RecNum(0)
+ {
+ }
+
+ ~TOutputRecordIterator() {
+ Term();
+ }
+
+ const TVal* Current() const {
+ return Rec;
+ }
+
+ const TVal* Push(const TVal* v, const typename TExtInfoType<TVal>::TResult* extInfo = nullptr) {
+ NMicroBDB::AssertValid(v);
+ size_t len = SizeOf(v);
+ if (!TExtInfoType<TVal>::Exists)
+ return (Reserve(len)) ? (TVal*)memcpy((TVal*)Rec, v, len) : nullptr;
+ else if (extInfo) {
+ size_t extSize = extInfo->ByteSize();
+ size_t extLenSize = len_long((i64)extSize);
+ if (!Reserve(len + extLenSize + extSize))
+ return nullptr;
+ memcpy(Rec, v, len);
+ out_long((i64)extSize, (char*)Rec + len);
+ extInfo->SerializeWithCachedSizesToArray((ui8*)Rec + len + extLenSize);
+ return Rec;
+ } else {
+ size_t extLenSize = len_long((i64)0);
+ if (!Reserve(len + extLenSize))
+ return nullptr;
+ memcpy(Rec, v, len);
+ out_long((i64)0, (char*)Rec + len);
+ return Rec;
+ }
+ }
+
+ const TVal* Push(const TVal* v, const ui8* extInfoRaw, size_t extLen) {
+ NMicroBDB::AssertValid(v);
+ size_t sz = SizeOf(v);
+ if (!Reserve(sz + extLen))
+ return NULL;
+ memcpy(Rec, v, sz);
+ memcpy((ui8*)Rec + sz, extInfoRaw, extLen);
+ return Rec;
+ }
+
+ // Use only with values stored in the internal buffer of microbdb readers/writers.
+ // The method expects the serialized extInfo right after this record.
+ const TVal* PushWithExtInfo(const TVal* v) {
+ NMicroBDB::AssertValid(v);
+ size_t extSize;
+ size_t extLenSize;
+ size_t sz = NMicroBDB::SizeOfExt(v, &extLenSize, &extSize);
+ sz += extLenSize + extSize;
+ if (!Reserve(sz))
+ return nullptr;
+ memcpy(Rec, v, sz);
+ return Rec;
+ }
+
+ TVal* Reserve(const size_t len) {
+ const size_t aligned = DatCeil(len);
+
+ if (!TPageIter::Current()) { // Allocate the first page
+ if (!TPageIter::Next()) {
+ CurLen = TPageIter::GetPageSize();
+ return Rec = nullptr;
+ }
+ CurLen = sizeof(TDatPage) + sizeof(TCompressedPage);
+ }
+
+ if (Buffer.Pos + aligned > Buffer.Size) {
+ if (Buffer.Pos == 0)
+ return Rec = nullptr;
+ if (FlushBuffer())
+ return Rec = nullptr;
+ if (Buffer.Pos + aligned + sizeof(TDatPage) + sizeof(TCompressedPage) > Buffer.Size)
+ return Rec = nullptr;
+ }
+
+ Rec = (TVal*)((char*)Buffer.Data.Get() + Buffer.Pos);
+ DatSet(Rec, len); // len is correct because DatSet zeroes the alignment tail
+
+ Buffer.RecNum++;
+ Buffer.Pos += aligned;
+ ++RecNum;
+ return Rec;
+ }
+
+ void Flush() {
+ if (!Buffer.Empty()) {
+ FlushBuffer();
+ TPageIter::Current()->RecNum = RecNum;
+ TPageIter::Current()->Format = MBDB_FORMAT_COMPRESSED;
+ }
+ }
+
+ size_t Offset() const {
+ // According to vadya@, it is harmless to always return 0 here
+ return 0;
+ }
+
+ void ResetDat() {
+ Buffer.Pos = (char*)Rec - (char*)Buffer.Data.Get();
+ size_t len = SizeOf(Rec);
+ Buffer.Pos += DatCeil(len);
+ }
+
+protected:
+ void Clear() {
+ RecNum = 0;
+ Rec = nullptr;
+ Count = 0;
+ CurLen = sizeof(TDatPage) + sizeof(TCompressedPage);
+ Buffer.Clear();
+ }
+
+ int Init() {
+ Clear();
+ Buffer.Init(TPageIter::GetPageSize());
+ TAlgorithm::Init();
+ return 0;
+ }
+
+ int Term() {
+ if (TPageIter::Current())
+ Commit();
+ int ret = !TPageIter::Current() && RecNum;
+ Clear();
+ TAlgorithm::Term();
+ return ret;
+ }
+
+ int GotoPage(int pageno) {
+ if (TPageIter::Current())
+ Commit();
+ int ret = TPageIter::GotoPage(pageno);
+ if (!ret)
+ Reset();
+ return ret;
+ }
+
+private:
+ void Commit() {
+ Flush();
+ TPageIter::Current()->RecNum = RecNum;
+ TPageIter::Current()->Format = MBDB_FORMAT_COMPRESSED;
+ SetCompressedPageHeader();
+
+ memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen);
+ RecNum = 0;
+ Count = 0;
+ }
+
+ inline void SetCompressedPageHeader() {
+ TCompressedPage* const hdr = (TCompressedPage*)((ui8*)TPageIter::Current() + sizeof(TDatPage));
+
+ hdr->BlockCount = Count;
+ hdr->Algorithm = TAlgorithm::Code;
+ hdr->Version = 0;
+ hdr->Reserved = 0;
+ }
+
+ inline void Reset() {
+ RecNum = 0;
+ CurLen = sizeof(TDatPage) + sizeof(TCompressedPage);
+ Count = 0;
+ Buffer.Clear();
+ }
+
+ int FlushBuffer() {
+ TArrayHolder<ui8> data;
+ const ui8* const buf = Buffer.Data.Get();
+ size_t first = 0;
+
+ if (!TExtInfoType<TVal>::Exists)
+ first = DatCeil(SizeOf((TVal*)buf));
+ else {
+ size_t ll;
+ size_t l;
+ first = NMicroBDB::SizeOfExt((const TVal*)buf, &ll, &l);
+ first = DatCeil(first + ll + l);
+ }
+
+ size_t total = sizeof(NMicroBDB::TCompressedHeader) + first + ((Buffer.RecNum == 1) ? 0 : TAlgorithm::CompressBound(Buffer.Pos - first));
+ size_t real = total;
+
+ {
+ ui8* p = nullptr;
+ NMicroBDB::TCompressedHeader* hdr = nullptr;
+
+ // 1. Choose data destination (temporary buffer or dat-page)
+ if (CurLen + total > TPageIter::GetPageSize()) {
+ data.Reset(new ui8[total]);
+
+ hdr = (NMicroBDB::TCompressedHeader*)data.Get();
+ p = data.Get() + sizeof(NMicroBDB::TCompressedHeader);
+ } else {
+ p = (ui8*)TPageIter::Current() + CurLen;
+ hdr = (NMicroBDB::TCompressedHeader*)p;
+ p += sizeof(NMicroBDB::TCompressedHeader);
+ }
+
+ // 2. Compress data
+
+ // Fill header and first record
+ hdr->Original = Buffer.Pos;
+ hdr->Compressed = 0;
+ hdr->Count = Buffer.RecNum;
+ hdr->Reserved = 0;
+ memcpy(p, Buffer.Data.Get(), first);
+ // Fill compressed part
+ if (Buffer.RecNum > 1) {
+ size_t size = TAlgorithm::CompressBound(Buffer.Pos - first);
+
+ p += first;
+ TAlgorithm::Compress(p, size, buf + first, Buffer.Pos - first);
+
+ hdr->Compressed = size;
+
+ real = sizeof(NMicroBDB::TCompressedHeader) + first + size;
+ }
+ }
+
+ Y_ASSERT(sizeof(TDatPage) + sizeof(TCompressedPage) + real <= TPageIter::GetPageSize());
+
+ // 3. Check page capacity
+
+ if (CurLen + real > TPageIter::GetPageSize()) {
+ Y_ASSERT(data.Get() != nullptr);
+
+ if (TPageIter::Current() && RecNum) {
+ RecNum = RecNum - Buffer.RecNum;
+ TPageIter::Current()->RecNum = RecNum;
+ TPageIter::Current()->Format = MBDB_FORMAT_COMPRESSED;
+ SetCompressedPageHeader();
+ memset((char*)TPageIter::Current() + CurLen, 0, TPageIter::GetPageSize() - CurLen);
+ TIndexer::NextPage(TPageIter::Current());
+ RecNum = Buffer.RecNum;
+ Count = 0;
+ }
+ if (!TPageIter::Next()) {
+ CurLen = TPageIter::GetPageSize();
+ return MBDB_NO_MEMORY;
+ }
+ CurLen = sizeof(TDatPage) + sizeof(TCompressedPage);
+ }
+
+ // 4. Flush data and reset buffer state
+
+ if (data.Get())
+ memcpy((ui8*)TPageIter::Current() + CurLen, data.Get(), real);
+ CurLen += real;
+ ++Count;
+ Buffer.Clear();
+ return 0;
+ }
+
+private:
+ size_t CurLen;
+ TPageBuffer Buffer;
+ TVal* Rec;
+ ui32 Count; //!< count of compressed blocks on the page
+public:
+ int RecNum;
+};
+
+template <typename TBaseWriter>
+class TOutputPageIterator: public TBaseWriter {
+public:
+ typedef TBaseWriter TWriter;
+
+ TOutputPageIterator()
+ : Buf(nullptr)
+ {
+ Clear();
+ }
+
+ ~TOutputPageIterator() {
+ Term();
+ }
+
+ TDatPage* Current() {
+ return CurPage;
+ }
+
+ size_t Offset() const {
+ //Cout << "PS = " << TWriter::GetPageSize() << "; PN = " << PageNum << "; MS = " << METASIZE << Endl;
+ return TWriter::GetPageSize() * PageNum + METASIZE;
+ }
+
+ int Freeze() {
+ return (Frozen = (PageNum == -1) ? 0 : (int)PageNum);
+ }
+
+ void Unfreeze() {
+ Frozen = -1;
+ }
+
+ inline int IsFrozen() const {
+ return Frozen + 1;
+ }
+
+ inline size_t GetPageSize() const {
+ return TWriter::GetPageSize();
+ }
+
+ inline int GetPageNum() const {
+ return (int)PageNum;
+ }
+
+ TDatPage* Next() {
+ if (PageNum >= Maxpage && WriteBuf())
+ return CurPage = nullptr;
+ CurPage = (TDatPage*)(Buf + ((++PageNum) % Bufpages) * GetPageSize());
+ memset(CurPage, 0, sizeof(TDatPage));
+ return CurPage;
+ }
+
+protected:
+ int Init(size_t pages, int pagesOrBytes) {
+ Term();
+ if (pagesOrBytes)
+ Bufpages = pages;
+ else
+ Bufpages = pages / GetPageSize();
+ Bufpages = Max<size_t>(1, Bufpages);
+ Maxpage = Bufpages - 1;
+ // if (!(Buf = (char*)malloc(Bufpages * GetPageSize())))
+ // return ENOMEM;
+ ABuf.Alloc(Bufpages * GetPageSize());
+ Buf = ABuf.Begin();
+ if (TWriter::Memo)
+ Freeze();
+ return 0;
+ }
+
+ int Term() {
+ Unfreeze();
+ int ret = (PageNum < 0) ? 0 : WriteBuf();
+ Clear();
+ return ret;
+ }
+
+ int GotoPage(int pageno) {
+ int ret = EAGAIN;
+ if (IsFrozen() || PageNum >= 0 && ((ret = WriteBuf())) || ((ret = TWriter::GotoPage(pageno))))
+ return ret;
+ PageNum = pageno;
+ Maxpage = Bufpages - 1 + pageno;
+ CurPage = (TDatPage*)(Buf + (PageNum % Bufpages) * GetPageSize());
+ memset(CurPage, 0, sizeof(TDatPage));
+ return 0;
+ }
+
+ void Clear() {
+ ABuf.Dealloc();
+ Buf = nullptr;
+ Maxpage = PageNum = Frozen = -1;
+ Bufpages = 0;
+ CurPage = nullptr;
+ }
+
+ int WriteBuf() {
+ int nvec;
+ iovec vec[2];
+ ssize_t minpage = Maxpage - Bufpages + 1;
+ ssize_t maxpage = Frozen == -1 ? PageNum : Frozen - 1;
+ if (maxpage < minpage)
+ return EAGAIN;
+ minpage %= Bufpages;
+ maxpage %= Bufpages;
+ if (maxpage < minpage) {
+ vec[0].iov_base = Buf + GetPageSize() * minpage;
+ vec[0].iov_len = GetPageSize() * (Bufpages - minpage);
+ vec[1].iov_base = Buf;
+ vec[1].iov_len = GetPageSize() * (maxpage + 1);
+ nvec = 2;
+ } else {
+ vec[0].iov_base = Buf + GetPageSize() * minpage;
+ vec[0].iov_len = GetPageSize() * (maxpage - minpage + 1);
+ nvec = 1;
+ }
+ if (TWriter::WritePages(vec, nvec))
+ return EIO;
+ Maxpage += (maxpage < minpage) ? (Bufpages - minpage + maxpage + 1) : (maxpage - minpage + 1);
+ return 0;
+ }
+
+ ssize_t Maxpage;
+ ssize_t Bufpages;
+ ssize_t PageNum;
+ int Frozen;
+ TDatPage* CurPage;
+ char* Buf;
+ TMappedAllocation ABuf;
+};
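+
+// TOutputPageIterator keeps a ring of Bufpages in-memory pages (ABuf) and
+// flushes completed pages through WriteBuf(), which hands the writer at most
+// two iovecs when the ring wraps around.  Freeze() pins pages from the page
+// current at the time of the call onwards so they stay in memory until
+// Unfreeze(); Init() freezes immediately when the underlying writer runs in
+// memo mode (TWriter::Memo), which is how the memory-only "memo" files used by
+// the sorters are kept entirely in RAM.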
+
+template <class TFileManip>
+class TOutputPageFileImpl: private TNonCopyable {
+public:
+ TOutputPageFileImpl()
+ : Pagesize(0)
+ , Eof(1)
+ , Error(0)
+ , Memo(0)
+ , Recordsig(0)
+ {
+ }
+
+ ~TOutputPageFileImpl() {
+ Term();
+ }
+
+ inline int IsEof() const {
+ return Eof;
+ }
+
+ inline int GetError() const {
+ return Error;
+ }
+
+ inline bool IsOpen() const {
+ return FileManip.IsOpen();
+ }
+
+ inline size_t GetPageSize() const {
+ return Pagesize;
+ }
+
+ inline ui32 GetRecordSig() const {
+ return Recordsig;
+ }
+
+ int Init(const char* fname, size_t pagesize, ui32 recsig, bool direct = false) {
+ Memo = 0;
+ if (FileManip.IsOpen())
+ return MBDB_ALREADY_INITIALIZED;
+
+ if (!fname) {
+ Eof = Error = 0;
+ Pagesize = pagesize;
+ Recordsig = recsig;
+ Memo = 1;
+ return 0;
+ }
+
+ Error = FileManip.Open(fname, WrOnly | CreateAlways | ARW | AWOther | (direct ? DirectAligned : EOpenMode()));
+ if (Error)
+ return Error;
+ Error = Init(TFile(), pagesize, recsig);
+ if (Error) {
+ FileManip.Close();
+ unlink(fname);
+ }
+ return Error;
+ }
+
+ int Init(TAutoPtr<IOutputStream> output, size_t pagesize, ui32 recsig) {
+ Memo = 0;
+ if (FileManip.IsOpen()) {
+ return MBDB_ALREADY_INITIALIZED;
+ }
+
+ if (!output) {
+ Eof = Error = 0;
+ Pagesize = pagesize;
+ Recordsig = recsig;
+ Memo = 1;
+ return 0;
+ }
+
+ Error = FileManip.Open(output);
+ if (Error)
+ return Error;
+ Error = Init(TFile(), pagesize, recsig);
+ if (Error) {
+ FileManip.Close();
+ }
+ return Error;
+ }
+
+ int Init(const TFile& file, size_t pagesize, ui32 recsig) {
+ Memo = 0;
+ if (!file.IsOpen() && !FileManip.IsOpen())
+ return MBDB_NOT_INITIALIZED;
+ if (file.IsOpen() && FileManip.IsOpen())
+ return MBDB_ALREADY_INITIALIZED;
+ if (file.IsOpen()) {
+ Error = FileManip.Init(file);
+ if (Error)
+ return Error;
+ }
+
+ Eof = 1;
+ TTempBuf buf(METASIZE + FS_BLOCK_SIZE);
+ const char* ptr = (buf.Data() + FS_BLOCK_SIZE - ((ui64)buf.Data() & (FS_BLOCK_SIZE - 1)));
+ TDatMetaPage* meta = (TDatMetaPage*)ptr;
+
+ memset(buf.Data(), 0, buf.Size());
+ meta->MetaSig = METASIG;
+ meta->PageSize = Pagesize = pagesize;
+ meta->RecordSig = Recordsig = recsig;
+
+ ssize_t size = METASIZE, ret = 0;
+ while (size && (ret = FileManip.Write(ptr, (unsigned)size)) > 0) {
+ size -= ret;
+ ptr += ret;
+ }
+ if (size || ret <= 0) {
+ Term();
+ return Error = errno ? errno : MBDB_WRITE_ERROR;
+ }
+
+ Error = Eof = 0;
+ return Error;
+ }
+
+protected:
+ int WritePages(iovec* vec, int nvec) {
+ if (Error || Memo)
+ return Error;
+
+ ssize_t size, delta;
+ iovec* pvec;
+ int vsize;
+
+ for (vsize = 0, pvec = vec; vsize < nvec; vsize++, pvec++)
+ for (size = 0; (size_t)size < pvec->iov_len; size += Pagesize)
+ ((TDatPage*)((char*)pvec->iov_base + size))->PageSig = PAGESIG;
+
+ delta = size = 0;
+ pvec = vec;
+ vsize = nvec;
+ while (vsize && (size = Writev(FileManip, pvec, (int)Min(vsize, 16))) > 0) {
+ if (delta) {
+ size += delta;
+ pvec->iov_len += delta;
+ pvec->iov_base = (char*)pvec->iov_base - delta;
+ delta = 0;
+ }
+ while (size) {
+ if ((size_t)size >= pvec->iov_len) {
+ size -= pvec->iov_len;
+ ++pvec;
+ --vsize;
+ } else {
+ delta = size;
+ pvec->iov_len -= size;
+ pvec->iov_base = (char*)pvec->iov_base + size;
+ size = 0;
+ }
+ }
+ }
+ if (delta) {
+ pvec->iov_len += delta;
+ pvec->iov_base = (char*)pvec->iov_base - delta;
+ }
+ return Error = (!size && !vsize) ? 0 : errno ? errno : MBDB_WRITE_ERROR;
+ }
+
+ i64 Tell() {
+ return FileManip.RealSeek(0, SEEK_CUR);
+ }
+
+ int GotoPage(int pageno) {
+ if (Error || Memo)
+ return Error;
+ Eof = 0;
+ i64 offset = (i64)pageno * Pagesize + METASIZE;
+ if (offset != FileManip.Seek(offset, SEEK_SET))
+ Error = MBDB_BAD_FILE_SIZE;
+ return Error;
+ }
+
+ int Term() {
+ int ret = FileManip.Close();
+ Eof = 1;
+ Memo = 0;
+ if (!Error)
+ Error = ret;
+ return Error;
+ }
+
+ size_t Pagesize;
+ int Eof;
+ int Error;
+ int Memo;
+ ui32 Recordsig;
+
+private:
+ TFileManip FileManip;
+};
+
+using TOutputPageFile = TOutputPageFileImpl<TOutputFileManip>;
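+
+// On-disk layout written by TOutputPageFile: a METASIZE-byte meta page
+// (TDatMetaPage carrying METASIG, the page size and the record signature)
+// followed by fixed-size dat-pages, each stamped with PAGESIG in WritePages().
+// Initializing with a null file name switches the writer into memo mode, in
+// which nothing is ever written to disk.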
+
+template <class TVal,
+ typename TBaseRecIter = TOutputRecordIterator<TVal, TOutputPageIterator<TOutputPageFile>>>
+class TOutDatFileImpl: public TBaseRecIter {
+public:
+ typedef TBaseRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+ typedef typename TRecIter::TPageIter::TWriter TWriter;
+
+ int Open(const char* fname, size_t pagesize, size_t pages = 1, int pagesOrBytes = 1, bool direct = false) {
+ int ret = TWriter::Init(fname, pagesize, TVal::RecordSig, direct);
+ return ret ? ret : Open2(pages, pagesOrBytes);
+ }
+
+ int Open(const TFile& file, size_t pagesize, size_t pages = 1, int pagesOrBytes = 1) {
+ int ret = TWriter::Init(file, pagesize, TVal::RecordSig);
+ return ret ? ret : Open2(pages, pagesOrBytes);
+ }
+
+ int Open(TAutoPtr<IOutputStream> output, size_t pagesize, size_t pages = 1, int pagesOrBytes = 1) {
+ int ret = TWriter::Init(output, pagesize, TVal::RecordSig);
+ return ret ? ret : Open2(pages, pagesOrBytes);
+ }
+
+ int Close() {
+ int ret1 = TRecIter::Term();
+ int ret2 = TPageIter::Term();
+ int ret3 = TWriter::Term();
+ return ret1 ? ret1 : ret2 ? ret2 : ret3;
+ }
+
+private:
+ int Open2(size_t pages, int pagesOrBytes) {
+ int ret = TPageIter::Init(pages, pagesOrBytes);
+ if (!ret)
+ ret = TRecIter::Init();
+ if (ret)
+ Close();
+ return ret;
+ }
+};
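+
+// A minimal write-side sketch for TOutDatFileImpl.  TMyRec is a placeholder
+// record type assumed to define RecordSig and to be understood by SizeOf();
+// Push() comes from the record-iterator base.
+//
+//     TOutDatFileImpl<TMyRec> out;
+//     if (int err = out.Open("records.dat", /*pagesize=*/4096, /*pages=*/16))
+//         return err;               // nonzero MBDB_* error code
+//     TMyRec rec;                   // fill fields as needed
+//     out.Push(&rec);
+//     out.Close();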
+
+template <class TVal>
+class TOutIndexFile: public TOutDatFileImpl<
+ TVal,
+ TOutputRecordIterator<TVal, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer, TFakeCompression>> {
+ typedef TOutDatFileImpl<
+ TVal,
+ TOutputRecordIterator<TVal, TOutputPageIterator<TOutputPageFile>, TCallbackIndexer, TFakeCompression>>
+ TDatFile;
+ typedef TOutIndexFile<TVal> TMyType;
+ typedef typename TDatFile::TRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+ typedef typename TRecIter::TIndexer TIndexer;
+
+public:
+ TOutIndexFile() {
+ TIndexer::SetCallback(this, DispatchCallback);
+ }
+
+ int Open(const char* fname, size_t pagesize, size_t pages, int pagesOrBytes = 1) {
+ int ret = TDatFile::Open(fname, pagesize, pages, pagesOrBytes);
+ if (ret)
+ return ret;
+ if ((ret = TRecIter::GotoPage(1))) {
+ TDatFile::Close();
+ return ret;
+ }
+ Index0.Clear();
+ return ret;
+ }
+
+ int Close() {
+ TPageIter::Unfreeze();
+ if (TRecIter::RecNum) {
+ TRecIter::Flush();
+ NextPage(TPageIter::Current());
+ }
+ int ret = 0;
+ if (Index0.Size() && !(ret = TRecIter::GotoPage(0))) {
+ const char* ptr = Index0.Begin();
+ size_t recSize;
+ while (ptr < Index0.End()) {
+ Y_ASSERT((size_t)(Index0.End() - ptr) >= sizeof(size_t));
+ memcpy(&recSize, ptr, sizeof(size_t));
+ ptr += sizeof(size_t);
+ Y_ASSERT((size_t)(Index0.End() - ptr) >= recSize);
+ ui8* buf = (ui8*)TRecIter::Reserve(recSize);
+ if (!buf) {
+ ret = MBDB_PAGE_OVERFLOW;
+ break;
+ }
+ memcpy(buf, ptr, recSize);
+ TRecIter::ResetDat();
+ ptr += recSize;
+ }
+ Index0.Clear();
+ ret = (TPageIter::GetPageNum() != 0) ? MBDB_PAGE_OVERFLOW : TPageIter::GetError();
+ }
+ int ret1 = TDatFile::Close();
+ return ret ? ret : ret1;
+ }
+
+protected:
+ TBuffer Index0;
+
+ void NextPage(const TDatPage* page) {
+ const TVal* first = (const TVal*)NMicroBDB::GetFirstRecord(page);
+ size_t sz;
+ if (!TExtInfoType<TVal>::Exists) {
+ sz = SizeOf(first);
+ } else {
+ size_t ll;
+ size_t l;
+ sz = NMicroBDB::SizeOfExt(first, &ll, &l);
+ sz += ll + l;
+ }
+ Index0.Append((const char*)&sz, sizeof(size_t));
+ Index0.Append((const char*)first, sz);
+ }
+
+ static void DispatchCallback(void* This, const TDatPage* page) {
+ ((TMyType*)This)->NextPage(page);
+ }
+};
+
+template <class TVal, class TKey, typename TCompressor = TFakeCompression, class TPageFile = TOutputPageFile>
+class TOutDirectFileImpl: public TOutDatFileImpl<
+ TVal,
+ TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer, TCompressor>> {
+ typedef TOutDatFileImpl<
+ TVal,
+ TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TCallbackIndexer, TCompressor>>
+ TDatFile;
+ typedef TOutDirectFileImpl<TVal, TKey, TCompressor, TPageFile> TMyType;
+ typedef typename TDatFile::TRecIter TRecIter;
+ typedef typename TRecIter::TPageIter TPageIter;
+ typedef typename TRecIter::TIndexer TIndexer;
+ typedef TOutIndexFile<TKey> TKeyFile;
+
+public:
+ TOutDirectFileImpl() {
+ TIndexer::SetCallback(this, DispatchCallback);
+ }
+
+ int Open(const char* fname, size_t pagesize, int pages = 1, size_t ipagesize = 0, size_t ipages = 1, int pagesOrBytes = 1) {
+ char iname[FILENAME_MAX];
+ int ret;
+ if (ipagesize == 0)
+ ipagesize = pagesize;
+ ret = TDatFile::Open(fname, pagesize, pages, pagesOrBytes);
+ ret = ret ? ret : DatNameToIdx(iname, fname);
+ ret = ret ? ret : KeyFile.Open(iname, ipagesize, ipages, pagesOrBytes);
+ if (ret)
+ TDatFile::Close();
+ return ret;
+ }
+
+ int Close() {
+ if (TRecIter::RecNum) {
+ TRecIter::Flush();
+ NextPage(TPageIter::Current());
+ }
+ int ret = KeyFile.Close();
+ int ret1 = TDatFile::Close();
+ return ret1 ? ret1 : ret;
+ }
+
+ int GetError() const {
+ return TDatFile::GetError() ? TDatFile::GetError() : KeyFile.GetError();
+ }
+
+protected:
+ TKeyFile KeyFile;
+
+ void NextPage(const TDatPage* page) {
+ typedef TMakeExtKey<TVal, TKey> TMakeExtKey;
+
+ TVal* val = (TVal*)NMicroBDB::GetFirstRecord(page);
+ TKey key;
+ if (!TMakeExtKey::Exists) {
+ TMakeExtKey::Make(&key, nullptr, val, nullptr);
+ KeyFile.Push(&key);
+ } else {
+ size_t ll;
+ size_t l;
+ size_t sz = NMicroBDB::SizeOfExt(val, &ll, &l);
+ typename TExtInfoType<TVal>::TResult valExt;
+ if (TExtInfoType<TVal>::Exists)
+ Y_PROTOBUF_SUPPRESS_NODISCARD valExt.ParseFromArray((ui8*)val + sz + ll, l);
+ typename TExtInfoType<TKey>::TResult keyExt;
+ TMakeExtKey::Make(&key, &keyExt, val, &valExt);
+ KeyFile.Push(&key, &keyExt);
+ }
+ }
+
+ static void DispatchCallback(void* This, const TDatPage* page) {
+ ((TMyType*)This)->NextPage(page);
+ }
+};
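+
+// TOutDirectFileImpl writes the data file plus a companion index whose name is
+// derived with DatNameToIdx().  Each time a data page is completed, the indexer
+// callback builds one TKey record (via TMakeExtKey) from the first record of
+// that page and pushes it into the TOutIndexFile, so the index holds exactly
+// one key per data page.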
diff --git a/library/cpp/microbdb/powersorter.h b/library/cpp/microbdb/powersorter.h
new file mode 100644
index 0000000000..c40de9c23f
--- /dev/null
+++ b/library/cpp/microbdb/powersorter.h
@@ -0,0 +1,667 @@
+#pragma once
+
+#include "safeopen.h"
+
+#include <util/generic/vector.h>
+#include <util/generic/deque.h>
+#include <util/system/mutex.h>
+#include <util/system/condvar.h>
+#include <util/thread/pool.h>
+
+template <
+ class TRecord,
+ template <typename T> class TCompare,
+ class TSieve,
+ class TMemoFile = TOutDatFile<TRecord>>
+class TDatSorterBuf {
+public:
+ typedef TRecord TRec;
+ typedef TVector<TRec*> TVectorType;
+ typedef TMemoFile TMemo;
+ typedef TCompare<TRecord> TComp;
+
+public:
+ TDatSorterBuf(size_t memory, size_t pageSize)
+ : Memo("memo", pageSize, memory, 0)
+ , Cur()
+ {
+ Memo.Open(nullptr);
+ Memo.Freeze();
+ }
+
+ ~TDatSorterBuf() {
+ Vector.clear();
+ Memo.Close();
+ }
+
+ const TRec* Push(const TRec* v) {
+ const TRec* u = Memo.Push(v);
+ if (u)
+ Vector.push_back((TRec*)u);
+ return u;
+ }
+
+ const TRec* Next() {
+ if (Ptr == Vector.end()) {
+ if (Cur)
+ TSieve::Sieve(Cur, Cur);
+ Cur = nullptr;
+ } else {
+ Cur = *Ptr++;
+ if (!TIsSieveFake<TSieve>::Result)
+ while (Ptr != Vector.end() && TSieve::Sieve(Cur, *Ptr))
+ ++Ptr;
+ }
+ return Cur;
+ }
+
+ const TRec* Current() {
+ return Cur;
+ }
+
+ size_t Size() {
+ return Vector.size();
+ }
+
+ void Sort() {
+ Ptr = Vector.begin();
+ Cur = nullptr;
+
+ MBDB_SORT_FUN(Vector.begin(), Vector.end(), TComp());
+ }
+
+ void Clear() {
+ Vector.clear();
+ Memo.Freeze();
+ Ptr = Vector.begin();
+ Cur = nullptr;
+ }
+
+private:
+ TVectorType Vector;
+ TMemo Memo;
+
+    typename TVectorType::iterator Ptr;
+ TRec* Cur;
+};
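+
+// TDatSorterBuf is the in-memory stage of the power sorter: records are copied
+// into a frozen, memory-only TOutDatFile ("memo"), pointers to them are kept in
+// Vector, Sort() orders those pointers with TCompare, and Next() folds equal
+// neighbours together through TSieve when a real sieve is supplied.  Push()
+// returning nullptr means the memo is full and the caller has to spill the
+// portion (see TPowerSorter::NextPortion below).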
+
+template <
+ class TRecord,
+ class TInput,
+ template <typename T> class TCompare,
+ class TSieve>
+class TDatMerger {
+public:
+ typedef TRecord TRec;
+ typedef TCompare<TRecord> TComp;
+ typedef TSimpleSharedPtr<TInput> TInputPtr;
+ typedef TVector<TInputPtr> TInputVector;
+
+public:
+ ~TDatMerger() {
+ Close();
+ }
+
+ void Init(const TInputVector& inputs) {
+ Inputs = inputs;
+ TVector<TInput*> v;
+ for (int i = 0; i < Inputs.ysize(); ++i)
+ v.push_back(Inputs[i].Get());
+ HeapIter.Init(&v[0], v.size());
+ if (!TIsSieveFake<TSieve>::Result)
+ PNext = HeapIter.Next();
+ }
+
+ const TRec* Next() {
+ if (TIsSieveFake<TSieve>::Result) {
+ return HeapIter.Next();
+ }
+
+ if (!PNext) {
+ if (PCur) {
+ TSieve::Sieve(PCur, PCur);
+ PCur = nullptr;
+ }
+ return nullptr;
+ }
+
+ PCur = &Cur;
+ memcpy(PCur, PNext, SizeOf((const TRec*)PNext));
+
+ do {
+ PNext = HeapIter.Next();
+ } while (PNext && TSieve::Sieve(PCur, PNext));
+
+ return PCur;
+ }
+
+ const TRec* Current() {
+ return (TIsSieveFake<TSieve>::Result ? HeapIter.Current() : PCur);
+ }
+
+ void Close() {
+ Inputs.clear();
+ HeapIter.Term();
+ }
+
+private:
+ TInputVector Inputs;
+ THeapIter<TRec, TInput, TComp> HeapIter;
+ TRec Cur;
+ TRec* PCur = nullptr;
+ const TRec* PNext = nullptr;
+};
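+
+// TDatMerger is a k-way merge over already-sorted inputs driven by THeapIter.
+// With a real sieve the current record is copied into Cur so that subsequent
+// heap pops can be folded into it before the record is handed out; with
+// TFakeSieve the heap iterator's records are returned directly.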
+
+class TPortionManager {
+public:
+ void Open(const char* tempDir) {
+ TGuard<TMutex> guard(Mutex);
+ TempDir = tempDir;
+ }
+
+ TString Next() {
+ TGuard<TMutex> guard(Mutex);
+ if (Portions == 0)
+ DoOpen();
+ TString fname = GeneratePortionFilename(Portions++);
+ return fname;
+ }
+
+ void Close() {
+ TGuard<TMutex> guard(Mutex);
+ Portions = 0;
+ }
+
+private:
+ void DoOpen() {
+ if (MakeSorterTempl(PortionFilenameTempl, TempDir.data())) {
+ PortionFilenameTempl[0] = 0;
+ ythrow yexception() << "portion-manager: bad tempdir \"" << TempDir.data() << "\": " << LastSystemErrorText();
+ }
+ }
+
+ TString GeneratePortionFilename(int i) {
+ char str[FILENAME_MAX];
+ snprintf(str, sizeof(str), PortionFilenameTempl, i);
+ return TString(str);
+ }
+
+private:
+ TMutex Mutex;
+
+ TString TempDir;
+ char PortionFilenameTempl[FILENAME_MAX] = {};
+ int Portions = 0;
+};
+
+// A merger powered by threads
+template <
+ class TRecord,
+ template <typename T> class TCompare,
+ class TSieve,
+ class TInput = TInDatFile<TRecord>,
+ class TOutput = TOutDatFile<TRecord>>
+class TPowerMerger {
+public:
+ typedef TRecord TRec;
+ typedef TDatMerger<TRecord, TInput, TCompare, TSieve> TMerger;
+ typedef TSimpleSharedPtr<TMerger> TMergerPtr;
+ typedef TPowerMerger<TRecord, TCompare, TSieve, TInput, TOutput> TFileMerger;
+
+ struct TMergePortionTask: public IObjectInQueue {
+ TFileMerger* FileMerger;
+ int Begin;
+ int End;
+ TString OutFname;
+
+ TMergePortionTask(TFileMerger* fileMerger, int begin, int end, const TString& outFname)
+ : FileMerger(fileMerger)
+ , Begin(begin)
+ , End(end)
+ , OutFname(outFname)
+ {
+ }
+
+ void Process(void*) override {
+ THolder<TMergePortionTask> This(this);
+ //fprintf(stderr, "MergePortion: (%i, %i, %s)\n", Begin, End, ~OutFname);
+ FileMerger->MergePortion(Begin, End, OutFname);
+ }
+ };
+
+public:
+ TPowerMerger(const TSimpleSharedPtr<TThreadPool>& mtpQueue, const TSimpleSharedPtr<TPortionManager>& portMan,
+ int memory, int pageSize, bool autoUnlink)
+ : MtpQueue(mtpQueue)
+ , PortionManager(portMan)
+ , Memory(memory)
+ , PageSize(pageSize)
+ , AutoUnlink(autoUnlink)
+ {
+ }
+
+ TPowerMerger(const TSimpleSharedPtr<TThreadPool>& mtpQueue, const char* tempDir,
+ int memory, int pageSize, bool autoUnlink)
+ : MtpQueue(mtpQueue)
+ , PortionManager(new TPortionManager)
+ , Memory(memory)
+ , PageSize(pageSize)
+ , AutoUnlink(autoUnlink)
+ {
+ PortionManager->Open(tempDir);
+ }
+
+ ~TPowerMerger() {
+ Close();
+ }
+
+ void SetMtpQueue(const TSimpleSharedPtr<TThreadPool>& mtpQueue) {
+ MtpQueue = mtpQueue;
+ }
+
+ void MergePortion(int begin, int end, const TString& outFname) {
+ TMerger merger;
+ InitMerger(merger, begin, end);
+
+ TOutput out("mergeportion-tmpout", PageSize, BufSize, 0);
+ out.Open(outFname.data());
+ const TRec* rec;
+ while ((rec = merger.Next()))
+ out.Push(rec);
+ out.Close();
+
+ merger.Close();
+
+ {
+ TGuard<TMutex> guard(Mutex);
+ UnlinkFiles(begin, end);
+ Files.push_back(outFname);
+ --Tasks;
+ TaskFinishedCond.Signal();
+ }
+ }
+
+ void Add(const TString& fname) {
+ TGuard<TMutex> guard(Mutex);
+ // fprintf(stderr, "TPowerMerger::Add: %s\n", ~fname);
+ Files.push_back(fname);
+ if (InitialFilesEnd > 0)
+ ythrow yexception() << "TPowerMerger::Add: no more files allowed";
+ }
+
+ void Merge(int maxPortions) {
+ TGuard<TMutex> guard(Mutex);
+ InitialFilesEnd = Files.ysize();
+ if (!InitialFilesEnd)
+ ythrow yexception() << "TPowerMerger::Merge: no files added";
+ Optimize(maxPortions);
+ MergeMT();
+ InitMerger(Merger, CPortions, Files.ysize());
+ }
+
+ void Close() {
+ TGuard<TMutex> guard(Mutex);
+ Merger.Close();
+ UnlinkFiles(CPortions, Files.ysize());
+ InitialFilesEnd = CPortions = 0;
+ Files.clear();
+ }
+
+ const TRec* Next() {
+ return Merger.Next();
+ }
+
+ const TRec* Current() {
+ return Merger.Current();
+ }
+
+ int FileCount() const {
+ TGuard<TMutex> guard(Mutex);
+ return Files.ysize();
+ }
+
+private:
+ void InitMerger(TMerger& merger, int begin, int end) {
+ TGuard<TMutex> guard(Mutex);
+ TVector<TSimpleSharedPtr<TInput>> inputs;
+ for (int i = begin; i < end; ++i) {
+ inputs.push_back(new TInput("mergeportion-tmpin", BufSize, 0));
+ inputs.back()->Open(Files[i]);
+ // fprintf(stderr, "InitMerger: %i, %s\n", i, ~Files[i]);
+ }
+ merger.Init(inputs);
+ }
+
+ void UnlinkFiles(int begin, int end) {
+ TGuard<TMutex> guard(Mutex);
+ for (int i = begin; i < end; ++i) {
+ if (i >= InitialFilesEnd || AutoUnlink)
+ unlink(Files[i].c_str());
+ }
+ }
+
+ void Optimize(int maxPortions, size_t maxBufSize = 4u << 20) {
+ TGuard<TMutex> guard(Mutex);
+ maxPortions = std::min(maxPortions, Memory / PageSize - 1);
+ maxBufSize = std::max((size_t)PageSize, maxBufSize);
+
+ if (maxPortions <= 2) {
+ FPortions = MPortions = 2;
+ BufSize = PageSize;
+ return;
+ }
+
+ int Portions = Files.ysize();
+ if (maxPortions >= Portions) {
+ FPortions = MPortions = Portions;
+ } else if (((Portions + maxPortions - 1) / maxPortions) <= maxPortions) {
+ while (((Portions + maxPortions - 1) / maxPortions) <= maxPortions)
+ --maxPortions;
+ MPortions = ++maxPortions;
+ int total = ((Portions + MPortions - 1) / MPortions) + Portions;
+ FPortions = (total % MPortions) ? (total % MPortions) : (int)MPortions;
+ } else
+ FPortions = MPortions = maxPortions;
+
+ BufSize = std::min((size_t)(Memory / (MPortions + 1)), maxBufSize);
+ // fprintf(stderr, "Optimize: Portions=%i; MPortions=%i; FPortions=%i; Memory=%i; BufSize=%i\n",
+ // (int)Portions, (int)MPortions, (int)FPortions, (int)Memory, (int)BufSize);
+ }
+
+ void MergeMT() {
+ TGuard<TMutex> guard(Mutex);
+ do {
+ int n;
+ while ((n = Files.ysize() - CPortions) > MPortions) {
+ int m = std::min((CPortions == 0 ? (int)FPortions : (int)MPortions), n);
+ TString fname = PortionManager->Next();
+ if (!MtpQueue->Add(new TMergePortionTask(this, CPortions, CPortions + m, fname)))
+ ythrow yexception() << "TPowerMerger::MergeMT: failed to add task";
+ CPortions += m;
+ ++Tasks;
+ }
+ if (Tasks > 0)
+ TaskFinishedCond.Wait(Mutex);
+ } while (Tasks > 0);
+ }
+
+private:
+ TMutex Mutex;
+ TCondVar TaskFinishedCond;
+
+ TMerger Merger;
+ TSimpleSharedPtr<TThreadPool> MtpQueue;
+ TSimpleSharedPtr<TPortionManager> PortionManager;
+ TVector<TString> Files;
+ int Tasks = 0;
+ int InitialFilesEnd = 0;
+ int CPortions = 0;
+ int MPortions = 0;
+ int FPortions = 0;
+ int Memory = 0;
+ int PageSize = 0;
+ int BufSize = 0;
+ bool AutoUnlink = false;
+};
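+
+// TPowerMerger reduces the portion files in passes: while more than MPortions
+// files remain, MergeMT() schedules TMergePortionTask jobs on the shared
+// TThreadPool, each merging a contiguous range of files into a fresh temporary
+// obtained from TPortionManager and unlinking the intermediates it consumed.
+// The final range (at most MPortions files) is merged lazily through
+// Next()/Current().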
+
+// A sorter powered by threads
+template <
+ class TRecord,
+ template <typename T> class TCompare,
+ class TSieve = TFakeSieve<TRecord>,
+ class TTmpInput = TInDatFile<TRecord>,
+ class TTmpOutput = TOutDatFile<TRecord>>
+class TPowerSorter {
+public:
+ typedef TPowerSorter<TRecord, TCompare, TSieve, TTmpInput, TTmpOutput> TSorter;
+ typedef TRecord TRec;
+ typedef TTmpOutput TTmpOut;
+ typedef TTmpInput TTmpIn;
+ typedef TDatSorterBuf<TRecord, TCompare, TSieve> TSorterBuf;
+ typedef TCompare<TRecord> TComp;
+ typedef TPowerMerger<TRecord, TCompare, TSieve, TTmpInput, TTmpOutput> TFileMerger;
+
+ struct TSortPortionTask: public IObjectInQueue {
+ TSorter* Sorter;
+ TSorterBuf* SorterBuf;
+ int Portion;
+
+ TSortPortionTask(TSorter* sorter, TSorterBuf* sorterBuf, int portion)
+ : Sorter(sorter)
+ , SorterBuf(sorterBuf)
+ , Portion(portion)
+ {
+ }
+
+ void Process(void*) override {
+ TAutoPtr<TSortPortionTask> This(this);
+ // fprintf(stderr, "SortPortion: %i\n", Portion);
+ Sorter->SortPortion(SorterBuf);
+ }
+ };
+
+ class TSorterBufQueue {
+ private:
+ TMutex Mutex;
+ TCondVar Cond;
+ TVector<TSimpleSharedPtr<TSorterBuf>> V;
+ TDeque<TSorterBuf*> Q;
+
+ int Memory, PageSize, MaxSorterBufs;
+
+ public:
+ TSorterBufQueue(int memory, int pageSize, int maxSorterBufs)
+ : Memory(memory)
+ , PageSize(pageSize)
+ , MaxSorterBufs(maxSorterBufs)
+ {
+ }
+
+ void Push(TSorterBuf* sb) {
+ TGuard<TMutex> guard(Mutex);
+ sb->Clear();
+ Q.push_back(sb);
+ Cond.Signal();
+ }
+
+ TSorterBuf* Pop() {
+ TGuard<TMutex> guard(Mutex);
+ if (!Q.size() && V.ysize() < MaxSorterBufs) {
+ V.push_back(new TSorterBuf(Memory / MaxSorterBufs, PageSize));
+ return V.back().Get();
+ } else {
+ while (!Q.size())
+ Cond.Wait(Mutex);
+ TSorterBuf* t = Q.front();
+ Q.pop_front();
+ return t;
+ }
+ }
+
+ void Clear() {
+ TGuard<TMutex> guard(Mutex);
+ Q.clear();
+ V.clear();
+ }
+
+ void WaitAll() {
+ TGuard<TMutex> guard(Mutex);
+ while (Q.size() < V.size()) {
+ Cond.Wait(Mutex);
+ }
+ }
+
+ int GetMaxSorterBufs() const {
+ return MaxSorterBufs;
+ }
+ };
+
+public:
+ TPowerSorter(const TSimpleSharedPtr<TThreadPool>& mtpQueue, size_t maxSorterBufs,
+ const char* name, size_t memory, size_t pageSize, size_t bufSize)
+ : MaxSorterBufs(maxSorterBufs)
+ , Name(name)
+ , Memory(memory)
+ , PageSize(pageSize)
+ , BufSize(bufSize)
+ , MtpQueue(mtpQueue)
+ , PortionManager(new TPortionManager)
+ , SBQueue(Memory, PageSize, MaxSorterBufs)
+ , FileMerger(MtpQueue, PortionManager, Memory, PageSize, true)
+ {
+ }
+
+ TPowerSorter(size_t maxSorterBufs,
+ const char* name, size_t memory, size_t pageSize, size_t bufSize)
+ : MaxSorterBufs(maxSorterBufs)
+ , Name(name)
+ , Memory(memory)
+ , PageSize(pageSize)
+ , BufSize(bufSize)
+ , PortionManager(new TPortionManager)
+ , SBQueue(Memory, PageSize, maxSorterBufs)
+ , FileMerger(MtpQueue, PortionManager, Memory, PageSize, true)
+ {
+ }
+
+ TPowerSorter(const char* name, size_t memory, size_t pageSize, size_t bufSize)
+ : MaxSorterBufs(5)
+ , Name(name)
+ , Memory(memory)
+ , PageSize(pageSize)
+ , BufSize(bufSize)
+ , PortionManager(new TPortionManager)
+ , SBQueue(Memory, PageSize, MaxSorterBufs)
+ , FileMerger(MtpQueue, PortionManager, Memory, PageSize, true)
+ {
+ }
+
+ ~TPowerSorter() {
+ Close();
+ }
+
+ void Open(const char* tempDir) {
+ Close();
+ CurSB = SBQueue.Pop();
+ PortionManager->Open(tempDir);
+ }
+
+ void Reopen(const char* fname) {
+ Open(fname);
+ }
+
+ void Close() {
+ CurSB = nullptr;
+ SBQueue.Clear();
+ PortionCount = 0;
+ FileMerger.Close();
+ PortionManager->Close();
+ }
+
+ const TRec* Push(const TRec* v) {
+ CheckOpen("Push");
+ const TRec* u = CurSB->Push(v);
+ if (!u) {
+ NextPortion();
+ u = CurSB->Push(v);
+ }
+ return u;
+ }
+
+ void Sort(int maxPortions = 1000) {
+ CheckOpen("Sort");
+ if (!PortionCount) {
+ CurSB->Sort();
+ } else {
+ NextPortion();
+ SBQueue.Push(CurSB);
+ CurSB = nullptr;
+ SBQueue.WaitAll();
+ SBQueue.Clear();
+ FileMerger.Merge(maxPortions);
+ }
+ }
+
+ const TRec* Next() {
+ return PortionCount ? FileMerger.Next() : CurSB->Next();
+ }
+
+ const TRec* Current() {
+ return PortionCount ? FileMerger.Current() : CurSB->Current();
+ }
+
+ int GetBufSize() const {
+ return BufSize;
+ }
+
+ int GetPageSize() const {
+ return PageSize;
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+private:
+ void CheckOpen(const char* m) {
+ if (!CurSB)
+ ythrow yexception() << "TPowerSorter::" << m << ": the sorter is not open";
+ }
+
+ void NextPortion() {
+ if (!CurSB->Size())
+ return;
+ ++PortionCount;
+ if (MaxSorterBufs <= 1) {
+ SortPortion(CurSB);
+ } else {
+ if (!MtpQueue.Get()) {
+ MtpQueue.Reset(new TThreadPool);
+ MtpQueue->Start(MaxSorterBufs - 1);
+ FileMerger.SetMtpQueue(MtpQueue);
+ }
+ if (!MtpQueue->Add(new TSortPortionTask(this, CurSB, PortionCount)))
+ ythrow yexception() << "TPowerSorter::NextPortion: failed to add task";
+ }
+ CurSB = SBQueue.Pop();
+ }
+
+ void SortPortion(TSorterBuf* sorterBuf) {
+ TString portionFilename = PortionManager->Next();
+ try {
+ sorterBuf->Sort();
+
+ // fprintf(stderr, "TPowerSorter::SortPortion: -> %s\n", ~portionFilename);
+ TTmpOut out("powersorter-portion", PageSize, BufSize, 0);
+ out.Open(portionFilename.data());
+
+ while (sorterBuf->Next())
+ out.Push(sorterBuf->Current());
+
+ out.Close();
+ FileMerger.Add(portionFilename);
+ SBQueue.Push(sorterBuf);
+ } catch (const yexception& e) {
+ unlink(portionFilename.data());
+ ythrow yexception() << "SortPortion: " << e.what();
+ }
+ }
+
+private:
+ int MaxSorterBufs = 0;
+ TString Name;
+ int Memory = 0;
+ int PageSize = 0;
+ int BufSize = 0;
+
+ TMutex Mutex;
+ TSimpleSharedPtr<TThreadPool> MtpQueue;
+ TSimpleSharedPtr<TPortionManager> PortionManager;
+
+ TSorterBufQueue SBQueue;
+ TSorterBuf* CurSB = nullptr;
+ int PortionCount = 0;
+
+ TFileMerger FileMerger;
+};
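+
+// A usage sketch for TPowerSorter.  TMyRec (with RecordSig), TMyCmp and
+// Consume() are placeholders standing in for the caller's record type,
+// comparer template and processing code.
+//
+//     TPowerSorter<TMyRec, TMyCmp> sorter("mysort", /*memory=*/256 << 20,
+//                                         /*pageSize=*/4096, /*bufSize=*/4 << 20);
+//     sorter.Open("/tmp");              // directory for spilled portions
+//     TMyRec rec;                       // fill and push as many times as needed
+//     sorter.Push(&rec);                // spills a portion when the buffer fills up
+//     sorter.Sort();                    // in-memory sort or multi-pass merge
+//     while (const TMyRec* r = sorter.Next())
+//         Consume(*r);
+//     sorter.Close();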
diff --git a/library/cpp/microbdb/reader.h b/library/cpp/microbdb/reader.h
new file mode 100644
index 0000000000..694a2f1766
--- /dev/null
+++ b/library/cpp/microbdb/reader.h
@@ -0,0 +1,354 @@
+#pragma once
+
+#include "align.h"
+#include "header.h"
+#include "extinfo.h"
+
+#include <contrib/libs/zlib/zlib.h>
+#include <contrib/libs/fastlz/fastlz.h>
+#include <contrib/libs/snappy/snappy.h>
+
+#include <util/generic/vector.h>
+#include <util/memory/tempbuf.h>
+
+namespace NMicroBDB {
+ static const size_t DEFAULT_BUFFER_SIZE = (64 << 10);
+
+    //! Common interface of the page readers below (raw and compressed).
+ template <class TVal>
+ class IBasePageReader {
+ public:
+ virtual size_t GetRecSize() const = 0;
+ virtual size_t GetExtSize() const = 0;
+ virtual bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const = 0;
+ virtual const ui8* GetExtInfoRaw(size_t* len) const = 0;
+ virtual const TVal* Next() = 0;
+ virtual void Reset() = 0;
+        //! Set the clearing flag so that temporary buffers are released
+        //! on the next call to Next().
+ virtual void SetClearFlag() {
+ }
+
+ virtual ~IBasePageReader() {
+ }
+ };
+
+ template <class TVal, typename TPageIter>
+ class TRawPageReader: public IBasePageReader<TVal> {
+ public:
+ TRawPageReader(TPageIter* const iter)
+ : PageIter(iter)
+ {
+ Reset();
+ }
+
+ bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const override {
+ Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records");
+ if (!Rec)
+ return false;
+ ui8* raw = (ui8*)Rec + RecSize + ExtLenSize;
+ return extInfo->ParseFromArray(raw, ExtSize);
+ }
+
+ size_t GetRecSize() const override {
+ return RecSize + ExtLenSize;
+ }
+
+ size_t GetExtSize() const override {
+ return ExtSize;
+ }
+
+ const ui8* GetExtInfoRaw(size_t* len) const override {
+ Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records");
+ if (!Rec) {
+ *len = 0;
+ return nullptr;
+ }
+ *len = ExtLenSize + ExtSize;
+ return (ui8*)Rec + RecSize;
+ }
+
+ const TVal* Next() override {
+ if (!Rec)
+ Rec = (TVal*)((char*)PageIter->Current() + sizeof(TDatPage));
+ else
+ Rec = (TVal*)((char*)Rec + DatCeil(RecSize + ExtLenSize + ExtSize));
+ if (!TExtInfoType<TVal>::Exists)
+ RecSize = SizeOf(Rec);
+ else
+ RecSize = SizeOfExt(Rec, &ExtLenSize, &ExtSize);
+ return Rec;
+ }
+
+ void Reset() override {
+ Rec = nullptr;
+ RecSize = 0;
+ ExtLenSize = 0;
+ ExtSize = 0;
+ }
+
+ private:
+ const TVal* Rec;
+ size_t RecSize;
+ size_t ExtLenSize;
+ size_t ExtSize;
+ TPageIter* const PageIter;
+ };
+
+ template <class TVal, typename TPageIter>
+ class TCompressedReader: public IBasePageReader<TVal> {
+ inline size_t GetFirstRecordSize(const TVal* const in) const {
+ if (!TExtInfoType<TVal>::Exists) {
+ return DatCeil(SizeOf(in));
+ } else {
+ size_t ll;
+ size_t l;
+ size_t ret = SizeOfExt(in, &ll, &l);
+
+ return DatCeil(ret + ll + l);
+ }
+ }
+
+ void DecompressBlock() {
+ if (PageIter->IsFrozen() && Buffer.Get())
+ Blocks.push_back(Buffer.Release());
+
+ const TCompressedHeader* hdr = (const TCompressedHeader*)(Page);
+
+ Page += sizeof(TCompressedHeader);
+
+ const size_t first = GetFirstRecordSize((const TVal*)Page);
+
+ if (!Buffer.Get() || Buffer->Size() < hdr->Original)
+ Buffer.Reset(new TTempBuf(Max<size_t>(hdr->Original, DEFAULT_BUFFER_SIZE)));
+
+ memcpy(Buffer->Data(), Page, first);
+ Page += first;
+
+ if (hdr->Count > 1) {
+ switch (Algo) {
+ case MBDB_COMPRESSION_ZLIB: {
+ uLongf dst = hdr->Original - first;
+
+ int ret = uncompress((Bytef*)Buffer->Data() + first, &dst, Page, hdr->Compressed);
+
+ if (ret != Z_OK)
+ ythrow yexception() << "error then uncompress " << ret;
+ } break;
+ case MBDB_COMPRESSION_FASTLZ: {
+ int dst = hdr->Original - first;
+ int ret = yfastlz_decompress(Page, hdr->Compressed, Buffer->Data() + first, dst);
+
+ if (!ret)
+ ythrow yexception() << "error then uncompress";
+ } break;
+ case MBDB_COMPRESSION_SNAPPY: {
+ if (!snappy::RawUncompress((const char*)Page, hdr->Compressed, Buffer->Data() + first))
+ ythrow yexception() << "error then uncompress";
+ } break;
+ }
+ }
+
+ Rec = nullptr;
+ RecNum = hdr->Count;
+ Page += hdr->Compressed;
+ }
+
+ void ClearBuffer() {
+ for (size_t i = 0; i < Blocks.size(); ++i)
+ delete Blocks[i];
+ Blocks.clear();
+ ClearFlag = false;
+ }
+
+ public:
+ TCompressedReader(TPageIter* const iter)
+ : Rec(nullptr)
+ , RecSize(0)
+ , ExtLenSize(0)
+ , ExtSize(0)
+ , Page(nullptr)
+ , PageIter(iter)
+ , RecNum(0)
+ , BlockNum(0)
+ , ClearFlag(false)
+ {
+ }
+
+ ~TCompressedReader() override {
+ ClearBuffer();
+ }
+
+ size_t GetRecSize() const override {
+ return RecSize + ExtLenSize;
+ }
+
+ size_t GetExtSize() const override {
+ return ExtSize;
+ }
+
+ bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const override {
+ Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records");
+ if (!Rec)
+ return false;
+ ui8* raw = (ui8*)Rec + RecSize + ExtLenSize;
+ return extInfo->ParseFromArray(raw, ExtSize);
+ }
+
+ const ui8* GetExtInfoRaw(size_t* len) const override {
+ Y_VERIFY(TExtInfoType<TVal>::Exists, "GetExtInfo should only be used with extended records");
+ if (!Rec) {
+ *len = 0;
+ return nullptr;
+ }
+ *len = ExtLenSize + ExtSize;
+ return (ui8*)Rec + RecSize;
+ }
+
+ const TVal* Next() override {
+ Y_ASSERT(RecNum >= 0);
+
+ if (ClearFlag)
+ ClearBuffer();
+
+ if (!Page) {
+ if (!PageIter->Current())
+ return nullptr;
+
+ Page = (ui8*)PageIter->Current() + sizeof(TDatPage);
+
+ BlockNum = ((TCompressedPage*)Page)->BlockCount - 1;
+ Algo = (ECompressionAlgorithm)((TCompressedPage*)Page)->Algorithm;
+ Page += sizeof(TCompressedPage);
+
+ DecompressBlock();
+ }
+
+ if (!RecNum) {
+ if (BlockNum <= 0)
+ return nullptr;
+ else {
+ --BlockNum;
+ DecompressBlock();
+ }
+ }
+
+ --RecNum;
+ if (!Rec)
+ Rec = (const TVal*)Buffer->Data();
+ else
+ Rec = (const TVal*)((char*)Rec + DatCeil(RecSize + ExtLenSize + ExtSize));
+
+ if (!TExtInfoType<TVal>::Exists)
+ RecSize = SizeOf(Rec);
+ else
+ RecSize = SizeOfExt(Rec, &ExtLenSize, &ExtSize);
+
+ return Rec;
+ }
+
+ void Reset() override {
+ Page = nullptr;
+ BlockNum = 0;
+ Rec = nullptr;
+ RecSize = 0;
+ ExtLenSize = 0;
+ ExtSize = 0;
+ RecNum = 0;
+ }
+
+ void SetClearFlag() override {
+ ClearFlag = true;
+ }
+
+ public:
+ THolder<TTempBuf> Buffer;
+ TVector<TTempBuf*> Blocks;
+ const TVal* Rec;
+ size_t RecSize;
+ size_t ExtLenSize;
+ size_t ExtSize;
+ const ui8* Page;
+ TPageIter* const PageIter;
+ int RecNum; //!< count of recs in current block
+ int BlockNum;
+ ECompressionAlgorithm Algo;
+ bool ClearFlag;
+ };
+
+ class TZLibCompressionImpl {
+ public:
+ static const ECompressionAlgorithm Code = MBDB_COMPRESSION_ZLIB;
+
+ inline void Init() {
+ // -
+ }
+
+ inline void Term() {
+ // -
+ }
+
+ inline size_t CompressBound(size_t size) const noexcept {
+ return ::compressBound(size);
+ }
+
+ inline void Compress(void* out, size_t& outSize, const void* in, size_t inSize) {
+ uLongf size = outSize;
+
+ if (compress((Bytef*)out, &size, (const Bytef*)in, inSize) != Z_OK)
+ ythrow yexception() << "not compressed";
+ outSize = size;
+ }
+ };
+
+ class TFastlzCompressionImpl {
+ public:
+ static const ECompressionAlgorithm Code = MBDB_COMPRESSION_FASTLZ;
+
+ inline void Init() {
+ // -
+ }
+
+ inline void Term() {
+ // -
+ }
+
+ inline size_t CompressBound(size_t size) const noexcept {
+ size_t rval = size_t(size * 1.07);
+ return rval < 66 ? 66 : rval;
+ }
+
+ inline void Compress(void* out, size_t& outSize, const void* in, size_t inSize) {
+ outSize = yfastlz_compress_level(2, in, inSize, out);
+ if (!outSize)
+ ythrow yexception() << "not compressed";
+ }
+ };
+
+ class TSnappyCompressionImpl {
+ public:
+ static const ECompressionAlgorithm Code = MBDB_COMPRESSION_SNAPPY;
+
+ inline void Init() {
+ // -
+ }
+
+ inline void Term() {
+ // -
+ }
+
+ inline size_t CompressBound(size_t size) const noexcept {
+ return snappy::MaxCompressedLength(size);
+ }
+
+ inline void Compress(void* out, size_t& outSize, const void* in, size_t inSize) {
+ snappy::RawCompress((const char*)in, inSize, (char*)out, &outSize);
+ }
+ };
+
+}
+
+using TFakeCompression = void;
+using TZLibCompression = NMicroBDB::TZLibCompressionImpl;
+using TFastlzCompression = NMicroBDB::TFastlzCompressionImpl;
+using TSnappyCompression = NMicroBDB::TSnappyCompressionImpl;
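+
+// The three classes above share one compressor policy shape: a Code constant,
+// Init()/Term(), CompressBound(size) and Compress(out, outSize, in, inSize),
+// which is what the compressing writer (TAlgorithm in output.h) and
+// TCompressedReader expect.  TFakeCompression is deliberately void and is
+// presumably dispatched on elsewhere in microbdb to mean "no compression".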
diff --git a/library/cpp/microbdb/safeopen.h b/library/cpp/microbdb/safeopen.h
new file mode 100644
index 0000000000..c328ffd575
--- /dev/null
+++ b/library/cpp/microbdb/safeopen.h
@@ -0,0 +1,792 @@
+#pragma once
+
+// util
+#include <util/generic/yexception.h>
+#include <util/generic/vector.h>
+#include <util/string/util.h>
+#include <util/system/mutex.h>
+#include <thread>
+
+#include "microbdb.h"
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4706) /*assignment within conditional expression*/
+#pragma warning(disable : 4267) /*conversion from 'size_t' to 'type', possible loss of data*/
+#endif
+
+template <typename TVal, typename TPageFile = TInputPageFile, typename TIterator = TInputPageIterator<TPageFile>>
+class TInDatFile: protected TInDatFileImpl<TVal, TInputRecordIterator<TVal, TIterator>> {
+public:
+ typedef TVal TRec;
+ typedef TInDatFileImpl<TVal, TInputRecordIterator<TVal, TIterator>> TBase;
+
+ TInDatFile(const TString& name, size_t pages, int pagesOrBytes = 1)
+ : Name(name)
+ , Pages(pages)
+ , PagesOrBytes(pagesOrBytes)
+ {
+ }
+
+ ~TInDatFile() {
+ Close();
+ }
+
+ void Open(const TString& fname, bool direct = false) {
+ ui32 gotRecordSig = 0;
+ int ret = TBase::Open(fname.data(), Pages, PagesOrBytes, &gotRecordSig, direct);
+ if (ret) {
+ // XXX: print record type name, not type sig
+ ythrow yexception() << ErrorMessage(ret, "Failed to open input file", fname, TVal::RecordSig, gotRecordSig);
+ }
+ Name = fname;
+ }
+
+ void OpenStream(TAutoPtr<IInputStream> input) {
+ ui32 gotRecordSig = 0;
+ int ret = TBase::Open(input, Pages, PagesOrBytes, &gotRecordSig);
+ if (ret) {
+ // XXX: print record type name, not type sig
+ ythrow yexception() << ErrorMessage(ret, "Failed to open input file", Name, TVal::RecordSig, gotRecordSig);
+ }
+ }
+
+ void Close() {
+ int ret;
+ if (IsOpen() && (ret = TBase::GetError()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error before closing input file", Name);
+ if ((ret = TBase::Close()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error while closing input file", Name);
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+ using TBase::Current;
+ using TBase::Freeze;
+ using TBase::GetError;
+ using TBase::GetExtInfo;
+ using TBase::GetExtInfoRaw;
+ using TBase::GetExtSize;
+ using TBase::GetLastPage;
+ using TBase::GetPageNum;
+ using TBase::GetPageSize;
+ using TBase::GetRecSize;
+ using TBase::GotoLastPage;
+ using TBase::GotoPage;
+ using TBase::IsEof;
+ using TBase::IsOpen;
+ using TBase::Next;
+ using TBase::Skip;
+ using TBase::Unfreeze;
+
+protected:
+ TString Name;
+ size_t Pages;
+ int PagesOrBytes;
+};
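+
+// A minimal read-side sketch for TInDatFile.  TMyRec is a placeholder record
+// type with a RecordSig constant; failures surface as yexception.
+//
+//     TInDatFile<TMyRec> in("my input", /*pages=*/16);
+//     in.Open("records.dat");
+//     while (const TMyRec* rec = in.Next())
+//         Use(*rec);                // Use() stands in for real processing
+//     in.Close();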
+
+template <typename TVal>
+class TMappedInDatFile: protected TInDatFileImpl<TVal, TInputRecordIterator<TVal, TMappedInputPageIterator<TMappedInputPageFile>>> {
+public:
+ typedef TVal TRec;
+ typedef TInDatFileImpl<TVal, TInputRecordIterator<TVal, TMappedInputPageIterator<TMappedInputPageFile>>> TBase;
+
+ TMappedInDatFile(const TString& name, size_t /* pages */, int /* pagesOrBytes */)
+ : Name(name)
+ {
+ }
+
+ ~TMappedInDatFile() {
+ Close();
+ }
+
+ void Open(const TString& fname) {
+ int ret = TBase::Open(fname.data());
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, "Failed to open mapped file", fname, TVal::RecordSig);
+ Name = fname;
+ }
+
+ void Close() {
+ int ret;
+ if (IsOpen() && (ret = TBase::GetError()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error before closing mapped file", Name);
+ if ((ret = TBase::Close()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error while closing mapped file", Name);
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+ using TBase::Current;
+ using TBase::GetError;
+ using TBase::GetExtInfo;
+ using TBase::GetExtInfoRaw;
+ using TBase::GetLastPage;
+ using TBase::GetPageNum;
+ using TBase::GetPageSize;
+ using TBase::GotoLastPage;
+ using TBase::GotoPage;
+ using TBase::IsEof;
+ using TBase::IsOpen;
+ using TBase::Next;
+ using TBase::Skip;
+
+protected:
+ TString Name;
+};
+
+template <typename TVal, typename TCompressor = TFakeCompression, typename TPageFile = TOutputPageFile>
+class TOutDatFile: protected TOutDatFileImpl<TVal, TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TFakeIndexer, TCompressor>> {
+public:
+ typedef TOutDatFileImpl<TVal, TOutputRecordIterator<TVal, TOutputPageIterator<TPageFile>, TFakeIndexer, TCompressor>> TBase;
+
+ TOutDatFile(const TString& name, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : Name(name)
+ , PageSize(pagesize)
+ , Pages(pages)
+ , PagesOrBytes(pagesOrBytes)
+ {
+ }
+
+ ~TOutDatFile() {
+ Close();
+ }
+
+ void Open(const char* fname, bool direct = false) {
+ int ret = TBase::Open(fname, PageSize, Pages, PagesOrBytes, direct);
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, "Failed to open output file", fname);
+ Name = fname;
+ }
+
+ void Open(const TString& fname) {
+ Open(fname.data());
+ }
+
+ void OpenStream(TAutoPtr<IOutputStream> output) {
+ int ret = TBase::Open(output, PageSize, Pages, PagesOrBytes);
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, "Failed to open output stream", Name);
+ }
+
+ void Close() {
+ int ret;
+ if ((ret = TBase::GetError()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error before closing output file", Name);
+ if ((ret = TBase::Close()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error while closing output file", Name);
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+ using TBase::Freeze;
+ using TBase::GetError;
+ using TBase::GetPageSize;
+ using TBase::IsEof;
+ using TBase::IsOpen;
+ using TBase::Offset;
+ using TBase::Push;
+ using TBase::PushWithExtInfo;
+ using TBase::Reserve;
+ using TBase::Unfreeze;
+
+protected:
+ TString Name;
+ size_t PageSize, Pages;
+ int PagesOrBytes;
+};
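+
+// The write-side counterpart of the sketch above (same placeholder TMyRec):
+//
+//     TOutDatFile<TMyRec> out("my output", /*pagesize=*/4096, /*pages=*/16);
+//     out.Open("records.dat");
+//     TMyRec rec;                   // fill fields as needed
+//     out.Push(&rec);
+//     out.Close();                  // throws if an error was accumulated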
+
+template <typename TVal, typename TCompressor, typename TPageFile>
+class TOutDatFileArray;
+
+template <typename TVal, typename TCompressor = TFakeCompression, typename TPageFile = TOutputPageFile>
+class TOutDatFileArray {
+ typedef TOutDatFile<TVal, TCompressor, TPageFile> TFileType;
+
+public:
+ TOutDatFileArray(const TString& name, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : Name(name)
+ , PageSize(pagesize)
+ , Pages(pages)
+ , PagesOrBytes(pagesOrBytes)
+ , NumFiles(0)
+ , Files(nullptr)
+ {
+ }
+
+ ~TOutDatFileArray() {
+ for (int i = 0; i < NumFiles; ++i) {
+ Files[i].Close();
+ Files[i].~TFileType();
+ }
+ free(Files);
+ Files = nullptr;
+ NumFiles = 0;
+ }
+
+ TFileType& operator[](size_t pos) {
+ return Files[pos];
+ }
+
+ void Open(int n, const TString& fname) {
+ char temp[FILENAME_MAX];
+
+ Name = fname;
+ NumFiles = CreateDatObjects(n, fname);
+
+ int i;
+ try {
+ for (i = 0; i < NumFiles; ++i) {
+ sprintf(temp, fname.data(), i);
+ Files[i].Open(temp);
+ }
+ } catch (...) {
+ while (--i >= 0)
+ Files[i].Close();
+ throw;
+ }
+ }
+
+ template <typename TNameBuilder>
+ void OpenWithCallback(int n, const TNameBuilder& builder) {
+ NumFiles = CreateDatObjects(n, Name);
+
+ for (int i = 0; i < NumFiles; ++i)
+ Files[i].Open(builder.GetName(i).data());
+ }
+
+ void Close() {
+ for (int i = 0; i < NumFiles; ++i)
+ Files[i].Close();
+ }
+
+ void CloseMT(ui32 threads) {
+ int current = 0;
+ TMutex mutex;
+ TVector<std::thread> thrs;
+ thrs.reserve(threads);
+ for (ui32 i = 0; i < threads; i++) {
+ thrs.emplace_back([this, &current, &mutex]() {
+ while (true) {
+ mutex.Acquire();
+ int cur = current++;
+ mutex.Release();
+ if (cur >= NumFiles)
+ break;
+ Files[cur].Close();
+ }
+ });
+ }
+ for (auto& thread : thrs) {
+ thread.join();
+ }
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+protected:
+ int CreateDatObjects(int n, const TString& fname) {
+ if (!(Files = (TFileType*)malloc(n * sizeof(TFileType))))
+ ythrow yexception() << "can't alloc \"" << fname << "\" file array: " << LastSystemErrorText();
+ int num = 0;
+ char temp[FILENAME_MAX];
+ for (int i = 0; i < n; ++i, ++num) {
+ sprintf(temp, "%s[%d]", fname.data(), i);
+ new (Files + i) TFileType(temp, PageSize, Pages, PagesOrBytes);
+ }
+ return num;
+ }
+
+ TString Name;
+ size_t PageSize, Pages;
+ int PagesOrBytes, NumFiles;
+ TFileType* Files;
+};
+
+template <typename TVal, typename TKey, typename TCompressor = TFakeCompression, typename TPageFile = TOutputPageFile>
+class TOutDirectFile: protected TOutDirectFileImpl<TVal, TKey, TCompressor, TPageFile> {
+ typedef TOutDirectFileImpl<TVal, TKey, TCompressor, TPageFile> TBase;
+
+public:
+ TOutDirectFile(const TString& name, size_t pagesize, size_t pages, size_t ipagesize, size_t ipages, int pagesOrBytes)
+ : Name(name)
+ , PageSize(pagesize)
+ , Pages(pages)
+ , IdxPageSize(ipagesize)
+ , IdxPages(ipages)
+ , PagesOrBytes(pagesOrBytes)
+ {
+ }
+
+ ~TOutDirectFile() {
+ Close();
+ }
+
+ void Open(const TString& fname) {
+ int ret = TBase::Open(fname.data(), PageSize, Pages, IdxPageSize, IdxPages, PagesOrBytes);
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, "Failed to open output file", fname);
+ Name = fname;
+ }
+
+ void Close() {
+ int ret;
+ if ((ret = TBase::GetError()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error before closing output file", Name);
+ if ((ret = TBase::Close()))
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret, "Error while closing output file", Name);
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+ using TBase::Freeze;
+ using TBase::Push;
+ using TBase::PushWithExtInfo;
+ using TBase::Reserve;
+ using TBase::Unfreeze;
+
+protected:
+ TString Name;
+ size_t PageSize, Pages, IdxPageSize, IdxPages;
+ int PagesOrBytes;
+};
+
+template <
+ typename TVal,
+ template <typename T> class TComparer,
+ typename TCompress = TFakeCompression,
+ typename TSieve = TFakeSieve<TVal>,
+ typename TPageFile = TOutputPageFile,
+ typename TFileTypes = TDefInterFileTypes>
+class TDatSorter: protected TDatSorterImpl<TVal, TComparer<TVal>, TCompress, TSieve, TPageFile, TFileTypes> {
+ typedef TDatSorterImpl<TVal, TComparer<TVal>, TCompress, TSieve, TPageFile, TFileTypes> TBase;
+
+public:
+ typedef TVal TRec;
+
+public:
+ TDatSorter(const TString& name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : Name(name)
+ , Memory(memory)
+ , PageSize(pagesize)
+ , Pages(pages)
+ , PagesOrBytes(pagesOrBytes)
+ {
+ Templ[0] = 0;
+ }
+
+ ~TDatSorter() {
+ Close();
+ Templ[0] = 0;
+ }
+
+ void Open(const TString& dirName) {
+ int ret;
+        if ((ret = MakeSorterTempl(Templ, dirName.data()))) {
+ Templ[0] = 0;
+ ythrow yexception() << ErrorMessage(ret, Name + " sorter: bad tempdir", dirName);
+ }
+ if ((ret = TBase::Open(Templ, PageSize, Pages, PagesOrBytes)))
+ ythrow yexception() << ErrorMessage(ret, Name + " sorter: open error, temp dir", Templ);
+ }
+
+ void Sort(bool direct = false) {
+ int ret = TBase::Sort(Memory, 1000, direct);
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, Name + " sorter: sort error, temp dir", Templ, TVal::RecordSig);
+ }
+
+ void SortToFile(const TString& name) {
+ int ret = TBase::SortToFile(name.data(), Memory);
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, Name + "sorter: error in SortToFile", name, TVal::RecordSig);
+ }
+
+ void SortToStream(TAutoPtr<IOutputStream> output) {
+ int ret = TBase::SortToStream(output, Memory);
+ if (ret)
+ ythrow yexception() << ErrorMessage(ret, Name + "sorter: error in SortToStream", "", TVal::RecordSig);
+ }
+
+ void Close() {
+ int ret1 = TBase::GetError();
+ int ret2 = TBase::Close();
+ if (Templ[0]) {
+ *strrchr(Templ, GetDirectorySeparator()) = 0;
+ RemoveDirWithContents(Templ);
+ Templ[0] = 0;
+ }
+ if (ret1)
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret1, Name + "sorter: error before closing");
+ if (ret2)
+ if (!std::uncaught_exception())
+ ythrow yexception() << ErrorMessage(ret2, Name + "sorter: error while closing");
+ }
+
+ int Sort(size_t memory, int maxportions, bool direct = false) {
+ return TBase::Sort(memory, maxportions, direct);
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+ using TBase::GetPageSize;
+ using TBase::GetPages;
+ using TBase::Next;
+ using TBase::NextPortion;
+ using TBase::Push;
+ using TBase::PushWithExtInfo;
+ using TBase::UseSegmentSorter;
+
+protected:
+ TString Name;
+ size_t Memory, PageSize, Pages;
+ int PagesOrBytes;
+ char Templ[FILENAME_MAX];
+};
+
+template <typename TSorter>
+class TSorterArray {
+public:
+ typedef TSorter TDatSorter;
+
+public:
+ TSorterArray(const TString& name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : Name(name)
+ , Memory(memory)
+ , PageSize(pagesize)
+ , Pages(pages)
+ , PagesOrBytes(pagesOrBytes)
+ , NumSorters(0)
+ , Sorters(nullptr)
+ {
+ }
+
+ ~TSorterArray() {
+ for (int i = 0; i < NumSorters; ++i) {
+ Sorters[i].Close();
+ Sorters[i].~TSorter();
+ }
+ free(Sorters);
+ Sorters = nullptr;
+ NumSorters = 0;
+ }
+
+ TSorter& operator[](size_t pos) {
+ return Sorters[pos];
+ }
+
+ void Open(int n, const TString& fname, size_t memory = 0) {
+ if (!(Sorters = (TSorter*)malloc(n * sizeof(TSorter))))
+ ythrow yexception() << "can't alloc \"" << fname << "\" sorter array: " << LastSystemErrorText();
+ NumSorters = n;
+ char temp[FILENAME_MAX];
+ if (memory)
+ Memory = memory;
+ for (int i = 0; i < NumSorters; ++i) {
+ sprintf(temp, "%s[%d]", Name.data(), i);
+ new (Sorters + i) TSorter(temp, Memory, PageSize, Pages, PagesOrBytes);
+ }
+ for (int i = 0; i < NumSorters; ++i)
+ Sorters[i].Open(fname);
+ }
+
+ void Close() {
+ for (int i = 0; i < NumSorters; ++i)
+ Sorters[i].Close();
+ }
+
+ const char* GetName() const {
+ return Name.data();
+ }
+
+protected:
+ TString Name;
+ size_t Memory, PageSize, Pages;
+ int PagesOrBytes, NumSorters;
+ TSorter* Sorters;
+};
+
+template <typename TVal, template <typename T> class TCompare, typename TSieve = TFakeSieve<TVal>>
+class TDatSorterArray: public TSorterArray<TDatSorter<TVal, TCompare, TSieve>> {
+public:
+ TDatSorterArray(const char* name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : TSorterArray<TDatSorter<TVal, TCompare, TSieve>>(name, memory, pagesize, pages, pagesOrBytes)
+ {
+ }
+};
+
+template <typename TVal, template <typename T> class TCompare, typename TCompress = TFakeCompression,
+ typename TSieve = TFakeSieve<TVal>, typename TPageFile = TOutputPageFile, typename TFileTypes = TDefInterFileTypes>
+class TDatSorterMemo: public TDatSorter<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes> {
+ typedef TDatSorter<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes> TSorter;
+
+public:
+ TOutDatFile<TVal> Memo;
+ TString Home;
+ bool OpenReq;
+ bool Opened;
+ bool UseDirectWrite;
+
+public:
+ TDatSorterMemo(const char* name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : TSorter(name, memory, pagesize, pages, pagesOrBytes)
+ , Memo(name, pagesize, memory, 0)
+ {
+ OpenReq = false;
+ Opened = false;
+ UseDirectWrite = false;
+ }
+
+ void Open(const TString& home) {
+ OpenReq = true;
+ // TSorter::Open(home);
+ Home = home;
+ Memo.Open(nullptr);
+ Memo.Freeze();
+ }
+
+ void Reopen(const char* home) {
+ Close();
+ Open(home);
+ }
+
+ void Open() {
+ if (!OpenReq) {
+ OpenReq = true;
+ Memo.Open(nullptr);
+ Memo.Freeze();
+ }
+ }
+
+ void OpenIfNeeded() {
+ if (OpenReq && !Opened) {
+ if (!Home)
+ ythrow yexception() << "Temp directory not specified, call Open(char*) first : " << TSorter::Name;
+ TSorter::Open(Home);
+ Opened = true;
+ }
+ }
+
+ TVal* Reserve(size_t len) {
+ if (TExtInfoType<TVal>::Exists)
+ return ReserveWithExt(len, 0);
+
+ TVal* u = Memo.Reserve(len);
+ if (!u) {
+ OpenIfNeeded();
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Freeze();
+ u = Memo.Reserve(len);
+ }
+ TSorter::PushWithExtInfo(u);
+ return u;
+ }
+
+ TVal* ReserveWithExt(size_t len, size_t extSize) {
+ size_t fullLen = len + len_long((i64)extSize) + extSize;
+ TVal* u = Memo.Reserve(fullLen);
+ if (!u) {
+ OpenIfNeeded();
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Freeze();
+ u = Memo.Reserve(fullLen);
+ if (!u) {
+ if (fullLen > Memo.GetPageSize()) {
+ ythrow yexception() << "Size of element and " << len << " size of extInfo " << extSize
+ << " is larger than page size " << Memo.GetPageSize();
+ }
+ ythrow yexception() << "going to insert a null pointer. Bad.";
+ }
+ }
+ out_long((i64)extSize, (char*)u + len);
+ TSorter::PushWithExtInfo(u);
+ return u;
+ }
+
+ char* GetReservedExt(TVal* rec, size_t len, size_t extSize) {
+ return (char*)rec + len + len_long((i64)extSize);
+ }
+
+ const TVal* Push(const TVal* v, const typename TExtInfoType<TVal>::TResult* extInfo = nullptr) {
+ const TVal* u = Memo.Push(v, extInfo);
+ if (!u) {
+ OpenIfNeeded();
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Freeze();
+ u = Memo.Push(v, extInfo);
+ if (!u) {
+ if (SizeOf(v) > Memo.GetPageSize()) {
+ ythrow yexception() << "Size of element " << SizeOf(v)
+ << " is larger than page size " << Memo.GetPageSize();
+ }
+ ythrow yexception() << "going to insert a null pointer. Bad.";
+ }
+ }
+ TSorter::PushWithExtInfo(u);
+ return u;
+ }
+
+ const TVal* Push(const TVal* v, const ui8* extInfoRaw, size_t extLen) {
+ const TVal* u = Memo.Push(v, extInfoRaw, extLen);
+ if (!u) {
+ OpenIfNeeded();
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Freeze();
+ u = Memo.Push(v, extInfoRaw, extLen);
+ if (!u) {
+ if (SizeOf(v) > Memo.GetPageSize()) {
+ ythrow yexception() << "Size of element " << SizeOf(v)
+ << " is larger than page size " << Memo.GetPageSize();
+ }
+ ythrow yexception() << "going to insert a null pointer. Bad..";
+ }
+ }
+ TSorter::PushWithExtInfo(u);
+ return u;
+ }
+
+ const TVal* PushWithExtInfo(const TVal* v) {
+ const TVal* u = Memo.PushWithExtInfo(v);
+ if (!u) {
+ OpenIfNeeded();
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Freeze();
+ u = Memo.PushWithExtInfo(v);
+ if (!u) {
+ if (SizeOf(v) > Memo.GetPageSize()) {
+ ythrow yexception() << "Size of element " << SizeOf(v)
+ << " is larger than page size " << Memo.GetPageSize();
+ }
+ ythrow yexception() << "going to insert a null pointer. Bad...";
+ }
+ }
+ TSorter::PushWithExtInfo(u);
+ return u;
+ }
+
+ void Sort(bool direct = false) {
+ if (Opened) {
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Close();
+ OpenReq = false;
+ TSorter::Sort(direct);
+ } else {
+ TSorter::SortPortion();
+ }
+ }
+
+ const TVal* Next() {
+ return Opened ? TSorter::Next() : TSorter::Nextp();
+ }
+
+ bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const {
+ return NMicroBDB::GetExtInfo(Current(), extInfo);
+ }
+
+ const ui8* GetExtInfoRaw(size_t* len) const {
+ return NMicroBDB::GetExtInfoRaw(Current(), len);
+ }
+
+ const TVal* Current() const {
+ return Opened ? TSorter::Current() : TSorter::Currentp();
+ }
+
+ int NextPortion() {
+ OpenIfNeeded();
+ return TSorter::NextPortion(UseDirectWrite);
+ }
+
+ void SortToFile(const char* name) {
+ OpenIfNeeded();
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Close();
+ OpenReq = false;
+ TSorter::SortToFile(name);
+ }
+
+ void SortToStream(TAutoPtr<IOutputStream> output) {
+ OpenIfNeeded();
+ TSorter::NextPortion(UseDirectWrite);
+ Memo.Close();
+ OpenReq = false;
+ TSorter::SortToStream(output);
+ }
+
+ template <typename TKey, typename TOutCompress>
+ void SortToDirectFile(const char* name, size_t ipagesize, size_t ipages) {
+ Sort();
+ TOutDirectFile<TVal, TKey, TOutCompress> out(TSorter::Name, TSorter::PageSize, TSorter::Pages, ipagesize, ipages, TSorter::PagesOrBytes);
+ out.Open(name);
+ while (const TVal* rec = Next())
+ out.PushWithExtInfo(rec);
+ out.Close();
+ }
+
+ template <typename TKey>
+ void SortToDirectFile(const char* name, size_t ipagesize, size_t ipages) {
+ SortToDirectFile<TKey, TCompress>(name, ipagesize, ipages);
+ }
+
+ void CloseSorter() {
+ if (Opened)
+ TSorter::Close();
+ else
+ TSorter::Closep();
+ Memo.Freeze();
+ Opened = false;
+ }
+
+ void Close() {
+ if (Opened)
+ TSorter::Close();
+ else
+ TSorter::Closep();
+ Memo.Close();
+ OpenReq = false;
+ Opened = false;
+ }
+
+ int SavePortions(const char* mask) {
+ return TSorter::SavePortions(mask, UseDirectWrite);
+ }
+
+public:
+ using TSorter::RestorePortions;
+};
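+
+// TDatSorterMemo keeps records in the in-memory Memo file and opens the
+// disk-backed sorter (and its temp directory) only once a portion has to be
+// spilled.  A sketch with placeholder TMyRec and TMyCmp:
+//
+//     TDatSorterMemo<TMyRec, TMyCmp> sorter("mysort", /*memory=*/64 << 20,
+//                                           /*pagesize=*/4096, /*pages=*/16);
+//     sorter.Open("/tmp");          // used only if the data spills to disk
+//     TMyRec rec;                   // fill and push repeatedly
+//     sorter.Push(&rec);
+//     sorter.Sort();
+//     while (const TMyRec* r = sorter.Next())
+//         Use(*r);
+//     sorter.Close();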
+
+template <typename TVal, template <typename T> class TCompare, typename TCompress = TFakeCompression,
+ typename TSieve = TFakeSieve<TVal>, class TPageFile = TOutputPageFile, class TFileTypes = TDefInterFileTypes>
+class TDatSorterMemoArray: public TSorterArray<TDatSorterMemo<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes>> {
+public:
+ typedef TSorterArray<TDatSorterMemo<TVal, TCompare, TCompress, TSieve, TPageFile, TFileTypes>> TBase;
+
+ TDatSorterMemoArray(const char* name, size_t memory, size_t pagesize, size_t pages, int pagesOrBytes = 1)
+ : TBase(name, memory, pagesize, pages, pagesOrBytes)
+ {
+ }
+};
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/library/cpp/microbdb/sorter.h b/library/cpp/microbdb/sorter.h
new file mode 100644
index 0000000000..b2e7390377
--- /dev/null
+++ b/library/cpp/microbdb/sorter.h
@@ -0,0 +1,677 @@
+#pragma once
+
+#include <util/ysaveload.h>
+#include <util/generic/algorithm.h>
+#include <contrib/libs/libc_compat/include/link/link.h>
+
+#include "header.h"
+#include "heap.h"
+#include "extinfo.h"
+#include "input.h"
+#include "output.h"
+
+#ifdef TEST_MERGE
+#define MBDB_SORT_FUN ::StableSort
+#else
+#define MBDB_SORT_FUN ::Sort
+#endif
+
+template <class TVal, class TCompare, typename TCompress, typename TSieve, typename TOutPageFile, typename TFileTypes>
+class TDatSorterImpl;
+
+template <class TVal>
+struct TFakeSieve {
+ static inline int Sieve(TVal*, const TVal*) noexcept {
+ return 0;
+ }
+};
+
+template <class TSieve>
+struct TIsSieveFake {
+ static const bool Result = false;
+};
+
+template <class T>
+struct TIsSieveFake<TFakeSieve<T>> {
+ static const bool Result = true;
+};
+
+class TDefInterFileTypes {
+public:
+ typedef TOutputPageFile TOutPageFile;
+ typedef TInputPageFile TInPageFile;
+};
+
+//class TCompressedInterFileTypes;
+
+template <class TVal, class TCompare, typename TCompress, typename TSieve, typename TOutPageFile = TOutputPageFile, typename TFileTypes = TDefInterFileTypes>
+class TDatSorterImplBase: protected THeapIter<TVal, TInDatFileImpl<TVal, TInputRecordIterator<TVal, TInputPageIterator<typename TFileTypes::TInPageFile>>>, TCompare> {
+ typedef TOutputRecordIterator<TVal, TOutputPageIterator<typename TFileTypes::TOutPageFile>, TFakeIndexer, TCompress> TTmpRecIter;
+ typedef TInputRecordIterator<TVal, TInputPageIterator<typename TFileTypes::TInPageFile>> TInTmpRecIter;
+
+public:
+ typedef TOutDatFileImpl<TVal, TTmpRecIter> TTmpOut;
+ typedef TInDatFileImpl<TVal, TInTmpRecIter> TTmpIn;
+
+ typedef TOutDatFileImpl<TVal, TOutputRecordIterator<TVal, TOutputPageIterator<TOutPageFile>, TFakeIndexer, TCompress>> TOut;
+ typedef THeapIter<TVal, TTmpIn, TCompare> TMyHeap;
+ typedef TVector<const TVal*> TMyVector;
+ typedef typename TMyVector::iterator TMyIterator;
+
+ class IPortionSorter {
+ public:
+ virtual ~IPortionSorter() {
+ }
+
+ virtual void Sort(TMyVector&, TTmpOut*) = 0;
+ };
+
+ class TDefaultSorter: public IPortionSorter {
+ public:
+ void Sort(TMyVector& vector, TTmpOut* out) override {
+ MBDB_SORT_FUN(vector.begin(), vector.end(), TCompare());
+
+ const typename TMyVector::const_iterator
+ end = (TIsSieveFake<TSieve>::Result) ? vector.end() : TDatSorterImplBase::SieveRange(vector.begin(), vector.end());
+
+ for (typename TMyVector::const_iterator it = vector.begin(); it != end; ++it) {
+ out->PushWithExtInfo(*it);
+ }
+ }
+ };
+
+ class TSegmentedSorter: public IPortionSorter {
+ class TAdaptor {
+ typedef typename TMyVector::const_iterator TConstIterator;
+
+ public:
+ TAdaptor(TConstIterator b, TConstIterator e)
+ : Curr_(b)
+ , End_(e)
+ {
+ --Curr_;
+ }
+
+ inline const TVal* Current() const {
+ return *Curr_;
+ }
+
+ inline const TVal* Next() {
+ ++Curr_;
+
+ if (Curr_ == End_) {
+ return nullptr;
+ }
+
+ return *Curr_;
+ }
+
+ private:
+ TConstIterator Curr_;
+ TConstIterator End_;
+ };
+
+ typedef THeapIter<TVal, TAdaptor, TCompare> TPortionsHeap;
+
+ public:
+ void Sort(TMyVector& vector, TTmpOut* out) override {
+ TVector<TAdaptor> bounds;
+ typename TMyVector::iterator
+ it = vector.begin();
+ const size_t portions = Max<size_t>(1, (vector.size() * sizeof(TVal)) / (4 << 20));
+ const size_t step = vector.size() / portions;
+
+ // Sort segments
+ while (it != vector.end()) {
+ const typename TMyVector::iterator
+ end = Min(it + step, vector.end());
+
+ MBDB_SORT_FUN(it, end, TCompare());
+
+ bounds.push_back(TAdaptor(it, end));
+
+ it = end;
+ }
+
+ //
+ // Merge result
+ //
+
+ TPortionsHeap heap(bounds);
+
+ if (TIsSieveFake<TSieve>::Result) {
+ while (const TVal* val = heap.Next()) {
+ out->PushWithExtInfo(val);
+ }
+ } else {
+ const TVal* val = heap.Next();
+ const TVal* prev = out->PushWithExtInfo(val);
+
+ for (val = heap.Next(); val && prev; val = heap.Next()) {
+ if (TSieve::Sieve((TVal*)prev, val)) {
+ continue;
+ }
+
+ prev = out->PushWithExtInfo(val);
+ }
+
+ if (prev) {
+ TSieve::Sieve((TVal*)prev, prev);
+ }
+ }
+ }
+ };
+
+public:
+ TDatSorterImplBase()
+ : Sorter(new TDefaultSorter)
+ {
+ InFiles = nullptr;
+ TempBuf = nullptr;
+ Ptr = Vector.end();
+ Cur = nullptr;
+ Portions = CPortions = Error = 0;
+ }
+
+ ~TDatSorterImplBase() {
+ Close();
+ }
+
+ int Open(const char* templ, size_t pagesize, size_t pages, int pagesOrBytes = 1) {
+ Portions = CPortions = Error = 0;
+ TempBuf = strdup(templ);
+ Pagesize = pagesize;
+ if (pagesOrBytes)
+ Pages = pages;
+ else
+ Pages = pages / pagesize;
+ Pages = Max(1, Pages);
+ return 0;
+ }
+
+ void Push(const TVal* v) {
+        // Serialized extInfo must immediately follow the record being pushed. To avoid
+        // unintentional misuse (e.g. when you add TExtInfo to your record type, you may
+        // forget to update your sorting routines and get a segfault as a result),
+        // Push() is rejected at compile time for such records: PushWithExtInfo(v) must
+        // be called on records with extInfo.
+ static_assert(!TExtInfoType<TVal>::Exists, "expect !TExtInfoType<TVal>::Exists");
+
+ Vector.push_back(v);
+ }
+
+ void PushWithExtInfo(const TVal* v) {
+ Vector.push_back(v);
+ }
+
+ int SortPortion() {
+ Ptr = Vector.end();
+ Cur = nullptr;
+ if (!Vector.size() || Error)
+ return Error;
+
+ MBDB_SORT_FUN(Vector.begin(), Vector.end(), TCompare());
+
+ if (!TIsSieveFake<TSieve>::Result) {
+ const typename TMyVector::iterator
+ end = SieveRange(Vector.begin(), Vector.end());
+
+ Vector.resize(end - Vector.begin());
+ }
+
+ Ptr = Vector.begin();
+ Cur = nullptr;
+ return 0;
+ }
+
+ const TVal* Nextp() {
+ Cur = Ptr == Vector.end() ? nullptr : *Ptr++;
+ return Cur;
+ }
+
+ const TVal* Currentp() const {
+ return Cur;
+ }
+
+ void Closep() {
+ Vector.clear();
+ Ptr = Vector.end();
+ Cur = nullptr;
+ }
+
+ int NextPortion(bool direct = false) {
+ if (!Vector.size() || Error)
+ return Error;
+
+ TTmpOut out;
+ int ret, ret1;
+ char fname[FILENAME_MAX];
+
+ snprintf(fname, sizeof(fname), TempBuf, Portions++);
+ if ((ret = out.Open(fname, Pagesize, Pages, 1, direct)))
+ return Error = ret;
+
+ Sorter->Sort(Vector, &out);
+
+ Vector.erase(Vector.begin(), Vector.end());
+ ret = out.GetError();
+ ret1 = out.Close();
+ Error = Error ? Error : ret ? ret : ret1;
+ if (Error)
+ unlink(fname);
+ return Error;
+ }
+
+ int SavePortions(const char* mask, bool direct = false) {
+ char srcname[PATH_MAX], dstname[PATH_MAX];
+ if (Vector.size())
+ NextPortion(direct);
+ for (int i = 0; i < Portions; i++) {
+ char num[10];
+ sprintf(num, "%i", i);
+ snprintf(srcname, sizeof(srcname), TempBuf, i);
+ snprintf(dstname, sizeof(dstname), mask, num);
+ int res = rename(srcname, dstname);
+ if (res)
+ return res;
+ }
+ snprintf(dstname, sizeof(dstname), mask, "count");
+ TOFStream fcount(dstname);
+ Save(&fcount, Portions);
+ fcount.Finish();
+ return 0;
+ }
+
+ int RestorePortions(const char* mask) {
+ char srcname[PATH_MAX], dstname[PATH_MAX];
+ snprintf(srcname, sizeof(srcname), mask, "count");
+ TIFStream fcount(srcname);
+ Load(&fcount, Portions);
+ for (int i = 0; i < Portions; i++) {
+ char num[10];
+ sprintf(num, "%i", i);
+ snprintf(dstname, sizeof(dstname), TempBuf, i);
+ snprintf(srcname, sizeof(srcname), mask, num);
+ unlink(dstname);
+ int res = link(srcname, dstname);
+ if (res)
+ return res;
+ }
+ return 0;
+ }
+
+ int RestorePortions(const char* mask, ui32 count) {
+ char srcname[PATH_MAX], dstname[PATH_MAX];
+ ui32 portions;
+ TVector<ui32> counts;
+ for (ui32 j = 0; j < count; j++) {
+ snprintf(srcname, sizeof(srcname), mask, j, "count");
+ TIFStream fcount(srcname);
+ Load(&fcount, portions);
+ counts.push_back(portions);
+ Portions += portions;
+ }
+ ui32 p = 0;
+ for (ui32 j = 0; j < count; j++) {
+ int cnt = counts[j];
+ for (int i = 0; i < cnt; i++, p++) {
+ char num[10];
+ sprintf(num, "%i", i);
+ snprintf(dstname, sizeof(dstname), TempBuf, p);
+ snprintf(srcname, sizeof(srcname), mask, j, num);
+ unlink(dstname);
+ int res = link(srcname, dstname);
+ if (res) {
+                    fprintf(stderr, "Cannot link %s to %s\n", srcname, dstname);
+ return res;
+ }
+ }
+ }
+ return 0;
+ }
+
+ int Sort(size_t memory, int maxportions = 1000, bool direct = false) {
+ int ret, end, beg, i;
+ char fname[FILENAME_MAX];
+
+ if (Vector.size())
+ NextPortion();
+
+ if (Error)
+ return Error;
+ if (!Portions) {
+ TMyHeap::Init(&DummyFile, 1); // closed file
+ HPages = 1;
+ return 0;
+ }
+
+ Optimize(memory, maxportions);
+ if (!(InFiles = new TTmpIn[MPortions]))
+ return MBDB_NO_MEMORY;
+
+ for (beg = 0; beg < Portions && !Error; beg = end) {
+ end = (int)Min(beg + FPortions, Portions);
+ for (i = beg; i < end && !Error; i++) {
+ snprintf(fname, sizeof(fname), TempBuf, i);
+ if ((ret = InFiles[i - beg].Open(fname, HPages, 1, nullptr, direct)))
+ Error = Error ? Error : ret;
+ }
+ if (Error)
+ return Error;
+ TMyHeap::Init(InFiles, end - beg);
+ if (end != Portions) {
+ TTmpOut out;
+ const TVal* v;
+ snprintf(fname, sizeof(fname), TempBuf, Portions++);
+ if ((ret = out.Open(fname, Pagesize, HPages)))
+ return Error = Error ? Error : ret;
+ while ((v = TMyHeap::Next()))
+ out.PushWithExtInfo(v);
+ ret = out.GetError();
+ Error = Error ? Error : ret;
+ ret = out.Close();
+ Error = Error ? Error : ret;
+ for (i = beg; i < end; i++) {
+ ret = InFiles[i - beg].Close();
+ Error = Error ? Error : ret;
+ snprintf(fname, sizeof(fname), TempBuf, CPortions++);
+ unlink(fname);
+ }
+ }
+ FPortions = MPortions;
+ }
+ return Error;
+ }
+
+ int Close() {
+ char fname[FILENAME_MAX];
+ delete[] InFiles;
+ InFiles = nullptr;
+ Closep();
+ for (int i = CPortions; i < Portions; i++) {
+ snprintf(fname, sizeof(fname), TempBuf, i);
+ unlink(fname);
+ }
+ CPortions = Portions = 0;
+ free(TempBuf);
+ TempBuf = nullptr;
+ return Error;
+ }
+
+ void UseSegmentSorter() {
+ Sorter.Reset(new TSegmentedSorter);
+ }
+
+ inline int GetError() const {
+ return Error;
+ }
+
+ inline int GetPages() const {
+ return Pages;
+ }
+
+ inline int GetPageSize() const {
+ return Pagesize;
+ }
+
+private:
+ static TMyIterator SieveRange(const TMyIterator begin, const TMyIterator end) {
+ TMyIterator it = begin;
+ TMyIterator prev = begin;
+
+ for (++it; it != end; ++it) {
+ if (TSieve::Sieve((TVal*)*prev, *it)) {
+ continue;
+ }
+
+ ++prev;
+
+ if (it != prev) {
+ *prev = *it;
+ }
+ }
+
+ TSieve::Sieve((TVal*)*prev, *prev);
+
+ return ++prev;
+ }
+
+protected:
+ void Optimize(size_t memory, int maxportions, size_t fbufmax = 256u << 20) {
+ maxportions = (int)Min((size_t)maxportions, memory / Pagesize) - 1;
+ size_t maxpages = Max((size_t)1u, fbufmax / Pagesize);
+
+ if (maxportions <= 2) {
+ FPortions = MPortions = 2;
+ HPages = 1;
+ return;
+ }
+ if (maxportions >= Portions) {
+ FPortions = MPortions = Portions;
+ HPages = (int)Min(memory / ((Portions + 1) * Pagesize), maxpages);
+ return;
+ }
+ if (((Portions + maxportions - 1) / maxportions) <= maxportions) {
+ while (((Portions + maxportions - 1) / maxportions) <= maxportions)
+ --maxportions;
+ MPortions = ++maxportions;
+ int total = ((Portions + maxportions - 1) / maxportions) + Portions;
+ FPortions = (total % maxportions) ? (total % maxportions) : MPortions;
+ HPages = (int)Min(memory / ((MPortions + 1) * Pagesize), maxpages);
+ return;
+ }
+ FPortions = MPortions = maxportions;
+ HPages = (int)Min(memory / ((MPortions + 1) * Pagesize), maxpages);
+ }
+
+ TMyVector Vector;
+ typename TMyVector::iterator Ptr;
+ const TVal* Cur;
+ TTmpIn *InFiles, DummyFile;
+ char* TempBuf;
+ int Portions, CPortions, Pagesize, Pages, Error;
+ int FPortions, MPortions, HPages;
+ THolder<IPortionSorter> Sorter;
+};
+
+template <class TVal, class TCompare, typename TCompress>
+class TDatSorterImpl<TVal, TCompare, TCompress, TFakeSieve<TVal>, TOutputPageFile, TDefInterFileTypes>
+ : public TDatSorterImplBase<TVal, TCompare, TCompress, TFakeSieve<TVal>, TOutputPageFile, TDefInterFileTypes> {
+ typedef TDatSorterImplBase<TVal, TCompare, TCompress, TFakeSieve<TVal>, TOutputPageFile, TDefInterFileTypes> TBase;
+
+public:
+ int SortToFile(const char* name, size_t memory, int maxportions = 1000) {
+ int ret = TBase::Sort(memory, maxportions);
+ if (ret)
+ return ret;
+ typename TBase::TOut out;
+ if ((ret = out.Open(name, TBase::Pagesize, TBase::HPages)))
+ return ret;
+ const TVal* rec;
+ while ((rec = Next()))
+ out.PushWithExtInfo(rec);
+ if ((ret = out.GetError()))
+ return ret;
+ if ((ret = out.Close()))
+ return ret;
+ if ((ret = TBase::Close()))
+ return ret;
+ return 0;
+ }
+
+ int SortToStream(TAutoPtr<IOutputStream> output, size_t memory, int maxportions = 1000) {
+ int ret = TBase::Sort(memory, maxportions);
+ if (ret)
+ return ret;
+ typename TBase::TOut out;
+ if ((ret = out.Open(output, TBase::Pagesize, TBase::HPages)))
+ return ret;
+ const TVal* rec;
+ while ((rec = Next()))
+ out.PushWithExtInfo(rec);
+ if ((ret = out.GetError()))
+ return ret;
+ if ((ret = out.Close()))
+ return ret;
+ if ((ret = TBase::Close()))
+ return ret;
+ return 0;
+ }
+
+ const TVal* Next() {
+ return TBase::TMyHeap::Next();
+ }
+
+ const TVal* Current() const {
+ return TBase::TMyHeap::Current();
+ }
+
+ bool GetExtInfo(typename TExtInfoType<TVal>::TResult* extInfo) const {
+ return TBase::TMyHeap::GetExtInfo(extInfo);
+ }
+
+ const ui8* GetExtInfoRaw(size_t* len) const {
+ return TBase::TMyHeap::GetExtInfoRaw(len);
+ }
+};
+
+template <class TVal, class TCompare, typename TCompress, typename TSieve,
+ typename TOutPageFile = TOutputPageFile, typename TFileTypes = TDefInterFileTypes>
+class TDatSorterImpl: public TDatSorterImplBase<TVal, TCompare, TCompress, TSieve, TOutPageFile, TFileTypes> {
+ typedef TDatSorterImplBase<TVal, TCompare, TCompress, TSieve, TOutPageFile, TFileTypes> TBase;
+
+public:
+ TDatSorterImpl()
+ : Cur(nullptr)
+ , Prev(nullptr)
+ {
+ }
+
+ int SortToFile(const char* name, size_t memory, int maxportions = 1000) {
+ int ret = Sort(memory, maxportions);
+ if (ret)
+ return ret;
+ typename TBase::TOut out;
+ if ((ret = out.Open(name, TBase::Pagesize, TBase::HPages)))
+ return ret;
+ const TVal* rec;
+ while ((rec = Next()))
+ out.PushWithExtInfo(rec);
+ if ((ret = out.GetError()))
+ return ret;
+ if ((ret = out.Close()))
+ return ret;
+ if ((ret = TBase::Close()))
+ return ret;
+ return 0;
+ }
+
+ int SortToStream(TAutoPtr<IOutputStream> output, size_t memory, int maxportions = 1000) {
+ int ret = Sort(memory, maxportions);
+ if (ret)
+ return ret;
+ typename TBase::TOut out;
+ if ((ret = out.Open(output, TBase::Pagesize, TBase::HPages)))
+ return ret;
+ const TVal* rec;
+ while ((rec = Next()))
+ out.PushWithExtInfo(rec);
+ if ((ret = out.GetError()))
+ return ret;
+ if ((ret = out.Close()))
+ return ret;
+ if ((ret = TBase::Close()))
+ return ret;
+ return 0;
+ }
+
+ int Open(const char* templ, size_t pagesize, size_t pages, int pagesOrBytes = 1) {
+ int res = TBase::Open(templ, pagesize, pages, pagesOrBytes);
+ Prev = nullptr;
+ Cur = nullptr;
+ return res;
+ }
+
+ int Sort(size_t memory, int maxportions = 1000, bool direct = false) {
+ int res = TBase::Sort(memory, maxportions, direct);
+ if (!res) {
+ const TVal* rec = TBase::TMyHeap::Next();
+ if (rec) {
+ size_t els, es;
+ size_t sz = NMicroBDB::SizeOfExt(rec, &els, &es);
+ sz += els + es;
+ if (!TExtInfoType<TVal>::Exists)
+ Cur = (TVal*)malloc(sizeof(TVal));
+ else
+ Cur = (TVal*)malloc(TBase::Pagesize);
+ memcpy(Cur, rec, sz);
+ }
+ }
+ return res;
+ }
+
+    // Prev = the last record returned
+    // Cur  = the record currently being accumulated with TSieve
+
+ const TVal* Next() {
+ if (!Cur) {
+ if (Prev) {
+ free(Prev);
+ Prev = nullptr;
+ }
+ return nullptr;
+ }
+ const TVal* rec;
+
+ if (TIsSieveFake<TSieve>::Result)
+ rec = TBase::TMyHeap::Next();
+ else {
+ do {
+ rec = TBase::TMyHeap::Next();
+ } while (rec && TSieve::Sieve((TVal*)Cur, rec));
+ }
+
+ if (!Prev) {
+ if (!TExtInfoType<TVal>::Exists)
+ Prev = (TVal*)malloc(sizeof(TVal));
+ else
+ Prev = (TVal*)malloc(TBase::Pagesize);
+ }
+ size_t els, es;
+ size_t sz = NMicroBDB::SizeOfExt(Cur, &els, &es);
+ sz += els + es;
+ memcpy(Prev, Cur, sz);
+
+ if (rec) {
+ sz = NMicroBDB::SizeOfExt(rec, &els, &es);
+ sz += els + es;
+ memcpy(Cur, rec, sz);
+ } else {
+ TSieve::Sieve((TVal*)Cur, Cur);
+ free(Cur);
+ Cur = nullptr;
+ }
+ return Prev;
+ }
+
+ const TVal* Current() const {
+ return Prev;
+ }
+
+ int Close() {
+ int res = TBase::Close();
+ if (Prev) {
+ free(Prev);
+ Prev = nullptr;
+ }
+ if (Cur) {
+ free(Cur);
+ Cur = nullptr;
+ }
+ return res;
+ }
+
+protected:
+ TVal* Cur;
+ TVal* Prev;
+};
diff --git a/library/cpp/microbdb/sorterdef.h b/library/cpp/microbdb/sorterdef.h
new file mode 100644
index 0000000000..8834b5fff8
--- /dev/null
+++ b/library/cpp/microbdb/sorterdef.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#define MAKESORTERTMPL(TRecord, MemberFunc) \
+ template <typename T> \
+ struct MemberFunc; \
+ template <> \
+ struct MemberFunc<TRecord> { \
+ bool operator()(const TRecord* l, const TRecord* r) { \
+ return TRecord ::MemberFunc(l, r) < 0; \
+ } \
+ int operator()(const TRecord* l, const TRecord* r, int) { \
+ return TRecord ::MemberFunc(l, r); \
+ } \
+ }
+
+template <typename T>
+static inline int compare(const T& a, const T& b) {
+ return (a < b) ? -1 : (a > b);
+}
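+
+// Illustrative usage sketch (TMyRec and its static ByKey() comparator are assumptions,
+// shown only to demonstrate how MAKESORTERTMPL is expected to be invoked):
+//
+//   struct TMyRec {
+//       ui32 Key;
+//       static int ByKey(const TMyRec* a, const TMyRec* b) {
+//           return compare(a->Key, b->Key);
+//       }
+//   };
+//   MAKESORTERTMPL(TMyRec, ByKey);
+//
+// The expansion defines ByKey<TMyRec> with both the bool comparator used for sorting
+// and the three-way comparator, so ByKey can then be passed as the TCompare template
+// argument of the sorters (e.g. SortData<TMyRec, ByKey> or TDatSorterMemo<TMyRec, ByKey>).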
diff --git a/library/cpp/microbdb/utility.h b/library/cpp/microbdb/utility.h
new file mode 100644
index 0000000000..5c86061bca
--- /dev/null
+++ b/library/cpp/microbdb/utility.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include "microbdb.h"
+
+template <class TRecord, template <class T> class TCompare>
+int SortData(const TFile& ifile, const TFile& ofile, const TDatMetaPage* meta, size_t memory, const char* tmpDir = nullptr) {
+ char templ[FILENAME_MAX];
+ TInDatFileImpl<TRecord> datin;
+ TOutDatFileImpl<TRecord> datout;
+ TDatSorterImpl<TRecord, TCompare<TRecord>, TFakeCompression, TFakeSieve<TRecord>> sorter;
+ const TRecord* u;
+ int ret;
+
+ const size_t minMemory = (2u << 20);
+ memory = Max(memory, minMemory + minMemory / 2);
+ if (datin.Open(ifile, meta, memory - minMemory, 0))
+ err(1, "can't read input file");
+
+ size_t outpages = Max((size_t)2u, minMemory / datin.GetPageSize());
+ memory -= outpages * datin.GetPageSize();
+
+ if (ret = MakeSorterTempl(templ, tmpDir))
+ err(1, "can't create tempdir in \"%s\"; error: %d\n", templ, ret);
+
+ if (sorter.Open(templ, datin.GetPageSize(), outpages)) {
+ *strrchr(templ, LOCSLASH_C) = 0;
+ RemoveDirWithContents(templ);
+ err(1, "can't open sorter");
+ }
+
+ while (1) {
+ datin.Freeze();
+ while ((u = datin.Next()))
+ sorter.PushWithExtInfo(u);
+ sorter.NextPortion();
+ if (datin.GetError() || datin.IsEof())
+ break;
+ }
+
+ if (datin.GetError()) {
+ *strrchr(templ, LOCSLASH_C) = 0;
+ RemoveDirWithContents(templ);
+ err(1, "in data file error %d", datin.GetError());
+ }
+ if (datin.Close()) {
+ *strrchr(templ, LOCSLASH_C) = 0;
+ RemoveDirWithContents(templ);
+ err(1, "can't close in data file");
+ }
+
+ sorter.Sort(memory);
+
+ if (datout.Open(ofile, datin.GetPageSize(), outpages)) {
+ *strrchr(templ, LOCSLASH_C) = 0;
+ RemoveDirWithContents(templ);
+ err(1, "can't write out file");
+ }
+
+ while ((u = sorter.Next()))
+ datout.PushWithExtInfo(u);
+
+ if (sorter.GetError())
+ err(1, "sorter error %d", sorter.GetError());
+ if (sorter.Close())
+ err(1, "can't close sorter");
+
+ *strrchr(templ, LOCSLASH_C) = 0;
+ RemoveDirWithContents(templ);
+
+ if (datout.GetError())
+ err(1, "out data file error %d", datout.GetError());
+ if (datout.Close())
+ err(1, "can't close out data file");
+ return 0;
+}
diff --git a/library/cpp/microbdb/wrappers.h b/library/cpp/microbdb/wrappers.h
new file mode 100644
index 0000000000..38eb8edebc
--- /dev/null
+++ b/library/cpp/microbdb/wrappers.h
@@ -0,0 +1,637 @@
+#pragma once
+
+#include "microbdb.h"
+
+#define MAKEFILTERTMPL(TRecord, MemberFunc, NS) \
+ template <typename T> \
+ struct MemberFunc; \
+ template <> \
+ struct MemberFunc<TRecord> { \
+ bool operator()(const TRecord* r) { \
+ return NS::MemberFunc(r); \
+ } \
+ }
+
+#define MAKEJOINTMPL(TRecordA, TRecordB, MemberFunc, NS, TMergeType) \
+ template <typename A, typename B> \
+ struct MemberFunc; \
+ template <> \
+ struct MemberFunc<TRecordA, TRecordB> { \
+ int operator()(const TRecordA* l, const TRecordB* r) { \
+ return NS::MemberFunc(l, r); \
+ } \
+ }; \
+ typedef TMergeRec<TRecordA, TRecordB> TMergeType
+
+#define MAKEJOINTMPL2(TRecordA, TRecordB, MemberFunc, StructName, TMergeType) \
+ template <typename A, typename B> \
+ struct StructName; \
+ template <> \
+ struct StructName<TRecordA, TRecordB> { \
+ int operator()(const TRecordA* l, const TRecordB* r) { \
+ return MemberFunc(l, r); \
+ } \
+ }; \
+ typedef TMergeRec<TRecordA, TRecordB> TMergeType
+
+#define MAKEJOINTMPLLEFT(TRecordA, TRecordB, MemberFunc, NS, TMergeType) \
+ template <typename A, typename B> \
+ struct MemberFunc; \
+ template <> \
+ struct MemberFunc<TRecordA, TRecordB> { \
+ int operator()(const TRecordA* l, const TRecordB* r) { \
+ return NS::MemberFunc(l->RecA, r); \
+ } \
+ }; \
+ typedef TMergeRec<TRecordA, TRecordB> TMergeType
+
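+// Illustrative usage sketch (TDocRec, THostRec, NDoc::CompareByHostId and the two
+// IDatNextSource instances docs and hosts are assumptions, not part of this header).
+// MAKEJOINTMPL generates the two-record comparator plus the merge-record typedef
+// consumed by TNextDatMerger, defined further below:
+//
+//   MAKEJOINTMPL(TDocRec, THostRec, CompareByHostId, NDoc, TDocHostRec);
+//
+//   TNextDatMerger<TDocRec, THostRec, CompareByHostId> merger(docs, hosts, MT_JOIN);
+//   while (const TDocHostRec* rec = merger.Next()) {
+//       // rec->RecA and rec->RecB point at the matched records
+//   }
+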
+template <class TRec>
+class IDatNextSource {
+public:
+ virtual const TRec* Next() = 0;
+ virtual void Work() {
+ }
+};
+
+template <class TRec>
+class IDatNextReceiver {
+public:
+ IDatNextReceiver(IDatNextSource<TRec>& source)
+ : Source(source)
+ {
+ }
+
+ virtual void Work() {
+ Source.Work();
+ }
+
+protected:
+ IDatNextSource<TRec>& Source;
+};
+
+template <class TInRec, class TOutRec>
+class IDatNextChannel: public IDatNextReceiver<TInRec>, public IDatNextSource<TOutRec> {
+public:
+ IDatNextChannel(IDatNextSource<TInRec>& source)
+ : IDatNextReceiver<TInRec>(source)
+ {
+ }
+
+ virtual void Work() {
+ IDatNextReceiver<TInRec>::Work();
+ }
+};
+
+class IDatWorker {
+public:
+ virtual void Work() = 0;
+};
+
+template <class TRec>
+class IDatPushReceiver {
+public:
+ virtual void Push(const TRec* rec) = 0;
+ virtual void Work() = 0;
+};
+
+template <class TRec>
+class IDatPushSource {
+public:
+ IDatPushSource(IDatPushReceiver<TRec>& receiver)
+ : Receiver(receiver)
+ {
+ }
+
+ virtual void Work() {
+ Receiver.Work();
+ }
+
+protected:
+ IDatPushReceiver<TRec>& Receiver;
+};
+
+template <class TInRec, class TOutRec>
+class IDatPushChannel: public IDatPushReceiver<TInRec>, public IDatPushSource<TOutRec> {
+public:
+ IDatPushChannel(IDatPushReceiver<TOutRec>& receiver)
+ : IDatPushSource<TOutRec>(receiver)
+ {
+ }
+
+ virtual void Work() {
+ IDatPushSource<TOutRec>::Work();
+ }
+};
+
+template <class TRec>
+class IDatNextToPush: public IDatNextReceiver<TRec>, public IDatPushSource<TRec> {
+ typedef IDatNextReceiver<TRec> TNextReceiver;
+ typedef IDatPushSource<TRec> TPushSource;
+
+public:
+ IDatNextToPush(IDatNextSource<TRec>& source, IDatPushReceiver<TRec>& receiver)
+ : TNextReceiver(source)
+ , TPushSource(receiver)
+ {
+ }
+
+ virtual void Work() {
+ const TRec* rec;
+ while (rec = TNextReceiver::Source.Next())
+ TPushSource::Receiver.Push(rec);
+ TPushSource::Work();
+ TNextReceiver::Work();
+ }
+};
+
+template <class TRec>
+class TDatNextPNSplitter: public IDatNextReceiver<TRec>, public IDatNextSource<TRec>, public IDatPushSource<TRec> {
+public:
+ TDatNextPNSplitter(IDatNextSource<TRec>& source, IDatPushReceiver<TRec>& receiver)
+ : IDatNextReceiver<TRec>(source)
+ , IDatNextSource<TRec>()
+ , IDatPushSource<TRec>(receiver)
+ {
+ }
+
+ const TRec* Next() {
+ const TRec* rec = IDatNextReceiver<TRec>::Source.Next();
+ if (rec) {
+ IDatPushSource<TRec>::Receiver.Push(rec);
+ return rec;
+ } else {
+ return 0;
+ }
+ }
+
+ virtual void Work() {
+ IDatNextReceiver<TRec>::Work();
+ IDatPushSource<TRec>::Work();
+ }
+};
+
+template <class TRec, class TOutRecA = TRec, class TOutRecB = TRec>
+class TDatPushPPSplitter: public IDatPushReceiver<TRec>, public IDatPushSource<TOutRecA>, public IDatPushSource<TOutRecB> {
+public:
+ TDatPushPPSplitter(IDatPushReceiver<TOutRecA>& receiverA, IDatPushReceiver<TOutRecB>& receiverB)
+ : IDatPushSource<TOutRecA>(receiverA)
+ , IDatPushSource<TOutRecB>(receiverB)
+ {
+ }
+
+ void Push(const TRec* rec) {
+ IDatPushSource<TOutRecA>::Receiver.Push(rec);
+ IDatPushSource<TOutRecB>::Receiver.Push(rec);
+ }
+
+ void Work() {
+ IDatPushSource<TOutRecA>::Work();
+ IDatPushSource<TOutRecB>::Work();
+ }
+};
+
+template <class TRec>
+class TFastInDatFile: public TInDatFile<TRec>, public IDatNextSource<TRec> {
+public:
+ typedef TInDatFile<TRec> Base;
+
+ TFastInDatFile(const char* name, bool open = true, size_t pages = dbcfg::fbufsize, int pagesOrBytes = 0)
+ : TInDatFile<TRec>(name, pages, pagesOrBytes)
+ , FileName(name)
+ {
+ if (open)
+ Base::Open(name);
+ }
+
+ void Open() {
+ Base::Open(FileName);
+ }
+
+ template <class TPassRec>
+ bool PassToUid(const TRec* inrec, const TPassRec* torec) {
+ inrec = Base::Current();
+ while (inrec && CompareUids(inrec, torec) < 0)
+ inrec = Base::Next();
+ return (inrec && CompareUids(inrec, torec) == 0);
+ }
+
+ void Work() {
+ Base::Close();
+ }
+
+ const TRec* Next() {
+ return Base::Next();
+ }
+
+private:
+ TString FileName;
+};
+
+template <class TRec>
+class TPushOutDatFile: public TOutDatFile<TRec>, public IDatPushReceiver<TRec> {
+public:
+ typedef TOutDatFile<TRec> Base;
+
+ TPushOutDatFile(const char* name, bool open = true)
+ : Base(name, dbcfg::pg_docuid, dbcfg::fbufsize, 0)
+ , FileName(name)
+ {
+ if (open)
+ Base::Open(name);
+ }
+
+ void Open() {
+ Base::Open(~FileName);
+ }
+
+ void Push(const TRec* rec) {
+ Base::Push(rec);
+ }
+
+ void Work() {
+ Base::Close();
+ }
+
+private:
+ TString FileName;
+};
+
+template <class TRec>
+class TNextOutDatFile: public IDatNextToPush<TRec> {
+public:
+ typedef IDatNextToPush<TRec> TBase;
+
+ TNextOutDatFile(const char* name, IDatNextSource<TRec>& source, bool open = true)
+ : TBase(source, File)
+ , File(name, open)
+ {
+ }
+
+ void Open() {
+ File.Open();
+ }
+
+private:
+ TPushOutDatFile<TRec> File;
+};
+
+template <class TVal, template <typename T> class TCompare>
+class TNextDatSorterMemo: public TDatSorterMemo<TVal, TCompare>, public IDatNextChannel<TVal, TVal> {
+ typedef TDatSorterMemo<TVal, TCompare> TImpl;
+
+public:
+ TNextDatSorterMemo(IDatNextSource<TVal>& source, const char* dir = dbcfg::fname_temp, const char* name = "yet another sorter", size_t memory = dbcfg::small_sorter_size, size_t pagesize = dbcfg::pg_docuid, size_t pages = dbcfg::fbufsize, int pagesOrBytes = 0)
+ : TImpl(name, memory, pagesize, pages, pagesOrBytes)
+ , IDatNextChannel<TVal, TVal>(source)
+ , Sorted(false)
+ {
+ TImpl::Open(dir);
+ }
+
+ void Sort() {
+ const TVal* rec;
+ while (rec = IDatNextChannel<TVal, TVal>::Source.Next()) {
+ TImpl::Push(rec);
+ }
+ TImpl::Sort();
+ Sorted = true;
+ }
+
+ const TVal* Next() {
+ if (!Sorted)
+ Sort();
+ return TImpl::Next();
+ }
+
+private:
+ bool Sorted;
+ TString Dir;
+};
+
+template <class TInRec, class TOutRec>
+class TDatConverter: public IDatNextChannel<TInRec, TOutRec> {
+public:
+ TDatConverter(IDatNextSource<TInRec>& source)
+ : IDatNextChannel<TInRec, TOutRec>(source)
+ {
+ }
+
+ virtual void Convert(const TInRec& inrec, TOutRec& outrec) {
+ outrec(inrec);
+ }
+
+ const TOutRec* Next() {
+ const TInRec* rec = IDatNextChannel<TInRec, TOutRec>::Source.Next();
+ if (!rec)
+ return 0;
+ Convert(*rec, CurrentRec);
+ return &CurrentRec;
+ }
+
+private:
+ TOutRec CurrentRec;
+};
+
+template <class TRecA, class TRecB>
+class TMergeRec {
+public:
+ const TRecA* RecA;
+ const TRecB* RecB;
+};
+
+enum NMergeTypes {
+ MT_JOIN = 0,
+ MT_ADD = 1,
+ MT_OVERWRITE = 2,
+ MT_TYPENUM
+};
+
+template <class TRecA, class TRecB, template <typename TA, typename TB> class TCompare>
+class TNextDatMerger: public IDatNextReceiver<TRecA>, public IDatNextReceiver<TRecB>, public IDatNextSource<TMergeRec<TRecA, TRecB>> {
+public:
+ TNextDatMerger(IDatNextSource<TRecA>& sourceA, IDatNextSource<TRecB>& sourceB, ui8 mergeType)
+ : IDatNextReceiver<TRecA>(sourceA)
+ , IDatNextReceiver<TRecB>(sourceB)
+ , MergeType(mergeType)
+ , MoveA(false)
+ , MoveB(false)
+ , NotInit(true)
+ {
+ }
+
+ const TMergeRec<TRecA, TRecB>* Next() {
+ if (MoveA || NotInit)
+ SourceARec = IDatNextReceiver<TRecA>::Source.Next();
+ if (MoveB || NotInit)
+ SourceBRec = IDatNextReceiver<TRecB>::Source.Next();
+ NotInit = false;
+
+ // Cout << "Next " << SourceARec->HostId << "\t" << SourceBRec->HostId << "\t" << TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) << "\t" << ::compare(SourceARec->HostId, SourceBRec->HostId) << "\t" << ::compare(1, 2) << "\t" << ::compare(2,1) << Endl;
+ if (MergeType == MT_ADD && SourceARec && (!SourceBRec || TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) < 0)) {
+ MergeRec.RecA = SourceARec;
+ MergeRec.RecB = 0;
+ MoveA = true;
+ MoveB = false;
+ return &MergeRec;
+ }
+
+        if (MergeType == MT_ADD && SourceBRec && (!SourceARec || TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) > 0)) {
+ MergeRec.RecA = 0;
+ MergeRec.RecB = SourceBRec;
+ MoveA = false;
+ MoveB = true;
+ return &MergeRec;
+ }
+
+ if (MergeType == MT_ADD && SourceARec && SourceBRec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) == 0) {
+ MergeRec.RecA = SourceARec;
+ MergeRec.RecB = SourceBRec;
+ MoveA = true;
+ MoveB = true;
+ return &MergeRec;
+ }
+
+ while (MergeType == MT_JOIN && SourceARec && SourceBRec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) != 0) {
+ while (SourceARec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) < 0) {
+ SourceARec = IDatNextReceiver<TRecA>::Source.Next();
+ }
+ while (SourceARec && SourceBRec && TCompare<TRecA, TRecB>()(SourceARec, SourceBRec) > 0) {
+ SourceBRec = IDatNextReceiver<TRecB>::Source.Next();
+ }
+ }
+
+ if (MergeType == MT_JOIN && SourceARec && SourceBRec) {
+ MergeRec.RecA = SourceARec;
+ MergeRec.RecB = SourceBRec;
+ MoveA = true;
+ MoveB = true;
+ return &MergeRec;
+ }
+
+ MergeRec.RecA = 0;
+ MergeRec.RecB = 0;
+ return 0;
+ }
+
+ void Work() {
+ IDatNextReceiver<TRecA>::Source.Work();
+ IDatNextReceiver<TRecB>::Source.Work();
+ }
+
+private:
+ TMergeRec<TRecA, TRecB> MergeRec;
+ const TRecA* SourceARec;
+ const TRecB* SourceBRec;
+ ui8 MergeType;
+ bool MoveA;
+ bool MoveB;
+ bool NotInit;
+};
+
+/*template<class TRec, class TSource, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> >
+class TPushDatMerger {
+public:
+ TPushDatMerger(TSource& source, TReceiver& receiver, ui8 mergeType)
+ : Source(source)
+ , Receiver(receiver)
+ , MergeType(mergeType)
+ {
+ }
+
+ virtual void Init() {
+ SourceRec = Source.Next();
+ }
+
+ virtual void Push(const TRec* rec) {
+ while (SourceRec && TCompare<TRec>()(SourceRec, rec, 0) < 0) {
+ if (MergeType == MT_OVERWRITE || MergeType == MT_ADD)
+ Receiver.Push(SourceRec);
+ SourceRec = Source.Next();
+ }
+
+ bool intersected = false;
+ while (SourceRec && TCompare<TRec>()(SourceRec, rec, 0) == 0) {
+ intersected = true;
+ if (MergeType == MT_ADD)
+ Receiver.Push(SourceRec);
+ SourceRec = Source.Next();
+ }
+
+ if (intersected && MergeType == MT_JOIN)
+ Receiver.Push(rec);
+
+ if (MergeType == MT_OVERWRITE || MergeType == MT_ADD)
+ Receiver.Push(rec);
+ }
+
+ virtual void Term() {
+ if (MergeType == MT_OVERWRITE || MergeType == MT_ADD) {
+ while (SourceRec) {
+ Receiver.Push(SourceRec);
+ SourceRec = Source.Next();
+ }
+ }
+ }
+
+private:
+ TSource& Source;
+ const TRec* SourceRec;
+ TReceiver& Receiver;
+ ui8 MergeType;
+};*/
+
+/*template <class TRec, class TSourceA, class TSourceB, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> >
+class TNextDatMerger: public TPushDatMerger<TRec, TSourceA, TCompare, TReceiver> {
+ typedef TPushDatMerger<TRec, TSourceA, TCompare, TReceiver> TImpl;
+public:
+ TNextDatMerger(TSourceA& sourceA, TSourceB& sourceB, TReceiver& receiver, ui8 mergeType)
+ : TImpl(sourceA, receiver, mergeType)
+ , SourceB(sourceB)
+ {
+ }
+
+ virtual void Work() {
+ TImpl::Init();
+ while (SourceBRec = SourceB.Next()) {
+ TImpl::Push(SourceBRec);
+ }
+ TImpl::Term();
+ }
+private:
+ TSourceB& SourceB;
+ const TRec* SourceBRec;
+};*/
+
+/*template <class TRec, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> >
+class TFilePushDatMerger: public TPushDatMerger<TRec, TFastInDatFile<TRec>, TCompare, TReceiver> {
+ typedef TPushDatMerger<TRec, TFastInDatFile<TRec>, TCompare, TReceiver> TImpl;
+public:
+ TFilePushDatMerger(const char* name, TReceiver& receiver, ui8 mergeType)
+ : TImpl(SourceFile, receiver, mergeType)
+ , SourceFile(name)
+ {
+ }
+
+ virtual void Push(const TRec* rec) {
+ TImpl::Push(rec);
+ }
+
+ virtual void Term() {
+ TImpl::Term();
+ }
+private:
+ TFastInDatFile<TRec> SourceFile;
+};*/
+
+/*template <class TRec, template <typename T> class TCompare, class TReceiver = TPushOutDatFile<TRec> >
+class TFileNextDatMerger: public TNextDatMerger<TRec, TFastInDatFile<TRec>, TFastInDatFile<TRec>, TCompare, TReceiver> {
+ typedef TNextDatMerger<TRec, TFastInDatFile<TRec>, TFastInDatFile<TRec>, TCompare, TReceiver> TImpl;
+public:
+ TFileNextDatMerger(const char* sourceAname, const char* sourceBname, TReceiver& receiver, ui8 mergeType)
+ : TImpl(FileA, FileB, receiver, mergeType)
+ , FileA(sourceAname)
+ , FileB(sourceBname)
+ {
+ }
+
+ virtual void Work() {
+ TImpl::Work();
+ }
+private:
+ TFastInDatFile<TRec> FileA;
+ TFastInDatFile<TRec> FileB;
+};*/
+
+template <class TRec, template <typename T> class TPredicate>
+class TDatNextFilter: public IDatNextChannel<TRec, TRec> {
+public:
+ TDatNextFilter(IDatNextSource<TRec>& source)
+ : IDatNextChannel<TRec, TRec>(source)
+ {
+ }
+
+ virtual const TRec* Next() {
+ const TRec* rec;
+ while ((rec = IDatNextChannel<TRec, TRec>::Source.Next()) != 0 && !Check(rec)) {
+ }
+ if (!rec)
+ return 0;
+ return rec;
+ }
+
+protected:
+ virtual bool Check(const TRec* rec) {
+ return TPredicate<TRec>()(rec);
+ }
+};
+
+template <class TRec, template <typename T> class TPredicate>
+class TDatPushFilter: public IDatPushChannel<TRec, TRec> {
+public:
+ TDatPushFilter(IDatPushReceiver<TRec>& receiver)
+ : IDatPushChannel<TRec, TRec>(receiver)
+ {
+ }
+
+ virtual void Push(const TRec* rec) {
+ if (Check(rec))
+ IDatPushChannel<TRec, TRec>::Receiver.Push(rec);
+ }
+
+private:
+ virtual bool Check(const TRec* rec) {
+ return TPredicate<TRec>()(rec);
+ }
+};
+
+template <class TInRec, class TOutRec, template <typename T> class TCompare>
+class TDatGrouper: public IDatNextChannel<TInRec, TOutRec> {
+public:
+ TDatGrouper(IDatNextSource<TInRec>& source)
+ : IDatNextChannel<TInRec, TOutRec>(source)
+ , Begin(true)
+ , Finish(false)
+ , HasOutput(false)
+ {
+ }
+
+ const TOutRec* Next() {
+ while (CurrentRec = IDatNextChannel<TInRec, TOutRec>::Source.Next()) {
+ int cmp = 0;
+ if (Begin) {
+ Begin = false;
+ OnStart();
+ } else if ((cmp = TCompare<TInRec>()(CurrentRec, LastRec, 0)) != 0) {
+ OnFinish();
+ OnStart();
+ }
+ OnRecord();
+ LastRec = CurrentRec;
+ if (HasOutput) {
+ HasOutput = false;
+ return &OutRec;
+ }
+ }
+ if (!Finish)
+ OnFinish();
+ Finish = true;
+ if (HasOutput) {
+ HasOutput = false;
+ return &OutRec;
+ }
+ return 0;
+ }
+
+protected:
+ virtual void OnStart() = 0;
+ virtual void OnRecord() = 0;
+ virtual void OnFinish() = 0;
+
+ const TInRec* CurrentRec;
+ const TInRec* LastRec;
+ TOutRec OutRec;
+
+ bool Begin;
+ bool Finish;
+ bool HasOutput;
+};
diff --git a/library/cpp/microbdb/ya.make b/library/cpp/microbdb/ya.make
new file mode 100644
index 0000000000..3e553f8535
--- /dev/null
+++ b/library/cpp/microbdb/ya.make
@@ -0,0 +1,36 @@
+LIBRARY()
+
+SRCS(
+ align.h
+ compressed.h
+ extinfo.h
+ file.cpp
+ hashes.h
+ header.h
+ header.cpp
+ heap.h
+ input.h
+ microbdb.cpp
+ noextinfo.proto
+ output.h
+ powersorter.h
+ reader.h
+ safeopen.h
+ sorter.h
+ sorterdef.h
+ utility.h
+ wrappers.h
+)
+
+PEERDIR(
+ contrib/libs/fastlz
+ contrib/libs/libc_compat
+ contrib/libs/protobuf
+ contrib/libs/snappy
+ contrib/libs/zlib
+ library/cpp/deprecated/fgood
+ library/cpp/on_disk/st_hash
+ library/cpp/packedtypes
+)
+
+END()
diff --git a/library/cpp/on_disk/CMakeLists.txt b/library/cpp/on_disk/CMakeLists.txt
index 4202947169..ade3b33c9a 100644
--- a/library/cpp/on_disk/CMakeLists.txt
+++ b/library/cpp/on_disk/CMakeLists.txt
@@ -7,3 +7,4 @@
add_subdirectory(chunks)
+add_subdirectory(st_hash)
diff --git a/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..ad332fef62
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-on_disk-st_hash)
+target_link_libraries(cpp-on_disk-st_hash PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-on_disk-st_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp
+)
diff --git a/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt b/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..737875ca6c
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-on_disk-st_hash)
+target_link_libraries(cpp-on_disk-st_hash PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-on_disk-st_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp
+)
diff --git a/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..737875ca6c
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-on_disk-st_hash)
+target_link_libraries(cpp-on_disk-st_hash PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-on_disk-st_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp
+)
diff --git a/library/cpp/on_disk/st_hash/CMakeLists.txt b/library/cpp/on_disk/st_hash/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt b/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..ad332fef62
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,18 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-on_disk-st_hash)
+target_link_libraries(cpp-on_disk-st_hash PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-deprecated-mapped_file
+)
+target_sources(cpp-on_disk-st_hash PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/on_disk/st_hash/fake.cpp
+)
diff --git a/library/cpp/on_disk/st_hash/fake.cpp b/library/cpp/on_disk/st_hash/fake.cpp
new file mode 100644
index 0000000000..ef5af4d432
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/fake.cpp
@@ -0,0 +1,4 @@
+#include "save_stl.h"
+#include "static_hash.h"
+#include "static_hash_map.h"
+#include "sthash_iterators.h"
diff --git a/library/cpp/on_disk/st_hash/save_stl.h b/library/cpp/on_disk/st_hash/save_stl.h
new file mode 100644
index 0000000000..00f8f0e20d
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/save_stl.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <util/generic/hash.h>
+#include <util/system/yassert.h>
+#include <util/stream/output.h>
+
+// this structure might be replaced with sthashtable class
+template <class HF, class Eq, class size_type>
+struct sthashtable_nvm_sv {
+ sthashtable_nvm_sv() {
+ if (sizeof(sthashtable_nvm_sv) != sizeof(HF) + sizeof(Eq) + 3 * sizeof(size_type)) {
+ memset(this, 0, sizeof(sthashtable_nvm_sv));
+ }
+ }
+
+ sthashtable_nvm_sv(const HF& phf, const Eq& peq, const size_type& pnb, const size_type& pne, const size_type& pnd)
+ : sthashtable_nvm_sv()
+ {
+ hf = phf;
+ eq = peq;
+ num_buckets = pnb;
+ num_elements = pne;
+ data_end_off = pnd;
+ }
+
+ HF hf;
+ Eq eq;
+ size_type num_buckets;
+ size_type num_elements;
+ size_type data_end_off;
+};
+
+/**
+ * A hack that allows saving both a THashMap and an sthash.
+ * Working with stHash does not depend on its template parameters, because this method
+ * never interprets the stored records: it only reads the bucket offsets and copies the
+ * record payload as raw bytes.
+ */
+template <class V, class K, class HF, class Ex, class Eq, class A>
+template <class KeySaver>
+inline int THashTable<V, K, HF, Ex, Eq, A>::save_for_st(IOutputStream* stream, KeySaver& ks, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash) const {
+ Y_ASSERT(!stHash || stHash->bucket_count() == bucket_count());
+ typedef sthashtable_nvm_sv<HF, Eq, typename KeySaver::TSizeType> sv_type;
+ sv_type sv = {this->_get_hash_fun(), this->_get_key_eq(), static_cast<typename KeySaver::TSizeType>(buckets.size()), static_cast<typename KeySaver::TSizeType>(num_elements), 0};
+    // TODO: maybe use just the size of the corresponding object?
+ typename KeySaver::TSizeType cur_off = sizeof(sv_type) +
+ (sv.num_buckets + 1) * sizeof(typename KeySaver::TSizeType);
+ sv.data_end_off = cur_off;
+ const_iterator n;
+ for (n = begin(); n != end(); ++n) {
+ sv.data_end_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(*n));
+ }
+ typename KeySaver::TSizeType* sb = stHash ? (typename KeySaver::TSizeType*)(stHash->buckets()) : nullptr;
+ if (stHash)
+ sv.data_end_off += static_cast<typename KeySaver::TSizeType>(sb[buckets.size()] - sb[0]);
+ //saver.Align(sizeof(char*));
+ stream->Write(&sv, sizeof(sv));
+
+ size_type i;
+ //save vector
+ for (i = 0; i < buckets.size(); ++i) {
+ node* cur = buckets[i];
+ stream->Write(&cur_off, sizeof(cur_off));
+ if (cur) {
+ while (!((uintptr_t)cur & 1)) {
+ cur_off += static_cast<typename KeySaver::TSizeType>(ks.GetRecordSize(cur->val));
+ cur = cur->next;
+ }
+ }
+ if (stHash)
+ cur_off += static_cast<typename KeySaver::TSizeType>(sb[i + 1] - sb[i]);
+ }
+ stream->Write(&cur_off, sizeof(cur_off)); // end mark
+ for (i = 0; i < buckets.size(); ++i) {
+ node* cur = buckets[i];
+ if (cur) {
+ while (!((uintptr_t)cur & 1)) {
+ ks.SaveRecord(stream, cur->val);
+ cur = cur->next;
+ }
+ }
+ if (stHash)
+ stream->Write((const char*)stHash + sb[i], sb[i + 1] - sb[i]);
+ }
+ return 0;
+}
diff --git a/library/cpp/on_disk/st_hash/static_hash.h b/library/cpp/on_disk/st_hash/static_hash.h
new file mode 100644
index 0000000000..ca7a6ccd36
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/static_hash.h
@@ -0,0 +1,420 @@
+#pragma once
+
+#include "save_stl.h"
+#include "sthash_iterators.h"
+
+#include <util/generic/hash.h>
+#include <util/generic/vector.h>
+#include <util/generic/buffer.h>
+#include <util/generic/cast.h>
+#include <util/generic/yexception.h> // for save/load only
+#include <util/stream/file.h>
+#include <util/stream/buffer.h>
+#include <utility>
+
+#include <memory>
+#include <algorithm>
+#include <functional>
+
+#include <cstdlib>
+#include <cstddef>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4624) // 'destructor could not be generated because a base class destructor is inaccessible'
+#endif
+
+template <class HashType, class KeySaver>
+inline void SaveHashToStreamEx(HashType& hash, IOutputStream* stream) {
+ KeySaver ks;
+ if (hash.save_for_st(stream, ks))
+ ythrow yexception() << "Could not save hash to stream";
+}
+
+template <class HashType>
+inline void SaveHashToStream(HashType& hash, IOutputStream* stream) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ return SaveHashToStreamEx<HashType, KeySaver>(hash, stream);
+}
+
+template <class HashType, class KeySaver>
+inline void SaveHashToFileEx(HashType& hash, const char* fileName) {
+ TFileOutput output(fileName);
+ SaveHashToStreamEx<HashType, KeySaver>(hash, &output);
+}
+
+template <class HashType>
+inline void SaveHashToFile(HashType& hash, const char* fileName) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ return SaveHashToFileEx<HashType, KeySaver>(hash, fileName);
+}
+
+template <class HashType>
+inline void SaveHashSetToFile(HashType& hash, const char* fileName) {
+ typedef TSthashSetWriter<typename HashType::key_type, ui64> KeySaver;
+ return SaveHashToFileEx<HashType, KeySaver>(hash, fileName);
+}
+
+template <class HashType>
+inline void SaveHashToFile32(HashType& hash, const char* fileName) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver;
+ return SaveHashToFileEx<HashType, KeySaver>(hash, fileName);
+}
+
+template <class HashType, class KeySaver>
+inline void SaveHashToBufferEx(HashType& hash, TBuffer& buffer, sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* stHash = nullptr) {
+ TBufferOutput stream(buffer);
+ KeySaver ks;
+ if (hash.save_for_st(&stream, ks, stHash))
+ ythrow yexception() << "Could not save hash to memory";
+}
+
+template <class HashType>
+inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ SaveHashToBufferEx<HashType, KeySaver>(hash, buffer);
+}
+
+/**
+ * A hack that allows saving both a THashMap and an sthash.
+ * The THashMap and the sthash must have the same bucket_count().
+ */
+template <class HashType, class StHashType>
+inline void SaveHashToBuffer(HashType& hash, TBuffer& buffer, StHashType* stHash) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui64> KeySaver;
+ typedef sthash<int, int, THash<int>, TEqualTo<int>, typename KeySaver::TSizeType>* SH;
+
+ SH sh = reinterpret_cast<SH>(stHash);
+ SaveHashToBufferEx<HashType, KeySaver>(hash, buffer, sh);
+}
+
+template <class HashType>
+inline void SaveHashToBuffer32(HashType& hash, TBuffer& buffer) {
+ typedef TSthashWriter<typename HashType::key_type, typename HashType::mapped_type, ui32> KeySaver;
+ SaveHashToBufferEx<HashType, KeySaver>(hash, buffer);
+}
+
+template <class Iter, typename size_type_f = ui64>
+class sthashtable {
+public:
+ typedef typename Iter::TKeyType key_type;
+ typedef typename Iter::TValueType value_type;
+ typedef typename Iter::THasherType hasher;
+ typedef typename Iter::TKeyEqualType key_equal;
+
+ typedef size_type_f size_type;
+ typedef ptrdiff_t difference_type;
+ typedef const value_type* const_pointer;
+ typedef const value_type& const_reference;
+
+ typedef Iter const_iterator;
+
+ const hasher hash_funct() const {
+ return hash;
+ }
+ const key_equal key_eq() const {
+ return equals;
+ }
+
+private:
+ const hasher hash;
+ const key_equal equals;
+
+private:
+ const_iterator iter_at_bucket(size_type bucket) const {
+ return (const_iterator)(((char*)this + buckets()[bucket]));
+ }
+
+ const_iterator iter_at_bucket_or_end(size_type bucket) const {
+ if (bucket < num_buckets)
+ return (const_iterator)(((char*)this + buckets()[bucket]));
+ else
+ return end();
+ }
+
+ const size_type num_buckets;
+ const size_type num_elements;
+ const size_type data_end_off;
+
+protected: //shut up gcc warning
+ // we can't construct/destroy this object at all!
+ sthashtable();
+ sthashtable(const sthashtable& ht);
+ ~sthashtable();
+
+public:
+ // const size_type *buckets;
+ const size_type* buckets() const {
+ return (size_type*)((char*)this + sizeof(*this));
+ }
+ const size_type buckets(size_type n) const {
+ return buckets()[n];
+ }
+
+ size_type size() const {
+ return num_elements;
+ }
+ size_type max_size() const {
+ return size_type(-1);
+ }
+ bool empty() const {
+ return size() == 0;
+ }
+
+ const_iterator begin() const {
+ return num_buckets ? iter_at_bucket(0) : end();
+ }
+
+ const_iterator end() const {
+ return (const_iterator)(((char*)this + data_end_off));
+ }
+
+public:
+ size_type size_in_bytes() const {
+ return data_end_off;
+ }
+
+ size_type bucket_count() const {
+ return num_buckets;
+ }
+
+ size_type elems_in_bucket(size_type bucket) const {
+ size_type result = 0;
+ const_iterator first = iter_at_bucket(bucket);
+ const_iterator last = iter_at_bucket_or_end(bucket + 1);
+
+ for (; first != last; ++first)
+ ++result;
+ return result;
+ }
+
+ template <class TheKey>
+ const_iterator find(const TheKey& key) const {
+ size_type n = bkt_num_key(key);
+ const_iterator first(iter_at_bucket(n)), last(iter_at_bucket_or_end(n + 1));
+ for (;
+ first != last && !first.KeyEquals(equals, key);
+ ++first) {
+ }
+ if (first != last)
+ return first;
+ return end();
+ }
+
+ size_type count(const key_type& key) const {
+ const size_type n = bkt_num_key(key);
+ size_type result = 0;
+ const_iterator first = iter_at_bucket(n);
+ const_iterator last = iter_at_bucket_or_end(n + 1);
+
+ for (; first != last; ++first)
+ if (first.KeyEquals(equals, key))
+ ++result;
+ return result;
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const;
+
+private:
+ template <class TheKey>
+ size_type bkt_num_key(const TheKey& key) const {
+ return hash(key) % num_buckets;
+ }
+};
+
+template <class I, class size_type_f>
+std::pair<I, I> sthashtable<I, size_type_f>::equal_range(const key_type& key) const {
+ typedef std::pair<const_iterator, const_iterator> pii;
+ const size_type n = bkt_num_key(key);
+ const_iterator first = iter_at_bucket(n);
+ const_iterator last = iter_at_bucket_or_end(n + 1);
+
+ for (; first != last; ++first) {
+ if (first.KeyEquals(equals, key)) {
+ const_iterator cur = first;
+ ++cur;
+ for (; cur != last; ++cur)
+ if (!cur.KeyEquals(equals, key))
+ return pii(const_iterator(first),
+ const_iterator(cur));
+ return pii(const_iterator(first),
+ const_iterator(last));
+ }
+ }
+ return pii(end(), end());
+}
+
+/* end __SGI_STL_HASHTABLE_H */
+
+template <class Key, class T, class HashFcn /*= hash<Key>*/,
+ class EqualKey = TEqualTo<Key>, typename size_type_f = ui64>
+class sthash {
+private:
+ typedef sthashtable<TSthashIterator<const Key, const T, HashFcn, EqualKey>, size_type_f> ht;
+ ht rep;
+
+public:
+ typedef typename ht::key_type key_type;
+ typedef typename ht::value_type value_type;
+ typedef typename ht::hasher hasher;
+ typedef typename ht::key_equal key_equal;
+ typedef T mapped_type;
+
+ typedef typename ht::size_type size_type;
+ typedef typename ht::difference_type difference_type;
+ typedef typename ht::const_pointer const_pointer;
+ typedef typename ht::const_reference const_reference;
+
+ typedef typename ht::const_iterator const_iterator;
+
+ const hasher hash_funct() const {
+ return rep.hash_funct();
+ }
+ const key_equal key_eq() const {
+ return rep.key_eq();
+ }
+
+public:
+ size_type size() const {
+ return rep.size();
+ }
+ size_type max_size() const {
+ return rep.max_size();
+ }
+ bool empty() const {
+ return rep.empty();
+ }
+
+ const_iterator begin() const {
+ return rep.begin();
+ }
+ const_iterator end() const {
+ return rep.end();
+ }
+
+public:
+ template <class TheKey>
+ const_iterator find(const TheKey& key) const {
+ return rep.find(key);
+ }
+ template <class TheKey>
+ bool has(const TheKey& key) const {
+ return rep.find(key) != rep.end();
+ }
+
+ size_type count(const key_type& key) const {
+ return rep.count(key);
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const {
+ return rep.equal_range(key);
+ }
+
+ size_type size_in_bytes() const {
+ return rep.size_in_bytes();
+ }
+
+ size_type bucket_count() const {
+ return rep.bucket_count();
+ }
+ size_type max_bucket_count() const {
+ return rep.max_bucket_count();
+ }
+ size_type elems_in_bucket(size_type n) const {
+ return rep.elems_in_bucket(n);
+ }
+
+ const size_type* buckets() const {
+ return rep.buckets();
+ }
+ const size_type buckets(size_type n) const {
+ return rep.buckets()[n];
+ }
+};
+
+template <class Key, class HashFcn,
+ class EqualKey = TEqualTo<Key>, typename size_type_f = ui64>
+class sthash_set: public sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> {
+ typedef sthash<Key, TEmptyValue, HashFcn, EqualKey, size_type_f> Base;
+
+public:
+ using Base::const_iterator;
+ using Base::hasher;
+ using Base::key_equal;
+ using Base::key_type;
+ using Base::size_type;
+ using Base::value_type;
+};
+
+template <class Key, class T, class HashFcn /*= hash<Key>*/,
+ class EqualKey = TEqualTo<Key>, typename size_type_f = ui64>
+class sthash_mm {
+private:
+ typedef sthashtable<TSthashIterator<const Key, T, HashFcn, EqualKey>, size_type_f> ht;
+ ht rep;
+
+public:
+ typedef typename ht::key_type key_type;
+ typedef typename ht::value_type value_type;
+ typedef typename ht::hasher hasher;
+ typedef typename ht::key_equal key_equal;
+ typedef T mapped_type;
+
+ typedef typename ht::size_type size_type;
+ typedef typename ht::difference_type difference_type;
+ typedef typename ht::const_pointer const_pointer;
+ typedef typename ht::const_reference const_reference;
+
+ typedef typename ht::const_iterator const_iterator;
+
+ const hasher hash_funct() const {
+ return rep.hash_funct();
+ }
+ const key_equal key_eq() const {
+ return rep.key_eq();
+ }
+
+public:
+ size_type size() const {
+ return rep.size();
+ }
+ size_type max_size() const {
+ return rep.max_size();
+ }
+ bool empty() const {
+ return rep.empty();
+ }
+
+ const_iterator begin() const {
+ return rep.begin();
+ }
+ const_iterator end() const {
+ return rep.end();
+ }
+
+ const_iterator find(const key_type& key) const {
+ return rep.find(key);
+ }
+
+ size_type count(const key_type& key) const {
+ return rep.count(key);
+ }
+
+ std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const {
+ return rep.equal_range(key);
+ }
+
+ size_type bucket_count() const {
+ return rep.bucket_count();
+ }
+ size_type max_bucket_count() const {
+ return rep.max_bucket_count();
+ }
+ size_type elems_in_bucket(size_type n) const {
+ return rep.elems_in_bucket(n);
+ }
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
diff --git a/library/cpp/on_disk/st_hash/static_hash_map.h b/library/cpp/on_disk/st_hash/static_hash_map.h
new file mode 100644
index 0000000000..5dc50abd39
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/static_hash_map.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "static_hash.h"
+
+#include <library/cpp/deprecated/mapped_file/mapped_file.h>
+
+#include <util/system/filemap.h>
+
+template <class SH>
+struct sthash_mapped_c {
+ typedef SH H;
+ typedef typename H::const_iterator const_iterator;
+ TMappedFile M;
+ H* hsh;
+ sthash_mapped_c()
+ : M()
+ , hsh(nullptr)
+ {
+ }
+ sthash_mapped_c(const char* fname, bool precharge)
+ : M()
+ , hsh(nullptr)
+ {
+ Open(fname, precharge);
+ }
+ void Open(const char* fname, bool precharge) {
+ M.init(fname);
+ if (precharge)
+ M.precharge();
+ hsh = (H*)M.getData();
+ if (M.getSize() < sizeof(H) || (ssize_t)M.getSize() != hsh->end().Data - (char*)hsh)
+ ythrow yexception() << "Could not map hash: " << fname << " is damaged";
+ }
+ H* operator->() {
+ return hsh;
+ }
+ const H* operator->() const {
+ return hsh;
+ }
+ H* GetSthash() {
+ return hsh;
+ }
+ const H* GetSthash() const {
+ return hsh;
+ }
+};
+
+template <class Key, class T, class Hash>
+struct sthash_mapped: public sthash_mapped_c<sthash<Key, T, Hash>> {
+ typedef sthash<Key, T, Hash> H;
+ sthash_mapped(const char* fname, bool precharge)
+ : sthash_mapped_c<H>(fname, precharge)
+ {
+ }
+ sthash_mapped()
+ : sthash_mapped_c<H>()
+ {
+ }
+};
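+
+// Illustrative usage sketch (an assumption, not part of the original interface): mapping a
+// previously built file and looking up a key. It presumes the file was written with the
+// matching TSthashWriter and that sthash exposes find()/end() returning TSthashIterator,
+// analogous to sthash_mm in static_hash.h; the names below are placeholders.
+template <class TExampleKey, class TExampleValue, class TExampleHash>
+inline bool ExampleLookup(const char* fname, const TExampleKey& key, TExampleValue& result) {
+ // Map the file read-only; passing precharge = false skips prefaulting the pages.
+ sthash_mapped<TExampleKey, TExampleValue, TExampleHash> mapped(fname, false);
+ auto it = mapped->find(key);
+ if (it == mapped->end())
+ return false;
+ result = it.Value();
+ return true;
+}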
diff --git a/library/cpp/on_disk/st_hash/sthash_iterators.h b/library/cpp/on_disk/st_hash/sthash_iterators.h
new file mode 100644
index 0000000000..6a9ebdd6c3
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/sthash_iterators.h
@@ -0,0 +1,334 @@
+#pragma once
+
+#include "save_stl.h"
+
+#include <util/system/align.h>
+
+/**
+ This file provides functionality for saving a relatively simple THashMap object
+ to disk in a form that can be mapped read-only (via mmap) at any address.
+ The saved object is then accessed through a pointer to an sthash object (which must
+ have the same template parameters as the original THashMap object).
+
+ If either the key or the value is variable-sized (i.e. contains pointers), the user must
+ write their own specialization of TSthashIterator (the read iterator for sthash) and
+ TSthashWriter (the write iterator for THashMap).
+ An example for the <const char*, B> pair is provided in this file.
+**/
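+
+// Illustrative sketch (an assumption, not part of the original API): roughly how a
+// TSthashWriter is driven when a THashMap is dumped to a stream -- for each record the
+// writer first reports its on-disk size and then serializes it. The real saving code
+// lives in save_stl.h; the function name below is a placeholder.
+template <class TMap, class TWriter>
+inline size_t ExampleSaveRecords(IOutputStream* out, const TMap& map, const TWriter& writer) {
+ size_t total = 0;
+ for (const auto& record : map) {
+ // GetRecordSize() is what the real saver uses to lay out the buckets up front.
+ total += writer.GetRecordSize(record);
+ writer.SaveRecord(out, record);
+ }
+ return total;
+}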
+
+// TEmptyValue and SizeOfEx are helpers for sthash_set
+struct TEmptyValue {
+ TEmptyValue() = default;
+};
+
+template <class T>
+inline size_t SizeOfEx() {
+ return sizeof(T);
+}
+
+template <>
+inline size_t SizeOfEx<TEmptyValue>() {
+ return 0;
+}
+template <>
+inline size_t SizeOfEx<const TEmptyValue>() {
+ return 0;
+}
+
+template <class TKey, class TValue, class HashFcn, class EqualKey>
+struct TSthashIterator {
+ // Implementation for simple types
+ typedef const TKey TKeyType;
+ typedef const TValue TValueType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ explicit TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ TKey& Key() const {
+ return *(TKey*)Data;
+ }
+ TValue& Value() {
+ return *(TValue*)(Data + sizeof(TKey));
+ }
+ const TValue& Value() const {
+ return *(const TValue*)(Data + sizeof(TKey));
+ }
+
+ template <class AnotherKeyType>
+ bool KeyEquals(const EqualKey& eq, const AnotherKeyType& key) const {
+ return eq(*(TKey*)Data, key);
+ }
+
+ size_t GetLength() const {
+ return sizeof(TKey) + SizeOfEx<TValue>();
+ }
+};
+
+template <class Key, class Value, typename size_type_o = ui64>
+struct TSthashWriter {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const Key, const Value>&) const {
+ return sizeof(Key) + SizeOfEx<Value>();
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const Key, const Value>& record) const {
+ stream->Write(&record.first, sizeof(Key));
+ stream->Write(&record.second, SizeOfEx<Value>());
+ return 0;
+ }
+};
+
+// Remember that this simplified implementation makes a copy of `key' in std::make_pair.
+// It can also waste some memory on undesired alignment.
+template <class Key, typename size_type_o = ui64>
+struct TSthashSetWriter: public TSthashWriter<Key, TEmptyValue, size_type_o> {
+ typedef TSthashWriter<Key, TEmptyValue, size_type_o> MapWriter;
+ size_t GetRecordSize(const Key& key) const {
+ return MapWriter::GetRecordSize(std::make_pair(key, TEmptyValue()));
+ }
+ int SaveRecord(IOutputStream* stream, const Key& key) const {
+ return MapWriter::SaveRecord(stream, std::make_pair(key, TEmptyValue()));
+ }
+};
+
+// We can't save types that contain pointers without additional tricks,
+// so the specializations below are deliberately left empty.
+
+template <class A, class B, class HashFcn, class EqualKey>
+struct TSthashIterator<A*, B, HashFcn, EqualKey> {};
+
+template <class A, class B, class HashFcn, class EqualKey>
+struct TSthashIterator<A, B*, HashFcn, EqualKey> {};
+
+template <class A, class B, typename size_type_o>
+struct TSthashWriter<A*, B*, size_type_o> {};
+
+template <class A, class B, typename size_type_o>
+struct TSthashWriter<A*, B, size_type_o> {};
+
+template <class A, class B, typename size_type_o>
+struct TSthashWriter<A, B*, size_type_o> {};
+
+template <class T>
+inline size_t AlignForChrKey() {
+ return 4; // TODO: change this (requires rebuilding a few existing files)
+}
+
+template <>
+inline size_t AlignForChrKey<TEmptyValue>() {
+ return 1;
+}
+
+template <>
+inline size_t AlignForChrKey<const TEmptyValue>() {
+ return AlignForChrKey<TEmptyValue>();
+}
+
+// !! note that for char*, physical placement of key and value is swapped
+template <class TValue, class HashFcn, class EqualKey>
+struct TSthashIterator<const char* const, TValue, HashFcn, EqualKey> {
+ typedef const TValue TValueType;
+ typedef const char* TKeyType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ const char* Key() const {
+ return Data + SizeOfEx<TValue>();
+ }
+ TValue& Value() {
+ return *(TValue*)Data;
+ }
+ const TValue& Value() const {
+ return *(const TValue*)Data;
+ }
+
+ template <class K>
+ bool KeyEquals(const EqualKey& eq, const K& k) const {
+ return eq(Data + SizeOfEx<TValue>(), k);
+ }
+
+ size_t GetLength() const {
+ size_t length = strlen(Data + SizeOfEx<TValue>()) + 1 + SizeOfEx<TValue>();
+ length = AlignUp(length, AlignForChrKey<TValue>());
+ return length;
+ }
+};
+
+template <class Value, typename size_type_o>
+struct TSthashWriter<const char*, Value, size_type_o> {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const char*, const Value>& record) const {
+ size_t length = strlen(record.first) + 1 + SizeOfEx<Value>();
+ length = AlignUp(length, AlignForChrKey<Value>());
+ return length;
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const char*, const Value>& record) const {
+ const char* alignBuffer = "qqqq";
+ stream->Write(&record.second, SizeOfEx<Value>());
+ size_t length = strlen(record.first) + 1;
+ stream->Write(record.first, length);
+ length = AlignUpSpace(length, AlignForChrKey<Value>());
+ if (length)
+ stream->Write(alignBuffer, length);
+ return 0;
+ }
+};
+
+template <class TKey, class HashFcn, class EqualKey>
+struct TSthashIterator<TKey, const char* const, HashFcn, EqualKey> {
+ typedef const TKey TKeyType;
+ typedef const char* TValueType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ TKey& Key() {
+ return *(TKey*)Data;
+ }
+ const char* Value() const {
+ return Data + sizeof(TKey);
+ }
+
+ template <class K>
+ bool KeyEquals(const EqualKey& eq, const K& k) const {
+ return eq(*(TKey*)Data, k);
+ }
+
+ size_t GetLength() const {
+ size_t length = strlen(Data + sizeof(TKey)) + 1 + sizeof(TKey);
+ length = AlignUp(length, (size_t)4);
+ return length;
+ }
+};
+
+template <class Key, typename size_type_o>
+struct TSthashWriter<Key, const char*, size_type_o> {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const Key, const char*>& record) const {
+ size_t length = strlen(record.second) + 1 + sizeof(Key);
+ length = AlignUp(length, (size_t)4);
+ return length;
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const Key, const char*>& record) const {
+ const char* alignBuffer = "qqqq";
+ stream->Write(&record.first, sizeof(Key));
+ size_t length = strlen(record.second) + 1;
+ stream->Write(record.second, length);
+ length = AlignUpSpace(length, (size_t)4);
+ if (length)
+ stream->Write(alignBuffer, length);
+ return 0;
+ }
+};
+
+template <class HashFcn, class EqualKey>
+struct TSthashIterator<const char* const, const char* const, HashFcn, EqualKey> {
+ typedef const char* TKeyType;
+ typedef const char* TValueType;
+ typedef EqualKey TKeyEqualType;
+ typedef HashFcn THasherType;
+
+ const char* Data;
+ TSthashIterator()
+ : Data(nullptr)
+ {
+ }
+ TSthashIterator(const char* data)
+ : Data(data)
+ {
+ }
+ void operator++() {
+ Data += GetLength();
+ }
+
+ bool operator!=(const TSthashIterator& that) const {
+ return Data != that.Data;
+ }
+ bool operator==(const TSthashIterator& that) const {
+ return Data == that.Data;
+ }
+ const char* Key() const {
+ return Data;
+ }
+ const char* Value() const {
+ return Data + strlen(Data) + 1;
+ }
+
+ template <class K>
+ bool KeyEquals(const EqualKey& eq, const K& k) const {
+ return eq(Data, k);
+ }
+
+ size_t GetLength() const {
+ size_t length = strlen(Data) + 1;
+ length += strlen(Data + length) + 1;
+ return length;
+ }
+};
+
+template <typename size_type_o>
+struct TSthashWriter<const char*, const char*, size_type_o> {
+ typedef size_type_o TSizeType;
+ size_t GetRecordSize(const std::pair<const char*, const char*>& record) const {
+ size_t size = strlen(record.first) + strlen(record.second) + 2;
+ return size;
+ }
+ int SaveRecord(IOutputStream* stream, const std::pair<const char*, const char*>& record) const {
+ stream->Write(record.first, strlen(record.first) + 1);
+ stream->Write(record.second, strlen(record.second) + 1);
+ return 0;
+ }
+};
diff --git a/library/cpp/on_disk/st_hash/ya.make b/library/cpp/on_disk/st_hash/ya.make
new file mode 100644
index 0000000000..8c6d05711c
--- /dev/null
+++ b/library/cpp/on_disk/st_hash/ya.make
@@ -0,0 +1,15 @@
+LIBRARY()
+
+SRCS(
+ fake.cpp
+ save_stl.h
+ static_hash.h
+ static_hash_map.h
+ sthash_iterators.h
+)
+
+PEERDIR(
+ library/cpp/deprecated/mapped_file
+)
+
+END()
diff --git a/library/cpp/regex/CMakeLists.darwin-x86_64.txt b/library/cpp/regex/CMakeLists.darwin-x86_64.txt
index 6e2a4fabcd..877d40538b 100644
--- a/library/cpp/regex/CMakeLists.darwin-x86_64.txt
+++ b/library/cpp/regex/CMakeLists.darwin-x86_64.txt
@@ -6,6 +6,7 @@
# original buildsystem will not be accepted.
+add_subdirectory(glob)
add_subdirectory(hyperscan)
add_subdirectory(pcre)
add_subdirectory(pire)
diff --git a/library/cpp/regex/CMakeLists.linux-aarch64.txt b/library/cpp/regex/CMakeLists.linux-aarch64.txt
index 279390306b..84c257a819 100644
--- a/library/cpp/regex/CMakeLists.linux-aarch64.txt
+++ b/library/cpp/regex/CMakeLists.linux-aarch64.txt
@@ -6,5 +6,6 @@
# original buildsystem will not be accepted.
+add_subdirectory(glob)
add_subdirectory(pcre)
add_subdirectory(pire)
diff --git a/library/cpp/regex/CMakeLists.linux-x86_64.txt b/library/cpp/regex/CMakeLists.linux-x86_64.txt
index 6e2a4fabcd..877d40538b 100644
--- a/library/cpp/regex/CMakeLists.linux-x86_64.txt
+++ b/library/cpp/regex/CMakeLists.linux-x86_64.txt
@@ -6,6 +6,7 @@
# original buildsystem will not be accepted.
+add_subdirectory(glob)
add_subdirectory(hyperscan)
add_subdirectory(pcre)
add_subdirectory(pire)
diff --git a/library/cpp/regex/CMakeLists.windows-x86_64.txt b/library/cpp/regex/CMakeLists.windows-x86_64.txt
index 6e2a4fabcd..877d40538b 100644
--- a/library/cpp/regex/CMakeLists.windows-x86_64.txt
+++ b/library/cpp/regex/CMakeLists.windows-x86_64.txt
@@ -6,6 +6,7 @@
# original buildsystem will not be accepted.
+add_subdirectory(glob)
add_subdirectory(hyperscan)
add_subdirectory(pcre)
add_subdirectory(pire)
diff --git a/library/cpp/regex/glob/CMakeLists.darwin-x86_64.txt b/library/cpp/regex/glob/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..ca8383e355
--- /dev/null
+++ b/library/cpp/regex/glob/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-regex-glob)
+target_link_libraries(cpp-regex-glob PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+)
+target_sources(cpp-regex-glob PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp
+)
diff --git a/library/cpp/regex/glob/CMakeLists.linux-aarch64.txt b/library/cpp/regex/glob/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..3953937c6d
--- /dev/null
+++ b/library/cpp/regex/glob/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-regex-glob)
+target_link_libraries(cpp-regex-glob PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+)
+target_sources(cpp-regex-glob PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp
+)
diff --git a/library/cpp/regex/glob/CMakeLists.linux-x86_64.txt b/library/cpp/regex/glob/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..3953937c6d
--- /dev/null
+++ b/library/cpp/regex/glob/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-regex-glob)
+target_link_libraries(cpp-regex-glob PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+)
+target_sources(cpp-regex-glob PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp
+)
diff --git a/library/cpp/regex/glob/CMakeLists.txt b/library/cpp/regex/glob/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/regex/glob/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/regex/glob/CMakeLists.windows-x86_64.txt b/library/cpp/regex/glob/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..ca8383e355
--- /dev/null
+++ b/library/cpp/regex/glob/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-regex-glob)
+target_link_libraries(cpp-regex-glob PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+)
+target_sources(cpp-regex-glob PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/regex/glob/glob_iterator.cpp
+)
diff --git a/library/cpp/regex/glob/glob.cpp b/library/cpp/regex/glob/glob.cpp
new file mode 100644
index 0000000000..9da058122a
--- /dev/null
+++ b/library/cpp/regex/glob/glob.cpp
@@ -0,0 +1,921 @@
+#define FROM_IMPLEMENTATION
+#include "glob_compat.h"
+
+#if defined(USE_INTERNAL_GLOB)
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Guido van Rossum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <library/cpp/charset/ci_string.h>
+#include <util/system/compat.h>
+#include <util/folder/dirut.h>
+
+/*
+ * glob(3) -- a superset of the one defined in POSIX 1003.2.
+ *
+ * The [!...] convention to negate a range is supported (SysV, Posix, ksh).
+ *
+ * Optional extra services, controlled by flags not defined by POSIX:
+ *
+ * GLOB_QUOTE:
+ * Escaping convention: \ inhibits any special meaning the following
+ * character might have (except \ at end of string is retained).
+ * GLOB_MAGCHAR:
+ * Set in gl_flags if pattern contained a globbing character.
+ * GLOB_NOMAGIC:
+ * Same as GLOB_NOCHECK, but it will only append pattern if it did
+ * not contain any magic characters. [Used in csh style globbing]
+ * GLOB_ALTDIRFUNC:
+ * Use alternately specified directory access functions.
+ * GLOB_TILDE:
+ * expand ~user/foo to the /home/dir/of/user/foo
+ * GLOB_BRACE:
+ * expand {1,2}{a,b} to 1a 1b 2a 2b
+ * gl_matchc:
+ * Number of matches in the current invocation of glob.
+ */
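+
+/*
+ * Illustrative usage sketch (an assumption, not part of the original code): a typical
+ * call with brace expansion enabled; the pattern string is a placeholder.
+ *
+ * glob_t g;
+ * if (glob("{*.cpp,*.h}", GLOB_BRACE, nullptr, &g) == 0) {
+ * for (int i = 0; i < g.gl_pathc; ++i)
+ * printf("%s\n", g.gl_pathv[i]);
+ * }
+ * globfree(&g);
+ */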
+
+/*
+ * Some notes on multibyte character support:
+ * 1. Patterns with illegal byte sequences match nothing - even if
+ * GLOB_NOCHECK is specified.
+ * 2. Illegal byte sequences in filenames are handled by treating them as
+ * single-byte characters with a value of the first byte of the sequence
+ * cast to wchar_t.
+ * 3. State-dependent encodings are not currently supported.
+ */
+
+//#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <ctype.h>
+//#include <dirent.h>
+#include <errno.h>
+#include <limits.h>
+//#include <pwd.h>
+//#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(_unix_)
+#include <unistd.h>
+#endif
+#include <wchar.h>
+
+#if !defined(_unix_)
+// silly replacement for compilation
+using uint_fast64_t = ui64;
+using u_int = unsigned int;
+using u_char = unsigned char;
+#define ARG_MAX 256
+#define S_ISDIR(x) ((x) & _S_IFDIR)
+#define S_ISLNK(x) 0
+#define lstat stat
+inline bool issetugid() { return false; }
+inline char *getlogin() { return 0; }
+inline int getuid() { return 0; }
+struct passwd {
+ char *pw_dir;
+};
+inline passwd *getpwuid(int) { return 0; }
+inline passwd *getpwnam(char *) { return 0; }
+#endif
+
+#define __collate_load_error 1
+inline int __collate_range_cmp(int, int) { return 0; }
+#undef COMMA // was defined in stroka.h
+// end silly replacement
+
+//#include "collate.h"
+
+#define DOLLAR '$'
+#define DOT '.'
+#define EOS '\0'
+#define LBRACKET '['
+#define NOT '!'
+#define QUESTION '?'
+#define QUOTE '\\'
+#define RANGE '-'
+#define RBRACKET ']'
+#define SEP '/'
+#define STAR '*'
+#define TILDE '~'
+#define UNDERSCORE '_'
+#define LBRACE '{'
+#define RBRACE '}'
+#define SLASH '/'
+#define COMMA ','
+
+#ifndef DEBUG
+
+#define M_QUOTE 0x8000000000ULL
+#define M_PROTECT 0x4000000000ULL
+#define M_MASK 0xffffffffffULL
+#define M_CHAR 0x00ffffffffULL
+
+using Char = uint_fast64_t;
+
+#else
+
+#define M_QUOTE 0x80
+#define M_PROTECT 0x40
+#define M_MASK 0xff
+#define M_CHAR 0x7f
+
+using Char = char;
+
+#endif
+
+
+#define CHAR(c) ((Char)((c)&M_CHAR))
+#define META(c) ((Char)((c)|M_QUOTE))
+#define M_ALL META('*')
+#define M_END META(']')
+#define M_NOT META('!')
+#define M_ONE META('?')
+#define M_RNG META('-')
+#define M_SET META('[')
+#define ismeta(c) (((c)&M_QUOTE) != 0)
+
+
+static int compare(const void *, const void *);
+static int g_Ctoc(const Char *, char *, u_int);
+static int g_lstat(Char *, struct stat *, glob_t *);
+static DIR *g_opendir(Char *, glob_t *);
+static Char *g_strchr(Char *, wchar_t);
+#ifdef notdef
+static Char *g_strcat(Char *, const Char *);
+#endif
+static int glob0(const Char *, glob_t *, int *);
+static int glob1(Char *, glob_t *, int *);
+static int glob2(Char *, Char *, Char *, Char *, glob_t *, int *);
+static int glob3(Char *, Char *, Char *, Char *, Char *, glob_t *, int *);
+static int globextend(const Char *, glob_t *, int *);
+static const Char *
+ globtilde(const Char *, Char *, size_t, glob_t *);
+static int globexp1(const Char *, glob_t *, int *);
+static int globexp2(const Char *, const Char *, glob_t *, int *, int *);
+static int match(Char *, Char *, Char *);
+#ifdef DEBUG
+static void qprintf(const char *, Char *);
+#endif
+
+int
+glob(const char *pattern, int flags, int (*errfunc)(const char *, int), glob_t *pglob)
+{
+ const u_char *patnext;
+ int limit;
+ Char *bufnext, *bufend, patbuf[MAXPATHLEN], prot;
+ mbstate_t mbs;
+ wchar_t wc;
+ size_t clen;
+
+ patnext = (u_char *) pattern;
+ if (!(flags & GLOB_APPEND)) {
+ pglob->gl_pathc = 0;
+ pglob->gl_pathv = NULL;
+ if (!(flags & GLOB_DOOFFS))
+ pglob->gl_offs = 0;
+ }
+ if (flags & GLOB_LIMIT) {
+ limit = pglob->gl_matchc;
+ if (limit == 0)
+ limit = ARG_MAX;
+ } else
+ limit = 0;
+ pglob->gl_flags = flags & ~GLOB_MAGCHAR;
+ pglob->gl_errfunc = errfunc;
+ pglob->gl_matchc = 0;
+
+ bufnext = patbuf;
+ bufend = bufnext + MAXPATHLEN - 1;
+ if (flags & GLOB_NOESCAPE) {
+ memset(&mbs, 0, sizeof(mbs));
+ while (bufend - bufnext >= MB_CUR_MAX) {
+ clen = mbrtowc(&wc, (const char*)patnext, MB_LEN_MAX, &mbs);
+ if (clen == (size_t)-1 || clen == (size_t)-2)
+ return (GLOB_NOMATCH);
+ else if (clen == 0)
+ break;
+ *bufnext++ = wc;
+ patnext += clen;
+ }
+ } else {
+ /* Protect the quoted characters. */
+ memset(&mbs, 0, sizeof(mbs));
+ while (bufend - bufnext >= MB_CUR_MAX) {
+ if (*patnext == QUOTE) {
+ if (*++patnext == EOS) {
+ *bufnext++ = QUOTE | M_PROTECT;
+ continue;
+ }
+ prot = M_PROTECT;
+ } else
+ prot = 0;
+ clen = mbrtowc(&wc, (const char*)patnext, MB_LEN_MAX, &mbs);
+ if (clen == (size_t)-1 || clen == (size_t)-2)
+ return (GLOB_NOMATCH);
+ else if (clen == 0)
+ break;
+ *bufnext++ = wc | prot;
+ patnext += clen;
+ }
+ }
+ *bufnext = EOS;
+
+ if (flags & GLOB_BRACE)
+ return globexp1(patbuf, pglob, &limit);
+ else
+ return glob0(patbuf, pglob, &limit);
+}
+
+/*
+ * Expand recursively a glob {} pattern. When there is no more expansion
+ * invoke the standard globbing routine to glob the rest of the magic
+ * characters
+ */
+static int
+globexp1(const Char *pattern, glob_t *pglob, int *limit)
+{
+ const Char* ptr = pattern;
+ int rv;
+
+ /* Protect a single {}, for find(1), like csh */
+ if (pattern[0] == LBRACE && pattern[1] == RBRACE && pattern[2] == EOS)
+ return glob0(pattern, pglob, limit);
+
+ while ((ptr = (const Char *) g_strchr((Char *) ptr, LBRACE)) != NULL)
+ if (!globexp2(ptr, pattern, pglob, &rv, limit))
+ return rv;
+
+ return glob0(pattern, pglob, limit);
+}
+
+
+/*
+ * Recursive brace globbing helper. Tries to expand a single brace.
+ * If it succeeds then it invokes globexp1 with the new pattern.
+ * If it fails then it tries to glob the rest of the pattern and returns.
+ */
+static int
+globexp2(const Char *ptr, const Char *pattern, glob_t *pglob, int *rv, int *limit)
+{
+ int i;
+ Char *lm, *ls;
+ const Char *pe, *pm, *pm1, *pl;
+ Char patbuf[MAXPATHLEN];
+
+ /* copy part up to the brace */
+ for (lm = patbuf, pm = pattern; pm != ptr; *lm++ = *pm++)
+ continue;
+ *lm = EOS;
+ ls = lm;
+
+ /* Find the balanced brace */
+ for (i = 0, pe = ++ptr; *pe; pe++)
+ if (*pe == LBRACKET) {
+ /* Ignore everything between [] */
+ for (pm = pe++; *pe != RBRACKET && *pe != EOS; pe++)
+ continue;
+ if (*pe == EOS) {
+ /*
+ * We could not find a matching RBRACKET.
+ * Ignore and just look for RBRACE
+ */
+ pe = pm;
+ }
+ }
+ else if (*pe == LBRACE)
+ i++;
+ else if (*pe == RBRACE) {
+ if (i == 0)
+ break;
+ i--;
+ }
+
+ /* Non matching braces; just glob the pattern */
+ if (i != 0 || *pe == EOS) {
+ *rv = glob0(patbuf, pglob, limit);
+ return 0;
+ }
+
+ for (i = 0, pl = pm = ptr; pm <= pe; pm++)
+ switch (*pm) {
+ case LBRACKET:
+ /* Ignore everything between [] */
+ for (pm1 = pm++; *pm != RBRACKET && *pm != EOS; pm++)
+ continue;
+ if (*pm == EOS) {
+ /*
+ * We could not find a matching RBRACKET.
+ * Ignore and just look for RBRACE
+ */
+ pm = pm1;
+ }
+ break;
+
+ case LBRACE:
+ i++;
+ break;
+
+ case RBRACE:
+ if (i) {
+ i--;
+ break;
+ }
+ [[fallthrough]];
+ case COMMA:
+ if (i && *pm == COMMA)
+ break;
+ else {
+ /* Append the current string */
+ for (lm = ls; (pl < pm); *lm++ = *pl++)
+ continue;
+ /*
+ * Append the rest of the pattern after the
+ * closing brace
+ */
+ for (pl = pe + 1; (*lm++ = *pl++) != EOS;)
+ continue;
+
+ /* Expand the current pattern */
+#ifdef DEBUG
+ qprintf("globexp2:", patbuf);
+#endif
+ *rv = globexp1(patbuf, pglob, limit);
+
+ /* move after the comma, to the next string */
+ pl = pm + 1;
+ }
+ break;
+
+ default:
+ break;
+ }
+ *rv = 0;
+ return 0;
+}
+
+
+
+/*
+ * expand tilde from the passwd file.
+ */
+static const Char *
+globtilde(const Char *pattern, Char *patbuf, size_t patbuf_len, glob_t *pglob)
+{
+ struct passwd *pwd;
+ char *h;
+ const Char *p;
+ Char *b, *eb;
+
+ if (*pattern != TILDE || !(pglob->gl_flags & GLOB_TILDE))
+ return pattern;
+
+ /*
+ * Copy up to the end of the string or /
+ */
+ eb = &patbuf[patbuf_len - 1];
+ for (p = pattern + 1, h = (char *) patbuf;
+ h < (char *)eb && *p && *p != SLASH; *h++ = (char)*p++)
+ continue;
+
+ *h = EOS;
+
+ if (((char *) patbuf)[0] == EOS) {
+ /*
+ * handle a plain ~ or ~/ by expanding $HOME first (iff
+ * we're not running setuid or setgid) and then trying
+ * the password file
+ */
+ if (issetugid() != 0 ||
+ (h = ::getenv("HOME")) == NULL) {
+ if (((h = getlogin()) != NULL &&
+ (pwd = getpwnam(h)) != NULL) ||
+ (pwd = getpwuid(getuid())) != NULL)
+ h = pwd->pw_dir;
+ else
+ return pattern;
+ }
+ }
+ else {
+ /*
+ * Expand a ~user
+ */
+ if ((pwd = getpwnam((char*) patbuf)) == NULL)
+ return pattern;
+ else
+ h = pwd->pw_dir;
+ }
+
+ /* Copy the home directory */
+ for (b = patbuf; b < eb && *h; *b++ = *h++)
+ continue;
+
+ /* Append the rest of the pattern */
+ while (b < eb && (*b++ = *p++) != EOS)
+ continue;
+ *b = EOS;
+
+ return patbuf;
+}
+
+
+/*
+ * The main glob() routine: compiles the pattern (optionally processing
+ * quotes), calls glob1() to do the real pattern matching, and finally
+ * sorts the list (unless unsorted operation is requested). Returns 0
+ * if things went well, nonzero if errors occurred.
+ */
+static int
+glob0(const Char *pattern, glob_t *pglob, int *limit)
+{
+ const Char *qpatnext;
+ int c, err, oldpathc;
+ Char *bufnext, patbuf[MAXPATHLEN];
+
+ qpatnext = globtilde(pattern, patbuf, MAXPATHLEN, pglob);
+ oldpathc = pglob->gl_pathc;
+ bufnext = patbuf;
+
+ /* We don't need to check for buffer overflow any more. */
+ while ((c = (char)*qpatnext++) != EOS) {
+ switch (c) {
+ case LBRACKET:
+ c = (char)*qpatnext;
+ if (c == NOT)
+ ++qpatnext;
+ if (*qpatnext == EOS ||
+ g_strchr((Char *) qpatnext+1, RBRACKET) == NULL) {
+ *bufnext++ = LBRACKET;
+ if (c == NOT)
+ --qpatnext;
+ break;
+ }
+ *bufnext++ = M_SET;
+ if (c == NOT)
+ *bufnext++ = M_NOT;
+ c = (char)*qpatnext++;
+ do {
+ *bufnext++ = CHAR(c);
+ if (*qpatnext == RANGE &&
+ (c = (char)qpatnext[1]) != RBRACKET) {
+ *bufnext++ = M_RNG;
+ *bufnext++ = CHAR(c);
+ qpatnext += 2;
+ }
+ } while ((c = (char)*qpatnext++) != RBRACKET);
+ pglob->gl_flags |= GLOB_MAGCHAR;
+ *bufnext++ = M_END;
+ break;
+ case QUESTION:
+ pglob->gl_flags |= GLOB_MAGCHAR;
+ *bufnext++ = M_ONE;
+ break;
+ case STAR:
+ pglob->gl_flags |= GLOB_MAGCHAR;
+ /* collapse adjacent stars to one,
+ * to avoid exponential behavior
+ */
+ if (bufnext == patbuf || bufnext[-1] != M_ALL)
+ *bufnext++ = M_ALL;
+ break;
+ default:
+ *bufnext++ = CHAR(c);
+ break;
+ }
+ }
+ *bufnext = EOS;
+#ifdef DEBUG
+ qprintf("glob0:", patbuf);
+#endif
+
+ if ((err = glob1(patbuf, pglob, limit)) != 0)
+ return(err);
+
+ /*
+ * If there was no match we are going to append the pattern
+ * if GLOB_NOCHECK was specified or if GLOB_NOMAGIC was specified
+ * and the pattern did not contain any magic characters
+ * GLOB_NOMAGIC is there just for compatibility with csh.
+ */
+ if (pglob->gl_pathc == oldpathc) {
+ if (((pglob->gl_flags & GLOB_NOCHECK) ||
+ ((pglob->gl_flags & GLOB_NOMAGIC) &&
+ !(pglob->gl_flags & GLOB_MAGCHAR))))
+ return(globextend(pattern, pglob, limit));
+ else
+ return(GLOB_NOMATCH);
+ }
+ if (!(pglob->gl_flags & GLOB_NOSORT))
+ qsort(pglob->gl_pathv + pglob->gl_offs + oldpathc,
+ pglob->gl_pathc - oldpathc, sizeof(char *), compare);
+ return(0);
+}
+
+static int
+compare(const void *p, const void *q)
+{
+ return(strcmp(*(char **)p, *(char **)q));
+}
+
+static int
+glob1(Char *pattern, glob_t *pglob, int *limit)
+{
+ Char pathbuf[MAXPATHLEN];
+
+ /* A null pathname is invalid -- POSIX 1003.1 sect. 2.4. */
+ if (*pattern == EOS)
+ return(0);
+ return(glob2(pathbuf, pathbuf, pathbuf + MAXPATHLEN - 1,
+ pattern, pglob, limit));
+}
+
+/*
+ * The functions glob2 and glob3 are mutually recursive; there is one level
+ * of recursion for each segment in the pattern that contains one or more
+ * meta characters.
+ */
+static int
+glob2(Char *pathbuf, Char *pathend, Char *pathend_last, Char *pattern, glob_t *pglob, int *limit)
+{
+ struct stat sb;
+ Char *p, *q;
+ int anymeta;
+
+ /*
+ * Loop over pattern segments until end of pattern or until
+ * segment with meta character found.
+ */
+ for (anymeta = 0;;) {
+ if (*pattern == EOS) { /* End of pattern? */
+ *pathend = EOS;
+ if (g_lstat(pathbuf, &sb, pglob))
+ return(0);
+
+ if (((pglob->gl_flags & GLOB_MARK) &&
+ pathend[-1] != SEP) && (S_ISDIR(sb.st_mode))) {
+ if (pathend + 1 > pathend_last)
+ return (GLOB_ABORTED);
+ *pathend++ = SEP;
+ *pathend = EOS;
+ }
+ ++pglob->gl_matchc;
+ return(globextend(pathbuf, pglob, limit));
+ }
+
+ /* Find end of next segment, copy tentatively to pathend. */
+ q = pathend;
+ p = pattern;
+ while (*p != EOS && *p != SEP) {
+ if (ismeta(*p))
+ anymeta = 1;
+ if (q + 1 > pathend_last)
+ return (GLOB_ABORTED);
+ *q++ = *p++;
+ }
+
+ if (!anymeta) { /* No expansion, do next segment. */
+ pathend = q;
+ pattern = p;
+ while (*pattern == SEP) {
+ if (pathend + 1 > pathend_last)
+ return (GLOB_ABORTED);
+ *pathend++ = *pattern++;
+ }
+ } else /* Need expansion, recurse. */
+ return(glob3(pathbuf, pathend, pathend_last, pattern, p,
+ pglob, limit));
+ }
+ /* NOTREACHED */
+}
+
+static int
+glob3(Char *pathbuf, Char *pathend, Char *pathend_last, Char *pattern, Char *restpattern, glob_t *pglob, int *limit)
+{
+ struct dirent *dp;
+ DIR *dirp;
+ int err;
+ char buf[MAXPATHLEN];
+
+ /*
+ * The readdirfunc declaration can't be prototyped, because it is
+ * assigned, below, to two functions which are prototyped in glob.h
+ * and dirent.h as taking pointers to differently typed opaque
+ * structures.
+ */
+ typedef struct dirent *(*readdirfunc_t)(void*);
+ readdirfunc_t readdirfunc;
+
+ if (pathend > pathend_last)
+ return (GLOB_ABORTED);
+ *pathend = EOS;
+ errno = 0;
+
+ if ((dirp = g_opendir(pathbuf, pglob)) == NULL) {
+ /* TODO: don't call for ENOENT or ENOTDIR? */
+ if (pglob->gl_errfunc) {
+ if (g_Ctoc(pathbuf, buf, sizeof(buf)))
+ return (GLOB_ABORTED);
+ if (pglob->gl_errfunc(buf, errno) ||
+ pglob->gl_flags & GLOB_ERR)
+ return (GLOB_ABORTED);
+ }
+ return(0);
+ }
+
+ err = 0;
+
+ /* Search directory for matching names. */
+ if (pglob->gl_flags & GLOB_ALTDIRFUNC)
+ readdirfunc = pglob->gl_readdir;
+ else
+ readdirfunc = (readdirfunc_t)readdir;
+ while ((dp = (*readdirfunc)(dirp))) {
+ u_char *sc;
+ Char *dc;
+ wchar_t wc;
+ size_t clen;
+ mbstate_t mbs;
+
+ /* Initial DOT must be matched literally. */
+ if (dp->d_name[0] == DOT && *pattern != DOT)
+ continue;
+ memset(&mbs, 0, sizeof(mbs));
+ dc = pathend;
+ sc = (u_char *) dp->d_name;
+ while (dc < pathend_last) {
+ clen = mbrtowc(&wc, (const char*)sc, MB_LEN_MAX, &mbs);
+ if (clen == (size_t)-1 || clen == (size_t)-2) {
+ wc = *sc;
+ clen = 1;
+ memset(&mbs, 0, sizeof(mbs));
+ }
+ if ((*dc++ = wc) == EOS)
+ break;
+ sc += clen;
+ }
+ if (!match(pathend, pattern, restpattern)) {
+ *pathend = EOS;
+ continue;
+ }
+ err = glob2(pathbuf, --dc, pathend_last, restpattern,
+ pglob, limit);
+ if (err)
+ break;
+ }
+
+ if (pglob->gl_flags & GLOB_ALTDIRFUNC)
+ (*pglob->gl_closedir)(dirp);
+ else
+ closedir(dirp);
+ return(err);
+}
+
+
+/*
+ * Extend the gl_pathv member of a glob_t structure to accommodate a new item,
+ * add the new item, and update gl_pathc.
+ *
+ * This assumes the BSD realloc, which only copies the block when its size
+ * crosses a power-of-two boundary; for v7 realloc, this would cause quadratic
+ * behavior.
+ *
+ * Return 0 if new item added, error code if memory couldn't be allocated.
+ *
+ * Invariant of the glob_t structure:
+ * Either gl_pathc is zero and gl_pathv is NULL; or gl_pathc > 0 and
+ * gl_pathv points to (gl_offs + gl_pathc + 1) items.
+ */
+static int
+globextend(const Char *path, glob_t *pglob, int *limit)
+{
+ char **pathv;
+ int i;
+ size_t newsize, len;
+ char *copy;
+ const Char *p;
+
+ if (*limit && pglob->gl_pathc > *limit) {
+ errno = 0;
+ return (GLOB_NOSPACE);
+ }
+
+ newsize = sizeof(*pathv) * (2 + pglob->gl_pathc + pglob->gl_offs);
+ pathv = pglob->gl_pathv ?
+ (char**)realloc((char *)pglob->gl_pathv, newsize) :
+ (char**)malloc(newsize);
+ if (pathv == NULL) {
+ if (pglob->gl_pathv) {
+ free(pglob->gl_pathv);
+ pglob->gl_pathv = NULL;
+ }
+ return(GLOB_NOSPACE);
+ }
+
+ if (pglob->gl_pathv == NULL && pglob->gl_offs > 0) {
+ /* first time around -- clear initial gl_offs items */
+ pathv += pglob->gl_offs;
+ for (i = pglob->gl_offs; --i >= 0; )
+ *--pathv = NULL;
+ }
+ pglob->gl_pathv = pathv;
+
+ for (p = path; *p++;)
+ continue;
+ len = MB_CUR_MAX * (size_t)(p - path); /* XXX overallocation */
+ if ((copy = (char*)malloc(len)) != NULL) {
+ if (g_Ctoc(path, copy, (u_int)len)) {
+ free(copy);
+ return (GLOB_NOSPACE);
+ }
+ pathv[pglob->gl_offs + pglob->gl_pathc++] = copy;
+ }
+ pathv[pglob->gl_offs + pglob->gl_pathc] = NULL;
+ return(copy == NULL ? GLOB_NOSPACE : 0);
+}
+
+/*
+ * pattern matching function for filenames. Each occurrence of the *
+ * pattern causes a recursion level.
+ */
+static int
+match(Char *name, Char *pat, Char *patend)
+{
+ int ok, negate_range;
+ Char c, k;
+
+ while (pat < patend) {
+ c = *pat++;
+ switch (c & M_MASK) {
+ case M_ALL:
+ if (pat == patend)
+ return(1);
+ do
+ if (match(name, pat, patend))
+ return(1);
+ while (*name++ != EOS);
+ return(0);
+ case M_ONE:
+ if (*name++ == EOS)
+ return(0);
+ break;
+ case M_SET:
+ ok = 0;
+ if ((k = *name++) == EOS)
+ return(0);
+ if ((negate_range = ((*pat & M_MASK) == M_NOT)) != EOS)
+ ++pat;
+ while (((c = *pat++) & M_MASK) != M_END)
+ if ((*pat & M_MASK) == M_RNG) {
+ if (__collate_load_error ?
+ CHAR(c) <= CHAR(k) && CHAR(k) <= CHAR(pat[1]) :
+ __collate_range_cmp((int)CHAR(c), (int)CHAR(k)) <= 0
+ && __collate_range_cmp((int)CHAR(k), (int)CHAR(pat[1])) <= 0
+ )
+ ok = 1;
+ pat += 2;
+ } else if (c == k)
+ ok = 1;
+ if (ok == negate_range)
+ return(0);
+ break;
+ default:
+ if (*name++ != c)
+ return(0);
+ break;
+ }
+ }
+ return(*name == EOS);
+}
+
+/* Free allocated data belonging to a glob_t structure. */
+void
+globfree(glob_t *pglob)
+{
+ int i;
+ char **pp;
+
+ if (pglob->gl_pathv != NULL) {
+ pp = pglob->gl_pathv + pglob->gl_offs;
+ for (i = pglob->gl_pathc; i--; ++pp)
+ if (*pp)
+ free(*pp);
+ free(pglob->gl_pathv);
+ pglob->gl_pathv = NULL;
+ }
+}
+
+static DIR *
+g_opendir(Char *str, glob_t *pglob)
+{
+ char buf[MAXPATHLEN];
+
+ if (!*str)
+ strcpy(buf, ".");
+ else {
+ if (g_Ctoc(str, buf, sizeof(buf)))
+ return (NULL);
+ }
+
+ if (pglob->gl_flags & GLOB_ALTDIRFUNC)
+ return (DIR*)((*pglob->gl_opendir)(buf));
+
+ return(opendir(buf));
+}
+
+static int
+g_lstat(Char *fn, struct stat *sb, glob_t *pglob)
+{
+ char buf[MAXPATHLEN];
+
+ if (g_Ctoc(fn, buf, sizeof(buf))) {
+ errno = ENAMETOOLONG;
+ return (-1);
+ }
+ if (pglob->gl_flags & GLOB_ALTDIRFUNC)
+ return((*pglob->gl_lstat)(buf, sb));
+ return(lstat(buf, sb));
+}
+
+static Char *
+g_strchr(Char *str, wchar_t ch)
+{
+ do {
+ if (*str == ch)
+ return (str);
+ } while (*str++);
+ return (NULL);
+}
+
+static int
+g_Ctoc(const Char *str, char *buf, u_int len)
+{
+ mbstate_t mbs;
+ size_t clen;
+
+ memset(&mbs, 0, sizeof(mbs));
+ while ((int)len >= MB_CUR_MAX) {
+ clen = wcrtomb(buf, (wchar_t)*str, &mbs);
+ if (clen == (size_t)-1)
+ return (1);
+ if (*str == L'\0')
+ return (0);
+ str++;
+ buf += clen;
+ len -= (u_int)clen;
+ }
+ return (1);
+}
+
+#ifdef DEBUG
+static void
+qprintf(const char *str, Char *s)
+{
+ Char *p;
+
+ (void)printf("%s:\n", str);
+ for (p = s; *p; p++)
+ (void)printf("%c", CHAR(*p));
+ (void)printf("\n");
+ for (p = s; *p; p++)
+ (void)printf("%c", *p & M_PROTECT ? '"' : ' ');
+ (void)printf("\n");
+ for (p = s; *p; p++)
+ (void)printf("%c", ismeta(*p) ? '_' : ' ');
+ (void)printf("\n");
+}
+#endif
+#endif
diff --git a/library/cpp/regex/glob/glob_compat.h b/library/cpp/regex/glob/glob_compat.h
new file mode 100644
index 0000000000..0dc518d51b
--- /dev/null
+++ b/library/cpp/regex/glob/glob_compat.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <util/system/defaults.h>
+
+#if defined(_MSC_VER) || defined(_bionic_)
+#define USE_INTERNAL_GLOB
+#endif
+
+#if !defined(USE_INTERNAL_GLOB)
+#include <glob.h>
+#else
+
+struct stat;
+typedef struct {
+ int gl_pathc; /* Count of total paths so far. */
+ int gl_matchc; /* Count of paths matching pattern. */
+ int gl_offs; /* Reserved at beginning of gl_pathv. */
+ int gl_flags; /* Copy of flags parameter to glob. */
+ char** gl_pathv; /* List of paths matching pattern. */
+ /* Copy of errfunc parameter to glob. */
+ int (*gl_errfunc)(const char*, int);
+
+ /*
+ * Alternate filesystem access methods for glob; replacement
+ * versions of closedir(3), readdir(3), opendir(3), stat(2)
+ * and lstat(2).
+ */
+ void (*gl_closedir)(void*);
+ struct dirent* (*gl_readdir)(void*);
+ void* (*gl_opendir)(const char*);
+ int (*gl_lstat)(const char*, struct stat*);
+ int (*gl_stat)(const char*, struct stat*);
+} glob_t;
+
+//#if __POSIX_VISIBLE >= 199209
+/* Believed to have been introduced in 1003.2-1992 */
+#define GLOB_APPEND 0x0001 /* Append to output from previous call. */
+#define GLOB_DOOFFS 0x0002 /* Use gl_offs. */
+#define GLOB_ERR 0x0004 /* Return on error. */
+#define GLOB_MARK 0x0008 /* Append / to matching directories. */
+#define GLOB_NOCHECK 0x0010 /* Return pattern itself if nothing matches. */
+#define GLOB_NOSORT 0x0020 /* Don't sort. */
+#define GLOB_NOESCAPE 0x2000 /* Disable backslash escaping. */
+
+/* Error values returned by glob(3) */
+#define GLOB_NOSPACE (-1) /* Malloc call failed. */
+#define GLOB_ABORTED (-2) /* Unignored error. */
+#define GLOB_NOMATCH (-3) /* No match and GLOB_NOCHECK was not set. */
+#define GLOB_NOSYS (-4) /* Obsolete: source compatibility only. */
+//#endif /* __POSIX_VISIBLE >= 199209 */
+
+//#if __BSD_VISIBLE
+#define GLOB_ALTDIRFUNC 0x0040 /* Use alternately specified directory funcs. */
+#define GLOB_BRACE 0x0080 /* Expand braces ala csh. */
+#define GLOB_MAGCHAR 0x0100 /* Pattern had globbing characters. */
+#define GLOB_NOMAGIC 0x0200 /* GLOB_NOCHECK without magic chars (csh). */
+#define GLOB_QUOTE 0x0400 /* Quote special chars with \. */
+#define GLOB_TILDE 0x0800 /* Expand tilde names from the passwd file. */
+#define GLOB_LIMIT 0x1000 /* limit number of returned paths */
+
+/* source compatibility, these are the old names */
+#define GLOB_MAXPATH GLOB_LIMIT
+#define GLOB_ABEND GLOB_ABORTED
+//#endif /* __BSD_VISIBLE */
+
+int glob(const char*, int, int (*)(const char*, int), glob_t*);
+void globfree(glob_t*);
+
+#endif /* USE_INTERNAL_GLOB */
+
+#if !defined(FROM_IMPLEMENTATION)
+#undef USE_INTERNAL_GLOB
+#endif
diff --git a/library/cpp/regex/glob/glob_iterator.cpp b/library/cpp/regex/glob/glob_iterator.cpp
new file mode 100644
index 0000000000..746b49f397
--- /dev/null
+++ b/library/cpp/regex/glob/glob_iterator.cpp
@@ -0,0 +1 @@
+#include "glob_iterator.h"
diff --git a/library/cpp/regex/glob/glob_iterator.h b/library/cpp/regex/glob/glob_iterator.h
new file mode 100644
index 0000000000..e25481e594
--- /dev/null
+++ b/library/cpp/regex/glob/glob_iterator.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "glob_compat.h"
+
+#include <util/generic/noncopyable.h>
+#include <util/generic/string.h>
+#include <util/generic/yexception.h>
+
+class TGlobPaths : TNonCopyable {
+public:
+ TGlobPaths(const char* pattern) {
+ Impl.gl_pathc = 0;
+ int result = glob(pattern, 0, nullptr, &Impl);
+ Y_ENSURE(result == 0 || result == GLOB_NOMATCH, "glob failed");
+ }
+
+ TGlobPaths(const TString& pattern)
+ : TGlobPaths(pattern.data())
+ {
+ }
+
+ ~TGlobPaths() {
+ globfree(&Impl);
+ }
+
+ const char** begin() {
+ return const_cast<const char**>(Impl.gl_pathv);
+ }
+
+ const char** end() {
+ return const_cast<const char**>(Impl.gl_pathv + Impl.gl_pathc);
+ }
+
+private:
+ glob_t Impl;
+};
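+
+// Illustrative usage sketch (an assumption, not part of the original interface): since
+// begin()/end() return plain const char** pointers, TGlobPaths works directly with
+// range-based for loops; the helper name below is a placeholder.
+inline size_t ExampleCountMatches(const char* pattern) {
+ TGlobPaths paths(pattern);
+ // Range-based iteration also works: for (const char* path : paths) { ... }
+ return static_cast<size_t>(paths.end() - paths.begin());
+}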
diff --git a/library/cpp/regex/glob/ya.make b/library/cpp/regex/glob/ya.make
new file mode 100644
index 0000000000..9379742d99
--- /dev/null
+++ b/library/cpp/regex/glob/ya.make
@@ -0,0 +1,12 @@
+LIBRARY()
+
+SRCS(
+ glob.cpp
+ glob_iterator.cpp
+)
+
+PEERDIR(
+ library/cpp/charset
+)
+
+END()
diff --git a/library/cpp/reverse_geocoder/CMakeLists.txt b/library/cpp/reverse_geocoder/CMakeLists.txt
new file mode 100644
index 0000000000..621e95fdb2
--- /dev/null
+++ b/library/cpp/reverse_geocoder/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+add_subdirectory(core)
+add_subdirectory(library)
+add_subdirectory(proto)
diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.darwin-x86_64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..17f6e79c96
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,35 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-core)
+target_link_libraries(cpp-reverse_geocoder-core PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-reverse_geocoder-library
+ cpp-reverse_geocoder-proto
+ cpp-digest-crc32c
+)
+target_sources(cpp-reverse_geocoder-core PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp
+)
diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.linux-aarch64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..02361a0a1a
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,36 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-core)
+target_link_libraries(cpp-reverse_geocoder-core PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-reverse_geocoder-library
+ cpp-reverse_geocoder-proto
+ cpp-digest-crc32c
+)
+target_sources(cpp-reverse_geocoder-core PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp
+)
diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.linux-x86_64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..02361a0a1a
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,36 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-core)
+target_link_libraries(cpp-reverse_geocoder-core PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-reverse_geocoder-library
+ cpp-reverse_geocoder-proto
+ cpp-digest-crc32c
+)
+target_sources(cpp-reverse_geocoder-core PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp
+)
diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.txt b/library/cpp/reverse_geocoder/core/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/reverse_geocoder/core/CMakeLists.windows-x86_64.txt b/library/cpp/reverse_geocoder/core/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..17f6e79c96
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,35 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-core)
+target_link_libraries(cpp-reverse_geocoder-core PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-reverse_geocoder-library
+ cpp-reverse_geocoder-proto
+ cpp-digest-crc32c
+)
+target_sources(cpp-reverse_geocoder-core PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/area_box.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/bbox.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/common.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/edge.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/kv.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/location.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/part.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/point.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/polygon.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/region.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/debug.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/def.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/map.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp
+)
diff --git a/library/cpp/reverse_geocoder/core/area_box.cpp b/library/cpp/reverse_geocoder/core/area_box.cpp
new file mode 100644
index 0000000000..67038fe4f8
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/area_box.cpp
@@ -0,0 +1,9 @@
+#include "area_box.h"
+
+using namespace NReverseGeocoder;
+
+TRef NReverseGeocoder::LookupAreaBox(const TPoint& point) {
+ const TRef boxX = (point.X - NAreaBox::LowerX) / NAreaBox::DeltaX;
+ const TRef boxY = (point.Y - NAreaBox::LowerY) / NAreaBox::DeltaY;
+ return boxX * NAreaBox::NumberY + boxY;
+}
diff --git a/library/cpp/reverse_geocoder/core/area_box.h b/library/cpp/reverse_geocoder/core/area_box.h
new file mode 100644
index 0000000000..1077a65fef
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/area_box.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "common.h"
+#include "point.h"
+
+namespace NReverseGeocoder {
+ namespace NAreaBox {
+ const TCoordinate LowerX = ToCoordinate(-180.0);
+ const TCoordinate UpperX = ToCoordinate(180.0);
+ const TCoordinate LowerY = ToCoordinate(-90.0);
+ const TCoordinate UpperY = ToCoordinate(90.0);
+ const TCoordinate DeltaX = ToCoordinate(0.1);
+ const TCoordinate DeltaY = ToCoordinate(0.1);
+ const TCoordinate NumberX = (UpperX - LowerX) / DeltaX;
+ const TCoordinate NumberY = (UpperY - LowerY) / DeltaY;
+ const TCoordinate Number = NumberX * NumberY;
+
+ }
+
+ // Area of a geo territory. PolygonRefsOffset refers to the polygons lying inside this
+ // area. The geo map is divided into equal bounding boxes from (NAreaBox::LowerX, NAreaBox::LowerY)
+ // to (NAreaBox::UpperX, NAreaBox::UpperY) with cell sizes DeltaX and DeltaY. The filling logic
+ // lives in the generator.
+ struct Y_PACKED TAreaBox {
+ TNumber PolygonRefsOffset;
+ TNumber PolygonRefsNumber;
+ };
+
+ static_assert(sizeof(TAreaBox) == 8, "NReverseGeocoder::TAreaBox size mismatch");
+
+ // Determine which area box of the geo data the given point falls into.
+ TRef LookupAreaBox(const TPoint& point);
+
+}
diff --git a/library/cpp/reverse_geocoder/core/bbox.cpp b/library/cpp/reverse_geocoder/core/bbox.cpp
new file mode 100644
index 0000000000..aa4258ac22
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/bbox.cpp
@@ -0,0 +1 @@
+#include "bbox.h"
diff --git a/library/cpp/reverse_geocoder/core/bbox.h b/library/cpp/reverse_geocoder/core/bbox.h
new file mode 100644
index 0000000000..e8b6e00aa3
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/bbox.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "common.h"
+#include "point.h"
+
+#include <util/generic/utility.h>
+
+namespace NReverseGeocoder {
+ struct Y_PACKED TBoundingBox {
+ TCoordinate X1;
+ TCoordinate Y1;
+ TCoordinate X2;
+ TCoordinate Y2;
+
+ TBoundingBox()
+ : X1(0)
+ , Y1(0)
+ , X2(0)
+ , Y2(0)
+ {
+ }
+
+ TBoundingBox(TCoordinate x1, TCoordinate y1, TCoordinate x2, TCoordinate y2)
+ : X1(x1)
+ , Y1(y1)
+ , X2(x2)
+ , Y2(y2)
+ {
+ }
+
+ TBoundingBox(const TPoint* points, TNumber number) {
+ Init();
+ for (TNumber i = 0; i < number; ++i)
+ Relax(points[i]);
+ }
+
+ void Init() {
+ X1 = ToCoordinate(180.0);
+ Y1 = ToCoordinate(90.0);
+ X2 = ToCoordinate(-180.0);
+ Y2 = ToCoordinate(-90.0);
+ }
+
+ void Relax(const TPoint& p) {
+ X1 = Min(X1, p.X);
+ Y1 = Min(Y1, p.Y);
+ X2 = Max(X2, p.X);
+ Y2 = Max(Y2, p.Y);
+ }
+
+ bool HasIntersection(const TBoundingBox& r) const {
+ if (X1 > r.X2 || X2 < r.X1 || Y1 > r.Y2 || Y2 < r.Y1)
+ return false;
+ return true;
+ }
+
+ bool Contains(const TPoint& p) const {
+ if (p.X < X1 || p.X > X2 || p.Y < Y1 || p.Y > Y2)
+ return false;
+ return true;
+ }
+ };
+
+ static_assert(sizeof(TBoundingBox) == 16, "NReverseGeocoder::TBoundingBox size mismatch");
+
+}
diff --git a/library/cpp/reverse_geocoder/core/common.cpp b/library/cpp/reverse_geocoder/core/common.cpp
new file mode 100644
index 0000000000..67c02a20a0
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/common.cpp
@@ -0,0 +1 @@
+#include "common.h"
diff --git a/library/cpp/reverse_geocoder/core/common.h b/library/cpp/reverse_geocoder/core/common.h
new file mode 100644
index 0000000000..090407ffd9
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/common.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <util/system/compiler.h>
+#include <util/system/types.h>
+
+namespace NReverseGeocoder {
+ using TCoordinate = i32;
+ using TGeoId = ui64;
+ using TNumber = ui32;
+ using TRef = ui32;
+ using TSquare = i64;
+ using TVersion = ui64;
+
+ const double EARTH_RADIUS = 6371000.0;
+
+ inline TCoordinate ToCoordinate(double x) {
+ return x * 1e6;
+ }
+
+ inline double ToDouble(TCoordinate x) {
+ return x / 1e6;
+ }
+
+}
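Coordinates are stored as fixed-point micro-degrees in an i32, so ToCoordinate/ToDouble round-trip with roughly 1e-6 degree precision. A minimal sketch of that round trip, assuming standard types in place of the library aliases:

    // Illustrative fixed-point round trip (micro-degrees), not the library code.
    #include <cstdint>
    #include <cstdio>

    using Coordinate = int32_t;

    Coordinate ToCoordinate(double x) { return static_cast<Coordinate>(x * 1e6); }
    double ToDouble(Coordinate x) { return x / 1e6; }

    int main() {
        const double lon = 37.617698;
        const Coordinate c = ToCoordinate(lon);      // truncated to whole micro-degrees
        std::printf("%d -> %.6f\n", c, ToDouble(c)); // error stays below 1e-6 degrees
    }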
diff --git a/library/cpp/reverse_geocoder/core/edge.cpp b/library/cpp/reverse_geocoder/core/edge.cpp
new file mode 100644
index 0000000000..86c6ab8535
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/edge.cpp
@@ -0,0 +1 @@
+#include "edge.h"
diff --git a/library/cpp/reverse_geocoder/core/edge.h b/library/cpp/reverse_geocoder/core/edge.h
new file mode 100644
index 0000000000..9d20928857
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/edge.h
@@ -0,0 +1,101 @@
+#pragma once
+
+#include "common.h"
+#include "point.h"
+
+#include <util/generic/utility.h>
+#include <util/system/yassert.h>
+
+namespace NReverseGeocoder {
+ // TEdge represents a polygon edge; Beg/End refer to the begin/end edge points in the
+ // geographical data.
+ struct Y_PACKED TEdge {
+ TRef Beg;
+ TRef End;
+
+ TEdge()
+ : Beg(0)
+ , End(0)
+ {
+ }
+
+ TEdge(const TRef& a, const TRef& b)
+ : Beg(a)
+ , End(b)
+ {
+ }
+
+ bool operator==(const TEdge& e) const {
+ return Beg == e.Beg && End == e.End;
+ }
+
+ bool operator!=(const TEdge& e) const {
+ return Beg != e.Beg || End != e.End;
+ }
+
+ bool operator<(const TEdge& e) const {
+ return Beg < e.Beg || (Beg == e.Beg && End < e.End);
+ }
+
+ // Checks that the current edge lies lower than the other edge. Both edges must share a common
+ // X range, otherwise the behavior is undefined.
+ bool Lower(const TEdge& e, const TPoint* points) const {
+ if (*this == e)
+ return false;
+
+ const TPoint& a1 = points[Beg];
+ const TPoint& a2 = points[End];
+ const TPoint& b1 = points[e.Beg];
+ const TPoint& b2 = points[e.End];
+
+ Y_ASSERT(a1.X <= a2.X && b1.X <= b2.X);
+
+ if (a1 == b1) {
+ return (a2 - a1).Cross(b2 - a1) > 0;
+ } else if (a2 == b2) {
+ return (a1 - b1).Cross(b2 - b1) > 0;
+ } else if (b1.X >= a1.X && b1.X <= a2.X) {
+ return (a2 - a1).Cross(b1 - a1) > 0;
+ } else if (b2.X >= a1.X && b2.X <= a2.X) {
+ return (a2 - a1).Cross(b2 - a1) > 0;
+ } else if (a1.X >= b1.X && a1.X <= b2.X) {
+ return (a1 - b1).Cross(b2 - b1) > 0;
+ } else if (a2.X >= b1.X && a2.X <= b2.X) {
+ return (a2 - b1).Cross(b2 - b1) > 0;
+ } else {
+ return false;
+ }
+ }
+
+ // Checks that the current edge lies lower than the given point. The edge and the point must
+ // share a common X range, otherwise the behavior is undefined.
+ bool Lower(const TPoint& p, const TPoint* points) const {
+ if (Contains(p, points))
+ return false;
+
+ TPoint a = points[Beg];
+ TPoint b = points[End];
+
+ if (a.X > b.X)
+ DoSwap(a, b);
+
+ return (b - a).Cross(p - a) > 0;
+ }
+
+ bool Contains(const TPoint& p, const TPoint* points) const {
+ TPoint a = points[Beg];
+ TPoint b = points[End];
+
+ if (a.X > b.X)
+ DoSwap(a, b);
+
+ if (p.X < a.X || p.X > b.X)
+ return false;
+
+ return (b - a).Cross(p - a) == 0;
+ }
+ };
+
+ static_assert(sizeof(TEdge) == 8, "NReverseGeocoder::TEdge size mismatch");
+
+}
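The Lower/Contains checks above reduce to the sign of a 2D cross product. Here is a small standalone sketch (plain structs, hypothetical names) of the "edge lies below point" idea, assuming the edge endpoints are ordered so that a.X <= b.X, as the library enforces with DoSwap:

    // Sketch of the cross-product test behind TEdge::Lower(point): for an edge (a, b) with
    // a.X <= b.X, the edge lies strictly below p iff cross(b - a, p - a) > 0.
    #include <cassert>
    #include <cstdint>

    struct Pt {
        int32_t X, Y;
    };

    int64_t Cross(Pt a, Pt b) { // z-component of the 2D cross product
        return static_cast<int64_t>(a.X) * b.Y - static_cast<int64_t>(a.Y) * b.X;
    }

    bool EdgeLowerThanPoint(Pt a, Pt b, Pt p) {
        if (a.X > b.X) {
            Pt t = a; a = b; b = t; // keep a.X <= b.X
        }
        const Pt ab{b.X - a.X, b.Y - a.Y};
        const Pt ap{p.X - a.X, p.Y - a.Y};
        return Cross(ab, ap) > 0;
    }

    int main() {
        // Horizontal edge y = 0 from x = 0 to x = 10; the point (5, 1) is above it.
        assert(EdgeLowerThanPoint({0, 0}, {10, 0}, {5, 1}));
        assert(!EdgeLowerThanPoint({0, 0}, {10, 0}, {5, -1}));
    }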
diff --git a/library/cpp/reverse_geocoder/core/geo_data/debug.cpp b/library/cpp/reverse_geocoder/core/geo_data/debug.cpp
new file mode 100644
index 0000000000..4db0534b22
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/debug.cpp
@@ -0,0 +1,74 @@
+#include "debug.h"
+
+#include <library/cpp/reverse_geocoder/library/log.h>
+#include <library/cpp/reverse_geocoder/library/memory.h>
+
+using namespace NReverseGeocoder;
+using namespace NGeoData;
+
+size_t NReverseGeocoder::NGeoData::Space(const IGeoData& g) {
+ size_t space = 0;
+
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ space += sizeof(TVar);
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ space += sizeof(TNumber) + sizeof(TArr) * g.Arr##Number();
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ return space;
+}
+
+template <typename TArr>
+static float ArraySpace(TNumber number) {
+ return number * sizeof(TArr) * 1.0 / MB;
+}
+
+void NReverseGeocoder::NGeoData::Show(IOutputStream& out, const IGeoData& g) {
+ out << "GeoData = " << NGeoData::Space(g) * 1.0 / GB << " GB" << '\n';
+
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ out << " GeoData." << #Var << " = " << (unsigned long long)g.Var() << '\n';
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ out << " GeoData." << #Arr << " = " \
+ << g.Arr##Number() << " x " << sizeof(TArr) << " = " \
+ << ArraySpace<TArr>(g.Arr##Number()) << " MB" \
+ << '\n';
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+}
+
+template <typename TArr>
+static bool Equals(const TArr* a, const TArr* b, size_t count) {
+ return !memcmp(a, b, sizeof(TArr) * count);
+}
+
+bool NReverseGeocoder::NGeoData::Equals(const IGeoData& a, const IGeoData& b) {
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ if (a.Var() != b.Var()) { \
+ LogError(#Var " not equal"); \
+ return false; \
+ }
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ GEO_BASE_DEF_VAR(TNumber, Arr##Number); \
+ if (!::Equals(a.Arr(), b.Arr(), a.Arr##Number())) { \
+ LogError(#Arr " not equal"); \
+ return false; \
+ }
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ return true;
+}
diff --git a/library/cpp/reverse_geocoder/core/geo_data/debug.h b/library/cpp/reverse_geocoder/core/geo_data/debug.h
new file mode 100644
index 0000000000..e7a4d9029c
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/debug.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "geo_data.h"
+
+#include <util/stream/output.h>
+
+namespace NReverseGeocoder {
+ namespace NGeoData {
+ size_t Space(const IGeoData& g);
+
+ void Show(IOutputStream& out, const IGeoData& g);
+
+ bool Equals(const IGeoData& a, const IGeoData& b);
+
+ }
+}
diff --git a/library/cpp/reverse_geocoder/core/geo_data/def.cpp b/library/cpp/reverse_geocoder/core/geo_data/def.cpp
new file mode 100644
index 0000000000..bb9f760d73
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/def.cpp
@@ -0,0 +1 @@
+#include "def.h"
diff --git a/library/cpp/reverse_geocoder/core/geo_data/def.h b/library/cpp/reverse_geocoder/core/geo_data/def.h
new file mode 100644
index 0000000000..d3e331d873
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/def.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <library/cpp/reverse_geocoder/core/area_box.h>
+#include <library/cpp/reverse_geocoder/core/common.h>
+#include <library/cpp/reverse_geocoder/core/edge.h>
+#include <library/cpp/reverse_geocoder/core/kv.h>
+#include <library/cpp/reverse_geocoder/core/part.h>
+#include <library/cpp/reverse_geocoder/core/point.h>
+#include <library/cpp/reverse_geocoder/core/polygon.h>
+#include <library/cpp/reverse_geocoder/core/region.h>
+
+namespace NReverseGeocoder {
+ const TVersion GEO_DATA_VERSION_0 = 0;
+ const TVersion GEO_DATA_VERSION_1 = 1;
+
+ const TVersion GEO_DATA_CURRENT_VERSION = GEO_DATA_VERSION_1;
+
+// Geographical data definition. This define is needed for reflection in map/unmap, show, etc.
+#define GEO_BASE_DEF_GEO_DATA \
+ GEO_BASE_DEF_VAR(TVersion, Version); \
+ GEO_BASE_DEF_ARR(TPoint, Points); \
+ GEO_BASE_DEF_ARR(TEdge, Edges); \
+ GEO_BASE_DEF_ARR(TRef, EdgeRefs); \
+ GEO_BASE_DEF_ARR(TPart, Parts); \
+ GEO_BASE_DEF_ARR(TPolygon, Polygons); \
+ GEO_BASE_DEF_ARR(TRef, PolygonRefs); \
+ GEO_BASE_DEF_ARR(TAreaBox, Boxes); \
+ GEO_BASE_DEF_ARR(char, Blobs); \
+ GEO_BASE_DEF_ARR(TKv, Kvs); \
+ GEO_BASE_DEF_ARR(TRegion, Regions); \
+ GEO_BASE_DEF_ARR(TRawPolygon, RawPolygons); \
+ GEO_BASE_DEF_ARR(TRef, RawEdgeRefs); \
+ // #define GEO_BASE_DEF_GEO_DATA
+
+}
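GEO_BASE_DEF_GEO_DATA is an X-macro: every consumer defines GEO_BASE_DEF_VAR/GEO_BASE_DEF_ARR for its own purpose, expands the list, then undefines them, so the field list is written only once. A minimal standalone sketch of the same pattern with a toy field list and hypothetical names:

    // Toy version of the X-macro reflection used by IGeoData and TGeoDataMap.
    #include <cstdio>

    #define TOY_DEF_FIELDS        \
        TOY_DEF_VAR(int, Version)   \
        TOY_DEF_VAR(int, PointsNumber)

    struct TToy {
    // 1) Expand the list into member declarations.
    #define TOY_DEF_VAR(TVar, Var) TVar Var;
        TOY_DEF_FIELDS
    #undef TOY_DEF_VAR
    };

    // 2) Expand the same list into printing code, so fields are never listed twice by hand.
    void Show(const TToy& t) {
    #define TOY_DEF_VAR(TVar, Var) std::printf(#Var " = %d\n", t.Var);
        TOY_DEF_FIELDS
    #undef TOY_DEF_VAR
    }

    int main() {
        TToy t{1, 42};
        Show(t);
    }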
diff --git a/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp b/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp
new file mode 100644
index 0000000000..be3310b291
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/geo_data.cpp
@@ -0,0 +1 @@
+#include "geo_data.h"
diff --git a/library/cpp/reverse_geocoder/core/geo_data/geo_data.h b/library/cpp/reverse_geocoder/core/geo_data/geo_data.h
new file mode 100644
index 0000000000..7cb76bcddc
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/geo_data.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "def.h"
+
+namespace NReverseGeocoder {
+ class IGeoData {
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ virtual const TVar& Var() const = 0;
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ virtual const TArr* Arr() const = 0; \
+ virtual TNumber Arr##Number() const = 0;
+
+ public:
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ virtual ~IGeoData() {
+ }
+ };
+
+}
diff --git a/library/cpp/reverse_geocoder/core/geo_data/map.cpp b/library/cpp/reverse_geocoder/core/geo_data/map.cpp
new file mode 100644
index 0000000000..312f7d7cb0
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/map.cpp
@@ -0,0 +1,203 @@
+#include "map.h"
+
+#include <library/cpp/reverse_geocoder/library/log.h>
+#include <library/cpp/reverse_geocoder/library/system.h>
+#include <library/cpp/reverse_geocoder/proto/geo_data.pb.h>
+
+#include <library/cpp/digest/crc32c/crc32c.h>
+
+#include <util/generic/algorithm.h>
+#include <util/generic/buffer.h>
+#include <util/generic/vector.h>
+#include <util/network/address.h>
+#include <util/system/filemap.h>
+#include <util/system/unaligned_mem.h>
+
+using namespace NReverseGeocoder;
+
+static const TNumber CRC_SIZE = 3;
+
+void NReverseGeocoder::TGeoDataMap::Init() {
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ Var##_ = TVar();
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ Arr##_ = nullptr; \
+ Arr##Number_ = 0;
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+}
+
+NReverseGeocoder::TGeoDataMap::TGeoDataMap()
+ : Data_(nullptr)
+ , Size_(0)
+{
+ Init();
+}
+
+static bool CheckMemoryConsistency(const NProto::TGeoData& g) {
+ TVector<std::pair<intptr_t, intptr_t>> segments;
+
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ // undef
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ if (g.Get##Arr##Number() > 0) { \
+ intptr_t const beg = g.Get##Arr(); \
+ intptr_t const end = g.Get##Arr() + g.Get##Arr##Number() * sizeof(TArr); \
+ segments.emplace_back(beg, end); \
+ }
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ Sort(segments.begin(), segments.end());
+
+ for (size_t i = 0; i + 1 < segments.size(); ++i)
+ if (segments[i].second > segments[i + 1].first)
+ return false;
+
+ return true;
+}
+
+void NReverseGeocoder::TGeoDataMap::Remap() {
+ Init();
+
+ if (!Data_)
+ return;
+
+ const ui64 headerSize = ntohl(ReadUnaligned<ui64>(Data_));
+
+ NProto::TGeoData header;
+ if (!header.ParseFromArray(Data_ + sizeof(ui64), headerSize))
+ ythrow yexception() << "Unable parse geoData header";
+
+ if (header.GetMagic() != SYSTEM_ENDIAN_FLAG)
+ ythrow yexception() << "Different endianness in geoData and host";
+
+ if (!CheckMemoryConsistency(header))
+ ythrow yexception() << "Memory is not consistent!";
+
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ Var##_ = header.Get##Var();
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ GEO_BASE_DEF_VAR(TNumber, Arr##Number); \
+ if (Arr##Number() > 0) { \
+ const intptr_t offset = header.Get##Arr(); \
+ Arr##_ = (TArr*)(((intptr_t)Data_) + offset); \
+ const ui32 hash = Crc32c(Arr##_, std::min(Arr##Number_, CRC_SIZE) * sizeof(TArr)); \
+ if (hash != header.Get##Arr##Crc32()) \
+ ythrow yexception() << "Wrong crc32 for " << #Arr; \
+ }
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ if (Version() != GEO_DATA_CURRENT_VERSION)
+ ythrow yexception() << "Unable use version " << Version()
+ << "(current version is " << GEO_DATA_CURRENT_VERSION << ")";
+}
+
+static size_t HeaderSize() {
+ NProto::TGeoData header;
+ header.SetMagic(std::numeric_limits<decltype(header.GetMagic())>::max());
+
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ header.Set##Var(std::numeric_limits<decltype(header.Get##Var())>::max());
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ GEO_BASE_DEF_VAR(TNumber, Arr##Number); \
+ header.Set##Arr(std::numeric_limits<decltype(header.Get##Arr())>::max()); \
+ header.Set##Arr##Crc32(std::numeric_limits<decltype(header.Get##Arr##Crc32())>::max());
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ return header.ByteSize();
+}
+
+static const char* Serialize(const IGeoData& g, TBlockAllocator* allocator, size_t* size) {
+ size_t const preAllocatedSize = allocator->TotalAllocatedSize();
+ char* data = (char*)allocator->Allocate(HeaderSize() + sizeof(ui64));
+
+ NProto::TGeoData header;
+ header.SetMagic(SYSTEM_ENDIAN_FLAG);
+
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ header.Set##Var(g.Var());
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ GEO_BASE_DEF_VAR(TNumber, Arr##Number); \
+ if (g.Arr##Number() > 0) { \
+ TArr* arr = (TArr*)allocator->Allocate(sizeof(TArr) * g.Arr##Number()); \
+ memcpy(arr, g.Arr(), sizeof(TArr) * g.Arr##Number()); \
+ header.Set##Arr((ui64)(((intptr_t)arr) - ((intptr_t)data))); \
+ header.Set##Arr##Crc32(Crc32c(arr, std::min(g.Arr##Number(), CRC_SIZE) * sizeof(TArr))); \
+ };
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ const auto str = header.SerializeAsString();
+ WriteUnaligned<ui64>(data, (ui64)htonl(str.size()));
+ memcpy(data + sizeof(ui64), str.data(), str.size());
+
+ if (size)
+ *size = allocator->TotalAllocatedSize() - preAllocatedSize;
+
+ return data;
+}
+
+static size_t TotalByteSize(const IGeoData& g) {
+ size_t total_size = TBlockAllocator::AllocateSize(HeaderSize() + sizeof(ui64));
+
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+ // undef
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+ total_size += TBlockAllocator::AllocateSize(sizeof(TArr) * g.Arr##Number());
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ return total_size;
+}
+
+NReverseGeocoder::TGeoDataMap::TGeoDataMap(const IGeoData& geoData, TBlockAllocator* allocator)
+ : TGeoDataMap()
+{
+ Data_ = Serialize(geoData, allocator, &Size_);
+ Remap();
+}
+
+void NReverseGeocoder::TGeoDataMap::SerializeToFile(const TString& path, const IGeoData& data) {
+ TBlob data_blob = SerializeToBlob(data);
+
+ TFile file(path, CreateAlways | RdWr);
+ file.Write(data_blob.Data(), data_blob.Length());
+}
+
+TBlob NReverseGeocoder::TGeoDataMap::SerializeToBlob(const IGeoData& data) {
+ TBuffer buf;
+ buf.Resize(TotalByteSize(data));
+ memset(buf.data(), 0, buf.size());
+
+ TBlockAllocator allocator(buf.Data(), buf.Size());
+ TGeoDataMap(data, &allocator);
+
+ return TBlob::FromBuffer(buf);
+}
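TGeoDataMap serializes everything into one flat buffer: a length-prefixed protobuf header stores, for each array, its element count, its byte offset inside the buffer and a CRC of its first few elements; Remap() then only fixes up pointers into the same memory. A much-simplified standalone sketch of that layout, with a plain POD header instead of protobuf and hypothetical names:

    // Simplified flat-buffer layout in the spirit of TGeoDataMap: [header][points...], where the
    // header stores the count and byte offset of the array; "remapping" is pointer arithmetic only.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct Header {
        uint32_t PointsNumber;
        uint32_t PointsOffset; // byte offset of the array from the start of the buffer
    };

    std::vector<char> Serialize(const std::vector<int32_t>& points) {
        Header h{static_cast<uint32_t>(points.size()), sizeof(Header)};
        std::vector<char> buf(sizeof(Header) + points.size() * sizeof(int32_t));
        std::memcpy(buf.data(), &h, sizeof(h));
        std::memcpy(buf.data() + h.PointsOffset, points.data(), points.size() * sizeof(int32_t));
        return buf;
    }

    // "Remap": no copies, just interpret the buffer in place (the real code additionally checks
    // the CRCs, the version and that the array segments do not overlap).
    const int32_t* Remap(const std::vector<char>& buf, uint32_t* number) {
        Header h;
        std::memcpy(&h, buf.data(), sizeof(h));
        *number = h.PointsNumber;
        return reinterpret_cast<const int32_t*>(buf.data() + h.PointsOffset);
    }

    int main() {
        const auto buf = Serialize({10, 20, 30});
        uint32_t n = 0;
        const int32_t* p = Remap(buf, &n);
        for (uint32_t i = 0; i < n; ++i)
            std::printf("%d\n", p[i]);
    }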
diff --git a/library/cpp/reverse_geocoder/core/geo_data/map.h b/library/cpp/reverse_geocoder/core/geo_data/map.h
new file mode 100644
index 0000000000..e466bd912e
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/map.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include "geo_data.h"
+
+#include <library/cpp/reverse_geocoder/library/block_allocator.h>
+
+#include <util/memory/blob.h>
+
+namespace NReverseGeocoder {
+ class TGeoDataMap: public IGeoData, public TNonCopyable {
+#define GEO_BASE_DEF_VAR(TVar, Var) \
+public: \
+ const TVar& Var() const override { \
+ return Var##_; \
+ } \
+ \
+private: \
+ TVar Var##_;
+
+#define GEO_BASE_DEF_ARR(TArr, Arr) \
+public: \
+ const TArr* Arr() const override { \
+ return Arr##_; \
+ } \
+ TNumber Arr##Number() const override { \
+ return Arr##Number_; \
+ } \
+ \
+private: \
+ TNumber Arr##Number_; \
+ const TArr* Arr##_;
+
+ GEO_BASE_DEF_GEO_DATA
+
+#undef GEO_BASE_DEF_VAR
+#undef GEO_BASE_DEF_ARR
+
+ public:
+ TGeoDataMap();
+
+ static void SerializeToFile(const TString& path, const IGeoData& data);
+
+ static TBlob SerializeToBlob(const IGeoData& data);
+
+ TGeoDataMap(const IGeoData& data, TBlockAllocator* allocator);
+
+ TGeoDataMap(const char* data, size_t size)
+ : TGeoDataMap()
+ {
+ Data_ = data;
+ Size_ = size;
+ Remap();
+ }
+
+ TGeoDataMap(TGeoDataMap&& dat)
+ : TGeoDataMap()
+ {
+ DoSwap(Data_, dat.Data_);
+ DoSwap(Size_, dat.Size_);
+ Remap();
+ dat.Remap();
+ }
+
+ TGeoDataMap& operator=(TGeoDataMap&& dat) {
+ DoSwap(Data_, dat.Data_);
+ DoSwap(Size_, dat.Size_);
+ Remap();
+ dat.Remap();
+ return *this;
+ }
+
+ const char* Data() const {
+ return Data_;
+ }
+
+ size_t Size() const {
+ return Size_;
+ }
+
+ private:
+ void Init();
+
+ void Remap();
+
+ const char* Data_;
+ size_t Size_;
+ };
+
+}
diff --git a/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp b/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp
new file mode 100644
index 0000000000..5ff2d13783
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/proxy.cpp
@@ -0,0 +1 @@
+#include "proxy.h"
diff --git a/library/cpp/reverse_geocoder/core/geo_data/proxy.h b/library/cpp/reverse_geocoder/core/geo_data/proxy.h
new file mode 100644
index 0000000000..fecb9fc7cf
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/geo_data/proxy.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include "geo_data.h"
+#include "map.h"
+
+#include <util/generic/ptr.h>
+#include <util/system/filemap.h>
+
+namespace NReverseGeocoder {
+ class IGeoDataProxy {
+ public:
+ virtual const IGeoData* GeoData() const = 0;
+
+ virtual ~IGeoDataProxy() {
+ }
+ };
+
+ using TGeoDataProxyPtr = THolder<IGeoDataProxy>;
+
+ class TGeoDataMapProxy: public IGeoDataProxy, public TNonCopyable {
+ public:
+ explicit TGeoDataMapProxy(const char* path)
+ : MemFile_(path)
+ {
+ MemFile_.Map(0, MemFile_.Length());
+ GeoData_ = TGeoDataMap((const char*)MemFile_.Ptr(), MemFile_.MappedSize());
+ }
+
+ const IGeoData* GeoData() const override {
+ return &GeoData_;
+ }
+
+ private:
+ TFileMap MemFile_;
+ TGeoDataMap GeoData_;
+ };
+
+ class TGeoDataWrapper: public IGeoDataProxy, public TNonCopyable {
+ public:
+ explicit TGeoDataWrapper(const IGeoData& g)
+ : GeoData_(&g)
+ {
+ }
+
+ const IGeoData* GeoData() const override {
+ return GeoData_;
+ }
+
+ private:
+ const IGeoData* GeoData_;
+ };
+
+ class TGeoDataRawProxy: public IGeoDataProxy, public TNonCopyable {
+ public:
+ TGeoDataRawProxy(const char* data, size_t dataSize)
+ : GeoData_(data, dataSize)
+ {
+ }
+
+ const IGeoData* GeoData() const override {
+ return &GeoData_;
+ }
+
+ private:
+ TGeoDataMap GeoData_;
+ };
+
+}
diff --git a/library/cpp/reverse_geocoder/core/kv.cpp b/library/cpp/reverse_geocoder/core/kv.cpp
new file mode 100644
index 0000000000..a48e9c947e
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/kv.cpp
@@ -0,0 +1 @@
+#include "kv.h"
diff --git a/library/cpp/reverse_geocoder/core/kv.h b/library/cpp/reverse_geocoder/core/kv.h
new file mode 100644
index 0000000000..639c21de52
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/kv.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "common.h"
+
+namespace NReverseGeocoder {
+ // K and V are offsets of blobs in the geographical data blobs array. See geo_data.h
+ // for details.
+ struct TKv {
+ TNumber K;
+ TNumber V;
+ };
+
+}
diff --git a/library/cpp/reverse_geocoder/core/location.cpp b/library/cpp/reverse_geocoder/core/location.cpp
new file mode 100644
index 0000000000..b2d2f54d12
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/location.cpp
@@ -0,0 +1 @@
+#include "location.h"
diff --git a/library/cpp/reverse_geocoder/core/location.h b/library/cpp/reverse_geocoder/core/location.h
new file mode 100644
index 0000000000..5aa3198684
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/location.h
@@ -0,0 +1,21 @@
+#pragma once
+
+namespace NReverseGeocoder {
+ struct TLocation {
+ double Lon;
+ double Lat;
+
+ TLocation()
+ : Lon(0)
+ , Lat(0)
+ {
+ }
+
+ TLocation(double lon, double lat)
+ : Lon(lon)
+ , Lat(lat)
+ {
+ }
+ };
+
+}
diff --git a/library/cpp/reverse_geocoder/core/part.cpp b/library/cpp/reverse_geocoder/core/part.cpp
new file mode 100644
index 0000000000..c973d2171a
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/part.cpp
@@ -0,0 +1,29 @@
+#include "part.h"
+
+#include <library/cpp/reverse_geocoder/library/unaligned_iter.h>
+
+#include <util/generic/algorithm.h>
+
+using namespace NReverseGeocoder;
+
+bool NReverseGeocoder::TPart::Contains(const TPoint& point, TNumber edgeRefsNumber, const TRef* edgeRefs,
+ const TEdge* edges, const TPoint* points) const {
+ auto edgeRefsBegin = UnalignedIter(edgeRefs) + EdgeRefsOffset;
+ auto edgeRefsEnd = edgeRefsBegin + edgeRefsNumber;
+
+ // Find the lower bound edge that lies below the given point.
+ auto cmp = [&](const TRef& e, const TPoint& p) {
+ return edges[e].Lower(p, points);
+ };
+
+ auto edgeRef = LowerBound(edgeRefsBegin, edgeRefsEnd, point, cmp);
+
+ if (edgeRef == edgeRefsEnd)
+ return false;
+
+ if (edges[*edgeRef].Contains(point, points))
+ return true;
+
+ // If the point is inside the polygon, an odd number of edges lie below it.
+ return (edgeRef - edgeRefsBegin) % 2 == 1;
+}
diff --git a/library/cpp/reverse_geocoder/core/part.h b/library/cpp/reverse_geocoder/core/part.h
new file mode 100644
index 0000000000..9b24fee96f
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/part.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "common.h"
+#include "edge.h"
+#include "point.h"
+
+namespace NReverseGeocoder {
+ // TPart is one version of the persistent scanline. Parts lie in the geographical data parts
+ // array, ordered by Coordinate for each polygon. EdgeRefsOffset refers to the EdgeRefs array for
+ // this part. To save memory, a part does not store an "EdgeRefsNumber" field, because it can be
+ // computed as parts[i + 1].EdgeRefsOffset - parts[i].EdgeRefsOffset for every part in the
+ // geographical data; a fake part with the correct EdgeRefsOffset is added to IGeoData exactly for
+ // this purpose. Refs in EdgeRefs are in increasing order for each part, which makes it possible to
+ // quickly determine how many edges lie under a point. See generator/ for details.
+ struct Y_PACKED TPart {
+ TCoordinate Coordinate;
+ TNumber EdgeRefsOffset;
+
+ // Checks whether the point lies under an odd number of edges or on an edge.
+ bool Contains(const TPoint& point, TNumber edgeRefsNumber, const TRef* edgeRefs,
+ const TEdge* edges, const TPoint* points) const;
+ };
+
+ static_assert(sizeof(TPart) == 8, "NReverseGeocoder::TPart size mismatch");
+
+}
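TPart::Contains works per vertical slab: the slab's edge refs are ordered so that every edge lying below the point comes first, so a single lower bound gives the number of edges below the point, and odd parity means "inside". A standalone sketch of that parity test, simplified to plain sorted edge heights and hypothetical names:

    // Sketch of the per-slab parity test behind TPart::Contains: the lower-bound index is the
    // number of edges below the point, and odd parity means the point is inside the polygon
    // within this vertical slab.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Simplification: within one slab each edge is represented just by its Y at the query X
    // (in the library the comparator is edges[ref].Lower(point, points)).
    bool InsideSlab(const std::vector<int>& edgeYsSorted, int pointY) {
        const auto it = std::lower_bound(edgeYsSorted.begin(), edgeYsSorted.end(), pointY);
        const auto edgesBelow = it - edgeYsSorted.begin();
        return edgesBelow % 2 == 1;
    }

    int main() {
        // A slab of a polygon bounded by edges at y = 0 and y = 10.
        const std::vector<int> edges = {0, 10};
        assert(InsideSlab(edges, 5));   // between the two edges -> inside
        assert(!InsideSlab(edges, 15)); // above both edges -> outside
        assert(!InsideSlab(edges, -5)); // below both edges -> outside
    }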
diff --git a/library/cpp/reverse_geocoder/core/point.cpp b/library/cpp/reverse_geocoder/core/point.cpp
new file mode 100644
index 0000000000..396e27e596
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/point.cpp
@@ -0,0 +1 @@
+#include "point.h"
diff --git a/library/cpp/reverse_geocoder/core/point.h b/library/cpp/reverse_geocoder/core/point.h
new file mode 100644
index 0000000000..75f1dfc1b4
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/point.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "common.h"
+#include "location.h"
+
+namespace NReverseGeocoder {
+ struct Y_PACKED TPoint {
+ TCoordinate X;
+ TCoordinate Y;
+
+ TPoint()
+ : X(0)
+ , Y(0)
+ {
+ }
+
+ TPoint(const TCoordinate& x1, const TCoordinate& y1)
+ : X(x1)
+ , Y(y1)
+ {
+ }
+
+ explicit TPoint(const TLocation& l)
+ : X(ToCoordinate(l.Lon))
+ , Y(ToCoordinate(l.Lat))
+ {
+ }
+
+ TPoint operator-(const TPoint& p) const {
+ return TPoint(X - p.X, Y - p.Y);
+ }
+
+ bool operator==(const TPoint& b) const {
+ return X == b.X && Y == b.Y;
+ }
+
+ bool operator!=(const TPoint& b) const {
+ return X != b.X || Y != b.Y;
+ }
+
+ bool operator<(const TPoint& b) const {
+ return X < b.X || (X == b.X && Y < b.Y);
+ }
+
+ TSquare Cross(const TPoint& p) const {
+ return 1ll * X * p.Y - 1ll * Y * p.X;
+ }
+ };
+
+ static_assert(sizeof(TPoint) == 8, "NReverseGeocoder::TPoint size mismatch");
+
+}
diff --git a/library/cpp/reverse_geocoder/core/polygon.cpp b/library/cpp/reverse_geocoder/core/polygon.cpp
new file mode 100644
index 0000000000..2baac2d229
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/polygon.cpp
@@ -0,0 +1,91 @@
+#include "polygon.h"
+
+#include <util/generic/algorithm.h>
+
+using namespace NReverseGeocoder;
+
+static bool Check(const TPart* part, const TPoint& point, const TRef* edgeRefs,
+ const TEdge* edges, const TPoint* points) {
+ const TNumber edgeRefsNumber = (part + 1)->EdgeRefsOffset - part->EdgeRefsOffset;
+ return part->Contains(point, edgeRefsNumber, edgeRefs, edges, points);
+}
+
+bool NReverseGeocoder::TPolygon::Contains(const TPoint& point, const TPart* parts, const TRef* edgeRefs,
+ const TEdge* edges, const TPoint* points) const {
+ if (!Bbox.Contains(point))
+ return false;
+
+ parts += PartsOffset;
+ const TPart* partsEnd = parts + PartsNumber;
+
+ // Find the lower bound part that can contain the given point.
+ const TPart* part = LowerBound(parts, partsEnd, point, [&](const TPart& a, const TPoint& b) {
+ return a.Coordinate < b.X;
+ });
+
+ if (part->Coordinate > point.X) {
+ if (part == parts)
+ return false;
+ --part;
+ }
+
+ if (point.X < part->Coordinate || point.X > (part + 1)->Coordinate)
+ return false;
+
+ if (point.X == part->Coordinate)
+ if (part != parts && Check(part - 1, point, edgeRefs, edges, points))
+ return true;
+
+ return Check(part, point, edgeRefs, edges, points);
+}
+
+bool NReverseGeocoder::TPolygonBase::Better(const TPolygonBase& p, const TRegion* regions,
+ TNumber regionsNumber) const {
+ if (Square < p.Square)
+ return true;
+
+ if (Square == p.Square) {
+ const TRegion* begin = regions;
+ const TRegion* end = regions + regionsNumber;
+
+ const TRegion* r1 = LowerBound(begin, end, TGeoId(RegionId));
+ const TRegion* r2 = LowerBound(begin, end, TGeoId(p.RegionId));
+
+ if (r1 == end || r1->RegionId != RegionId)
+ return false;
+
+ if (r2 == end || r2->RegionId != p.RegionId)
+ return false;
+
+ return r1->Better(*r2);
+ }
+
+ return false;
+}
+
+bool NReverseGeocoder::TRawPolygon::Contains(const TPoint& point, const TRef* edgeRefs, const TEdge* edges,
+ const TPoint* points) const {
+ if (!Bbox.Contains(point))
+ return false;
+
+ edgeRefs += EdgeRefsOffset;
+
+ TNumber intersections = 0;
+ for (TNumber i = 0; i < EdgeRefsNumber; ++i) {
+ const TEdge& e = edges[edgeRefs[i]];
+
+ if (e.Contains(point, points))
+ return true;
+
+ TPoint a = points[e.Beg];
+ TPoint b = points[e.End];
+
+ if (a.X > b.X)
+ DoSwap(a, b);
+
+ if (a.X < point.X && b.X >= point.X && e.Lower(point, points))
+ ++intersections;
+ }
+
+ return intersections % 2 == 1;
+}
diff --git a/library/cpp/reverse_geocoder/core/polygon.h b/library/cpp/reverse_geocoder/core/polygon.h
new file mode 100644
index 0000000000..065bba1e38
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/polygon.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "bbox.h"
+#include "common.h"
+#include "edge.h"
+#include "part.h"
+#include "point.h"
+#include "region.h"
+
+namespace NReverseGeocoder {
+#pragma pack(push, 1)
+
+ struct TPolygonBase {
+ enum EType {
+ TYPE_UNKNOWN = 0,
+ TYPE_INNER = 1,
+ TYPE_OUTER = 2,
+ };
+
+ // If the type is TYPE_INNER and the polygon contains the given point, the region with RegionId
+ // does not contain the point.
+ EType Type;
+
+ ui32 Unused1;
+
+ // Geographical data identifiers.
+ TGeoId RegionId;
+ TGeoId PolygonId;
+
+ // Rectangle in which the polygon lies.
+ TBoundingBox Bbox;
+
+ // Square (area) of the polygon. Needed to determine which polygon is better; see the Better member function.
+ TSquare Square;
+
+ // Total number of points in the polygon.
+ TNumber PointsNumber;
+
+ // Checks that this polygon is better than the given one, which means that this polygon lies
+ // deeper than the given one in the polygon hierarchy.
+ bool Better(const TPolygonBase& p, const TRegion* regions, TNumber regionsNumber) const;
+ };
+
+ // Polygon is a representation of the persistent scanline data structure.
+ struct TPolygon: public TPolygonBase {
+ // Versions of persistent scanline.
+ TNumber PartsOffset;
+ TNumber PartsNumber;
+ ui32 Unused2;
+
+ // Fast point-in-polygon test using the persistent scanline. See generator/ for how this data
+ // structure is built.
+ bool Contains(const TPoint& point, const TPart* parts, const TRef* edgeRefs,
+ const TEdge* edges, const TPoint* points) const;
+ };
+
+ static_assert(sizeof(TPolygon) == 64, "NReverseGeocoder::TPolygon size mismatch");
+
+ // Raw polygon is a polygon representation for slow tests.
+ struct TRawPolygon: public TPolygonBase {
+ // Raw polygon edge refs.
+ TNumber EdgeRefsOffset;
+ TNumber EdgeRefsNumber;
+ ui32 Unused2;
+
+ bool Contains(const TPoint& point, const TRef* edgeRefs, const TEdge* edges,
+ const TPoint* points) const;
+ };
+
+ static_assert(sizeof(TRawPolygon) == 64, "NReverseGeocoder::TRawPolygon size mismatch");
+
+#pragma pack(pop)
+}
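TRawPolygon::Contains is the classic even-odd (ray casting) fallback: walk every edge, count how many of them pass below the point within its X range, and take the parity. A simplified standalone sketch with double coordinates and hypothetical names; the library works on fixed-point TPoint values and integer cross products instead:

    // Simplified even-odd point-in-polygon test in the spirit of TRawPolygon::Contains.
    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    struct P {
        double X, Y;
    };

    bool Contains(const std::vector<P>& poly, P p) {
        bool inside = false;
        for (std::size_t i = 0, j = poly.size() - 1; i < poly.size(); j = i++) {
            P a = poly[j], b = poly[i];
            if (a.X > b.X)
                std::swap(a, b);
            // Count edges whose X range covers p.X (half-open to avoid double-counting vertices)
            // and which pass below the point.
            if (a.X < p.X && b.X >= p.X) {
                const double yAtX = a.Y + (b.Y - a.Y) * (p.X - a.X) / (b.X - a.X);
                if (yAtX < p.Y)
                    inside = !inside;
            }
        }
        return inside;
    }

    int main() {
        const std::vector<P> square = {{0, 0}, {10, 0}, {10, 10}, {0, 10}};
        assert(Contains(square, {5, 5}));
        assert(!Contains(square, {15, 5}));
    }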
diff --git a/library/cpp/reverse_geocoder/core/region.cpp b/library/cpp/reverse_geocoder/core/region.cpp
new file mode 100644
index 0000000000..62b4acd0a1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/region.cpp
@@ -0,0 +1 @@
+#include "region.h"
diff --git a/library/cpp/reverse_geocoder/core/region.h b/library/cpp/reverse_geocoder/core/region.h
new file mode 100644
index 0000000000..4b010c7103
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/region.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "common.h"
+
+namespace NReverseGeocoder {
+ struct Y_PACKED TRegion {
+ TGeoId RegionId;
+ TNumber KvsOffset;
+ TNumber KvsNumber;
+ TSquare Square;
+ TNumber PolygonsNumber;
+ ui32 Unused;
+
+ bool operator==(const TRegion& r) const {
+ return RegionId == r.RegionId;
+ }
+
+ bool operator<(const TRegion& r) const {
+ return RegionId < r.RegionId;
+ }
+
+ bool operator<(const TGeoId& r) const {
+ return RegionId < r;
+ }
+
+ friend bool operator<(const TGeoId& regionId, const TRegion& r) {
+ return regionId < r.RegionId;
+ }
+
+ bool Better(const TRegion& r) const {
+ return Square < r.Square;
+ }
+ };
+
+ static_assert(sizeof(TRegion) == 32, "NReverseGeocoder::TRegion size mismatch");
+
+}
diff --git a/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp b/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp
new file mode 100644
index 0000000000..d73e4f2648
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/reverse_geocoder.cpp
@@ -0,0 +1,182 @@
+#include "reverse_geocoder.h"
+#include "geo_data/geo_data.h"
+
+#include <library/cpp/reverse_geocoder/library/unaligned_iter.h>
+
+#include <util/generic/algorithm.h>
+#include <util/system/unaligned_mem.h>
+
+using namespace NReverseGeocoder;
+
+static bool PolygonContains(const TPolygon& p, const TPoint& point, const IGeoData& geoData) {
+ const TPart* parts = geoData.Parts();
+ const TRef* edgeRefs = geoData.EdgeRefs();
+ const TEdge* edges = geoData.Edges();
+ const TPoint* points = geoData.Points();
+ return p.Contains(point, parts, edgeRefs, edges, points);
+}
+
+template <typename TAnswer>
+static void UpdateAnswer(const TAnswer** answer, const TAnswer& polygon,
+ const IGeoData& geoData) {
+ if (!*answer) {
+ *answer = &polygon;
+ } else {
+ const TRegion* regions = geoData.Regions();
+ const TNumber regionsNumber = geoData.RegionsNumber();
+ if (!(*answer)->Better(polygon, regions, regionsNumber))
+ *answer = &polygon;
+ }
+}
+
+static void SortDebug(TReverseGeocoder::TDebug* debug, const IGeoData& geoData) {
+ const TRegion* regions = geoData.Regions();
+ const TNumber regionsNumber = geoData.RegionsNumber();
+
+ auto cmp = [&](const TGeoId& a, const TGeoId& b) {
+ const TRegion* r1 = LowerBound(regions, regions + regionsNumber, a);
+ const TRegion* r2 = LowerBound(regions, regions + regionsNumber, b);
+ return r1->Better(*r2);
+ };
+
+ Sort(debug->begin(), debug->end(), cmp);
+}
+
+TGeoId NReverseGeocoder::TReverseGeocoder::Lookup(const TLocation& location, TDebug* debug) const {
+ const IGeoData& geoData = *GeoDataProxy_->GeoData();
+
+ if (debug)
+ debug->clear();
+
+ const TPoint point(location);
+ const TRef boxRef = LookupAreaBox(point);
+
+ if (boxRef >= geoData.BoxesNumber())
+ return UNKNOWN_GEO_ID;
+
+ const TNumber refsOffset = geoData.Boxes()[boxRef].PolygonRefsOffset;
+ const TNumber refsNumber = geoData.Boxes()[boxRef].PolygonRefsNumber;
+
+ const TPolygon* answer = nullptr;
+
+ const TPolygon* p = geoData.Polygons();
+ const auto refsBegin = UnalignedIter(geoData.PolygonRefs()) + refsOffset;
+ const auto refsEnd = refsBegin + refsNumber;
+
+ for (auto iterL = refsBegin, iterR = refsBegin; iterL < refsEnd; iterL = iterR) {
+ iterR = iterL + 1;
+
+ if (PolygonContains(p[*iterL], point, geoData)) {
+ if (p[*iterL].Type == TPolygon::TYPE_INNER) {
+ // All polygons with the same RegionId must be skipped if the polygon is inner.
+ // In geoData, small inner polygons are stored before big outer polygons.
+ while (iterR < refsEnd && p[*iterL].RegionId == p[*iterR].RegionId)
+ ++iterR;
+
+ } else {
+ UpdateAnswer(&answer, p[*iterL], geoData);
+
+ if (debug)
+ debug->push_back(p[*iterL].RegionId);
+
+ while (iterR < refsEnd && p[*iterL].RegionId == p[*iterR].RegionId)
+ ++iterR;
+ }
+ }
+ }
+
+ if (debug)
+ SortDebug(debug, geoData);
+
+ return answer ? answer->RegionId : UNKNOWN_GEO_ID;
+}
+
+TGeoId NReverseGeocoder::TReverseGeocoder::RawLookup(const TLocation& location, TDebug* debug) const {
+ const IGeoData& geoData = *GeoDataProxy_->GeoData();
+
+ if (debug)
+ debug->clear();
+
+ const TPoint point(location);
+
+ const TRawPolygon* borders = geoData.RawPolygons();
+ const TNumber bordersNumber = geoData.RawPolygonsNumber();
+
+ const TRawPolygon* answer = nullptr;
+
+ TNumber i = 0;
+ while (i < bordersNumber) {
+ if (borders[i].Contains(point, geoData.RawEdgeRefs(), geoData.Edges(), geoData.Points())) {
+ if (borders[i].Type == TRawPolygon::TYPE_INNER) {
+ TNumber j = i + 1;
+ while (j < bordersNumber && borders[i].RegionId == borders[j].RegionId)
+ ++j;
+
+ i = j;
+
+ } else {
+ UpdateAnswer(&answer, borders[i], geoData);
+
+ if (debug)
+ debug->push_back(borders[i].RegionId);
+
+ TNumber j = i + 1;
+ while (j < bordersNumber && borders[i].RegionId == borders[j].RegionId)
+ ++j;
+
+ i = j;
+ }
+ } else {
+ ++i;
+ }
+ }
+
+ if (debug)
+ SortDebug(debug, geoData);
+
+ return answer ? answer->RegionId : UNKNOWN_GEO_ID;
+}
+
+bool NReverseGeocoder::TReverseGeocoder::EachKv(TGeoId regionId, TKvCallback callback) const {
+ const IGeoData& g = *GeoDataProxy_->GeoData();
+
+ const TRegion* begin = g.Regions();
+ const TRegion* end = begin + g.RegionsNumber();
+
+ const TRegion* region = LowerBound(begin, end, regionId);
+
+ if (region == end || region->RegionId != regionId)
+ return false;
+
+ const TKv* kvs = g.Kvs() + region->KvsOffset;
+ const char* blobs = g.Blobs();
+
+ for (TNumber i = 0; i < region->KvsNumber; ++i) {
+ const char* k = blobs + kvs[i].K;
+ const char* v = blobs + kvs[i].V;
+ callback(k, v);
+ }
+
+ return true;
+}
+
+void NReverseGeocoder::TReverseGeocoder::EachPolygon(TPolygonCallback callback) const {
+ const IGeoData& g = *GeoDataProxy_->GeoData();
+
+ for (TNumber i = 0; i < g.PolygonsNumber(); ++i)
+ callback(g.Polygons()[i]);
+}
+
+void NReverseGeocoder::TReverseGeocoder::EachPart(const TPolygon& polygon, TPartCallback callback) const {
+ const IGeoData& g = *GeoDataProxy_->GeoData();
+
+ const TNumber partsOffset = polygon.PartsOffset;
+ const TNumber partsNumber = polygon.PartsNumber;
+
+ for (TNumber i = partsOffset; i < partsOffset + partsNumber; ++i) {
+ const TPart& part = g.Parts()[i];
+ const TPart& npart = g.Parts()[i + 1];
+ const TNumber edgeRefsNumber = npart.EdgeRefsOffset - part.EdgeRefsOffset;
+ callback(part, edgeRefsNumber);
+ }
+}
diff --git a/library/cpp/reverse_geocoder/core/reverse_geocoder.h b/library/cpp/reverse_geocoder/core/reverse_geocoder.h
new file mode 100644
index 0000000000..c74eddb40e
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/reverse_geocoder.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "common.h"
+#include "geo_data/geo_data.h"
+#include "geo_data/proxy.h"
+
+#include <util/generic/noncopyable.h>
+#include <util/generic/vector.h>
+
+#include <functional>
+
+namespace NReverseGeocoder {
+ const TGeoId UNKNOWN_GEO_ID = static_cast<TGeoId>(-1);
+
+ // NOTE: Be careful! It works fine and fast on real-world datasets, but in theory it can spend
+ // O(n^2) memory (on real-world datasets it is just about 6n). The point-in-polygon test is always
+ // O(log n). Memory usage will be O(n) in the future!
+ class TReverseGeocoder: public TNonCopyable {
+ public:
+ using TDebug = TVector<TGeoId>;
+ using TKvCallback = std::function<void(const char*, const char*)>;
+ using TPolygonCallback = std::function<void(const TPolygon&)>;
+ using TPartCallback = std::function<void(const TPart&, TNumber)>;
+
+ TReverseGeocoder()
+ : GeoDataProxy_()
+ {
+ }
+
+ TReverseGeocoder(TReverseGeocoder&& g)
+ : GeoDataProxy_()
+ {
+ DoSwap(GeoDataProxy_, g.GeoDataProxy_);
+ }
+
+ TReverseGeocoder& operator=(TReverseGeocoder&& g) {
+ DoSwap(GeoDataProxy_, g.GeoDataProxy_);
+ return *this;
+ }
+
+ explicit TReverseGeocoder(const char* path)
+ : GeoDataProxy_(new TGeoDataMapProxy(path))
+ {
+ }
+
+ explicit TReverseGeocoder(const IGeoData& geoData)
+ : GeoDataProxy_(new TGeoDataWrapper(geoData))
+ {
+ }
+
+ TReverseGeocoder(const char* data, size_t dataSize)
+ : GeoDataProxy_(new TGeoDataRawProxy(data, dataSize))
+ {
+ }
+
+ TGeoId Lookup(const TLocation& location, TDebug* debug = nullptr) const;
+
+ TGeoId RawLookup(const TLocation& location, TDebug* debug = nullptr) const;
+
+ bool EachKv(TGeoId regionId, TKvCallback callback) const;
+
+ void EachPolygon(TPolygonCallback callback) const;
+
+ void EachPart(const TPolygon& polygon, TPartCallback callback) const;
+
+ const IGeoData& GeoData() const {
+ return *GeoDataProxy_->GeoData();
+ }
+
+ private:
+ TGeoDataProxyPtr GeoDataProxy_;
+ };
+}
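A hedged usage sketch of the public API declared above, assuming a pre-generated geodata file at a hypothetical path; only calls visible in this header (the path constructor, Lookup, EachKv) plus Cout from util are used.

    // Usage sketch (hypothetical file path); relies only on the API declared in reverse_geocoder.h.
    #include <library/cpp/reverse_geocoder/core/reverse_geocoder.h>

    #include <util/stream/output.h>

    int main() {
        using namespace NReverseGeocoder;

        TReverseGeocoder geocoder("/path/to/geodata.bin"); // hypothetical pre-generated data file

        const TGeoId regionId = geocoder.Lookup(TLocation(37.617698, 55.755864)); // lon, lat
        if (regionId == UNKNOWN_GEO_ID) {
            Cout << "Not found" << Endl;
            return 0;
        }

        Cout << "Region id: " << regionId << Endl;
        geocoder.EachKv(regionId, [](const char* key, const char* value) {
            Cout << "  " << key << " = " << value << Endl;
        });
        return 0;
    }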
diff --git a/library/cpp/reverse_geocoder/core/ya.make b/library/cpp/reverse_geocoder/core/ya.make
new file mode 100644
index 0000000000..9f7dc67464
--- /dev/null
+++ b/library/cpp/reverse_geocoder/core/ya.make
@@ -0,0 +1,28 @@
+LIBRARY()
+
+PEERDIR(
+ library/cpp/reverse_geocoder/library
+ library/cpp/reverse_geocoder/proto
+ library/cpp/digest/crc32c
+)
+
+SRCS(
+ area_box.cpp
+ bbox.cpp
+ common.cpp
+ edge.cpp
+ reverse_geocoder.cpp
+ kv.cpp
+ location.cpp
+ part.cpp
+ point.cpp
+ polygon.cpp
+ region.cpp
+ geo_data/debug.cpp
+ geo_data/def.cpp
+ geo_data/geo_data.cpp
+ geo_data/map.cpp
+ geo_data/proxy.cpp
+)
+
+END()
diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.darwin-x86_64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..f82b4b8cd1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,21 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-library)
+target_link_libraries(cpp-reverse_geocoder-library PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-reverse_geocoder-library PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
+)
diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.linux-aarch64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..4b45fce452
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,22 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-library)
+target_link_libraries(cpp-reverse_geocoder-library PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-reverse_geocoder-library PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
+)
diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.linux-x86_64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..4b45fce452
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,22 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-library)
+target_link_libraries(cpp-reverse_geocoder-library PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-reverse_geocoder-library PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
+)
diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.txt b/library/cpp/reverse_geocoder/library/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/reverse_geocoder/library/CMakeLists.windows-x86_64.txt b/library/cpp/reverse_geocoder/library/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..f82b4b8cd1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,21 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-reverse_geocoder-library)
+target_link_libraries(cpp-reverse_geocoder-library PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+)
+target_sources(cpp-reverse_geocoder-library PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/block_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/fs.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/log.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/pool_allocator.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
+)
diff --git a/library/cpp/reverse_geocoder/library/block_allocator.cpp b/library/cpp/reverse_geocoder/library/block_allocator.cpp
new file mode 100644
index 0000000000..56f61dc566
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/block_allocator.cpp
@@ -0,0 +1,40 @@
+#include "block_allocator.h"
+
+using namespace NReverseGeocoder;
+
+static size_t const MEMORY_IS_USED_FLAG = ~0ull;
+static size_t const SIZEOF_SIZE = AlignMemory(sizeof(size_t));
+
+void* NReverseGeocoder::TBlockAllocator::Allocate(size_t number) {
+ number = AlignMemory(number);
+ if (BytesAllocated_ + number + SIZEOF_SIZE > BytesLimit_)
+ ythrow yexception() << "Unable allocate memory";
+ char* begin = ((char*)Data_) + BytesAllocated_;
+ char* end = begin + number;
+ *((size_t*)end) = MEMORY_IS_USED_FLAG;
+ BytesAllocated_ += number + SIZEOF_SIZE;
+ return begin;
+}
+
+size_t NReverseGeocoder::TBlockAllocator::AllocateSize(size_t number) {
+ return AlignMemory(number) + SIZEOF_SIZE;
+}
+
+static void RelaxBlock(char* begin, size_t* number) {
+ while (*number > 0) {
+ char* ptr = begin + *number - SIZEOF_SIZE;
+ if (*((size_t*)ptr) == MEMORY_IS_USED_FLAG)
+ return;
+ *number -= *((size_t*)ptr) + SIZEOF_SIZE;
+ }
+}
+
+void NReverseGeocoder::TBlockAllocator::Deallocate(void* ptr, size_t number) {
+ number = AlignMemory(number);
+ char* begin = (char*)ptr;
+ char* end = begin + number;
+ if (*((size_t*)end) != MEMORY_IS_USED_FLAG)
+ ythrow yexception() << "Trying to deallocate not allocated pointer " << ptr;
+ *((size_t*)end) = number;
+ RelaxBlock((char*)Data_, &BytesAllocated_);
+}
diff --git a/library/cpp/reverse_geocoder/library/block_allocator.h b/library/cpp/reverse_geocoder/library/block_allocator.h
new file mode 100644
index 0000000000..1189d6b25c
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/block_allocator.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include "memory.h"
+
+#include <util/generic/yexception.h>
+
+namespace NReverseGeocoder {
+ class TBlockAllocator: public TNonCopyable {
+ public:
+ TBlockAllocator()
+ : Data_(nullptr)
+ , BytesAllocated_(0)
+ , BytesLimit_(0)
+ {
+ }
+
+ TBlockAllocator(void* data, size_t bytesLimit)
+ : Data_(data)
+ , BytesAllocated_(0)
+ , BytesLimit_(bytesLimit)
+ {
+ }
+
+ TBlockAllocator(TBlockAllocator&& a)
+ : TBlockAllocator()
+ {
+ DoSwap(Data_, a.Data_);
+ DoSwap(BytesAllocated_, a.BytesAllocated_);
+ DoSwap(BytesLimit_, a.BytesLimit_);
+ }
+
+ TBlockAllocator& operator=(TBlockAllocator&& a) {
+ DoSwap(Data_, a.Data_);
+ DoSwap(BytesAllocated_, a.BytesAllocated_);
+ DoSwap(BytesLimit_, a.BytesLimit_);
+ return *this;
+ }
+
+ virtual ~TBlockAllocator() {
+ }
+
+ virtual void* Allocate(size_t number);
+
+ static size_t AllocateSize(size_t number);
+
+ virtual void Deallocate(void* ptr, size_t number);
+
+ size_t TotalAllocatedSize() const {
+ return BytesAllocated_;
+ }
+
+ void Setup(void* data, size_t bytesLimit) {
+ Data_ = data;
+ BytesLimit_ = bytesLimit;
+ BytesAllocated_ = 0;
+ }
+
+ private:
+ void* Data_;
+ size_t BytesAllocated_;
+ size_t BytesLimit_;
+ };
+
+}
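TBlockAllocator is a bump allocator over a caller-provided buffer: Allocate rounds the size up to the memory alignment and appends a marker word after each block, and Deallocate overwrites the marker with the block size so that trailing freed blocks can be popped. A simplified standalone sketch of the bump part only (marker bookkeeping omitted, hypothetical names):

    // Simplified bump allocator in the spirit of TBlockAllocator (no deallocation bookkeeping).
    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>
    #include <vector>

    constexpr std::size_t MemoryAlignment = 16;

    inline std::size_t AlignMemory(std::size_t x) {
        return (x + MemoryAlignment - 1) / MemoryAlignment * MemoryAlignment;
    }

    class BumpAllocator {
    public:
        BumpAllocator(void* data, std::size_t limit)
            : Data_(static_cast<char*>(data)), Allocated_(0), Limit_(limit) {
        }

        void* Allocate(std::size_t bytes) {
            bytes = AlignMemory(bytes);
            if (Allocated_ + bytes > Limit_)
                throw std::runtime_error("unable to allocate memory");
            void* result = Data_ + Allocated_;
            Allocated_ += bytes;
            return result;
        }

        std::size_t TotalAllocatedSize() const {
            return Allocated_;
        }

    private:
        char* Data_;
        std::size_t Allocated_;
        std::size_t Limit_;
    };

    int main() {
        std::vector<char> buffer(1024);
        BumpAllocator allocator(buffer.data(), buffer.size());
        allocator.Allocate(10); // rounded up to 16
        allocator.Allocate(40); // rounded up to 48
        std::printf("%zu\n", allocator.TotalAllocatedSize()); // 64
    }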
diff --git a/library/cpp/reverse_geocoder/library/fs.cpp b/library/cpp/reverse_geocoder/library/fs.cpp
new file mode 100644
index 0000000000..98c3b9ef81
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/fs.cpp
@@ -0,0 +1,18 @@
+#include "fs.h"
+
+#include <util/folder/dirut.h>
+#include <util/string/split.h>
+
+namespace NReverseGeocoder {
+ TVector<TString> GetDataFilesList(const char* input) {
+ if (IsDir(input)) {
+ return GetFileListInDirectory<TVector<TString>>(input);
+ }
+
+ TVector<TString> result;
+ for (const auto& partIt : StringSplitter(input).Split(',')) {
+ result.push_back(TString(partIt.Token()));
+ }
+ return result;
+ }
+}
diff --git a/library/cpp/reverse_geocoder/library/fs.h b/library/cpp/reverse_geocoder/library/fs.h
new file mode 100644
index 0000000000..4435f960c8
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/fs.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <util/folder/iterator.h>
+#include <util/string/vector.h>
+
+namespace NReverseGeocoder {
+ template <typename Cont>
+ Cont GetFileListInDirectory(const char* dirName) {
+ TDirIterator dirIt(dirName, TDirIterator::TOptions(FTS_LOGICAL));
+ Cont dirContent;
+ for (auto file = dirIt.begin(); file != dirIt.end(); ++file) {
+ if (strcmp(file->fts_path, dirName))
+ dirContent.push_back(file->fts_path);
+ }
+ return dirContent;
+ }
+
+ TVector<TString> GetDataFilesList(const char* input);
+}
diff --git a/library/cpp/reverse_geocoder/library/log.cpp b/library/cpp/reverse_geocoder/library/log.cpp
new file mode 100644
index 0000000000..44e6ddf287
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/log.cpp
@@ -0,0 +1,111 @@
+#include "log.h"
+
+#include <util/datetime/systime.h>
+#include <util/generic/yexception.h>
+#include <util/system/guard.h>
+#include <util/system/mutex.h>
+
+using namespace NReverseGeocoder;
+
+static size_t const TIMESTAMP_LIMIT = 32;
+
+class TLogger {
+public:
+ static TLogger& Inst() {
+ static TLogger logger;
+ return logger;
+ }
+
+ void Setup(IOutputStream& out, ELogLevel level) {
+ Out_ = &out;
+ Level_ = level;
+ }
+
+ void Write(ELogLevel level, const char* message) {
+ if (level <= Level_) {
+ TGuard<TMutex> Lock(Lock_);
+ Out_->Write(message, strlen(message));
+ }
+ }
+
+ IOutputStream& OutputStream() const {
+ return *Out_;
+ }
+
+ ELogLevel Level() const {
+ return Level_;
+ }
+
+private:
+ TLogger()
+ : Out_()
+ , Level_(LOG_LEVEL_DISABLE)
+ {
+ }
+
+ IOutputStream* Out_;
+ ELogLevel Level_;
+ TMutex Lock_;
+};
+
+ELogLevel NReverseGeocoder::LogLevel() {
+ return TLogger::Inst().Level();
+}
+
+void NReverseGeocoder::LogSetup(IOutputStream& out, ELogLevel level) {
+ TLogger::Inst().Setup(out, level);
+}
+
+IOutputStream& NReverseGeocoder::LogOutputStream() {
+ return TLogger::Inst().OutputStream();
+}
+
+static const char* T(char* buffer) {
+ struct timeval timeVal;
+ gettimeofday(&timeVal, nullptr);
+
+ struct tm timeInfo;
+ const time_t sec = timeVal.tv_sec;
+ localtime_r(&sec, &timeInfo);
+
+ snprintf(buffer, TIMESTAMP_LIMIT, "%02d:%02d:%02d.%06d",
+ timeInfo.tm_hour, timeInfo.tm_min, timeInfo.tm_sec, (int)timeVal.tv_usec);
+
+ return buffer;
+}
+
+void NReverseGeocoder::LogWrite(ELogLevel level, const char* message) {
+ if (level > LogLevel())
+ return;
+
+ static const char* A[LOG_LEVEL_COUNT] = {
+ "", // LOG_LEVEL_DISABLE
+ "\033[90m", // LOG_LEVEL_ERROR
+ "\033[90m", // LOG_LEVEL_WARNING
+ "\033[90m", // LOG_LEVEL_INFO
+ "\033[90m", // LOG_LEVEL_DEBUG
+ };
+
+ static const char* B[LOG_LEVEL_COUNT] = {
+ "", // LOG_LEVEL_DISABLE
+ "\033[31;1mError\033[0m", // LOG_LEVEL_ERROR
+ "\033[33;1mWarn\033[0m", // LOG_LEVEL_WARNING
+ "\033[32;1mInfo\033[0m", // LOG_LEVEL_INFO
+ "Debug", // LOG_LEVEL_DEBUG
+ };
+
+ static const char* C[LOG_LEVEL_COUNT] = {
+ "", // LOG_LEVEL_DISABLE
+ "\n", // LOG_LEVEL_ERROR
+ "\n", // LOG_LEVEL_WARNING
+ "\n", // LOG_LEVEL_INFO
+ "\033[0m\n", // LOG_LEVEL_DEBUG
+ };
+
+ char buffer[LOG_MESSAGE_LIMIT], tbuffer[TIMESTAMP_LIMIT];
+ // Ignore logger snprintf errors.
+ snprintf(buffer, LOG_MESSAGE_LIMIT, "%s(%s) %s: %s%s",
+ A[level], T(tbuffer), B[level], message, C[level]);
+
+ TLogger::Inst().Write(level, buffer);
+}
diff --git a/library/cpp/reverse_geocoder/library/log.h b/library/cpp/reverse_geocoder/library/log.h
new file mode 100644
index 0000000000..44cb0cefcf
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/log.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <util/generic/yexception.h>
+#include <util/stream/output.h>
+
+#include <cstdio>
+
+namespace NReverseGeocoder {
+ size_t const LOG_MESSAGE_LIMIT = 1024;
+
+ enum ELogLevel {
+ LOG_LEVEL_DISABLE = 0,
+ LOG_LEVEL_ERROR,
+ LOG_LEVEL_WARNING,
+ LOG_LEVEL_INFO,
+ LOG_LEVEL_DEBUG,
+ LOG_LEVEL_COUNT
+ };
+
+ // Initialize the logger: set the output stream and log level.
+ void LogSetup(IOutputStream& out, ELogLevel level);
+
+ // Write log message with colors, level and current time.
+ // Example:
+ // (13:24:11.123456) Info: Good job!
+ // (13:24:11.323456) Warn: Ooops :(
+ // (13:24:22.456789) Error: Hello, world!
+ void LogWrite(ELogLevel level, const char* message);
+
+ // Log output stream.
+ IOutputStream& LogOutputStream();
+
+ // Current log level.
+ ELogLevel LogLevel();
+
+ template <typename... TArgs>
+ void LogWrite(ELogLevel level, const char* fmt, TArgs... args) {
+ if (level <= LogLevel()) {
+ char buffer[LOG_MESSAGE_LIMIT];
+ // Ignore logger snprintf errors.
+ snprintf(buffer, LOG_MESSAGE_LIMIT, fmt, std::forward<TArgs>(args)...);
+ LogWrite(level, buffer);
+ }
+ }
+
+ template <typename... TArgs>
+ void LogError(TArgs... args) {
+ LogWrite(LOG_LEVEL_ERROR, std::forward<TArgs>(args)...);
+ }
+
+ template <typename... TArgs>
+ void LogWarning(TArgs... args) {
+ LogWrite(LOG_LEVEL_WARNING, std::forward<TArgs>(args)...);
+ }
+
+ template <typename... TArgs>
+ void LogInfo(TArgs... args) {
+ LogWrite(LOG_LEVEL_INFO, std::forward<TArgs>(args)...);
+ }
+
+ template <typename... TArgs>
+ void LogDebug(TArgs... args) {
+ LogWrite(LOG_LEVEL_DEBUG, std::forward<TArgs>(args)...);
+ }
+}
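A hedged usage sketch of the logging helpers declared above: set the sink once with LogSetup, then use the printf-style wrappers. Only functions from this header plus Cerr from util are assumed; the example path is hypothetical.

    // Usage sketch; relies only on the API declared in log.h (format strings go through snprintf).
    #include <library/cpp/reverse_geocoder/library/log.h>

    #include <util/stream/output.h>

    int main() {
        using namespace NReverseGeocoder;

        LogSetup(Cerr, LOG_LEVEL_DEBUG);

        LogInfo("geo data loaded, %d polygons", 12345);
        LogWarning("box %u is empty", 42u);
        LogError("unable to open %s", "/path/to/geodata.bin"); // hypothetical path
        return 0;
    }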
diff --git a/library/cpp/reverse_geocoder/library/memory.h b/library/cpp/reverse_geocoder/library/memory.h
new file mode 100644
index 0000000000..ecbe8bcd66
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/memory.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include <util/system/types.h>
+
+namespace NReverseGeocoder {
+ constexpr ui64 B = 1ull;
+ constexpr ui64 KB = 1024 * B;
+ constexpr ui64 MB = 1024 * KB;
+ constexpr ui64 GB = 1024 * MB;
+
+ constexpr size_t MEMORY_ALIGNMENT = 16ull;
+
+ inline unsigned long long AlignMemory(unsigned long long x) {
+ if (x % MEMORY_ALIGNMENT == 0)
+ return x;
+ return x + MEMORY_ALIGNMENT - x % MEMORY_ALIGNMENT;
+ }
+
+ inline bool IsAlignedMemory(void* ptr) {
+ return ((uintptr_t)ptr) % MEMORY_ALIGNMENT == 0;
+ }
+
+}
diff --git a/library/cpp/reverse_geocoder/library/pool_allocator.cpp b/library/cpp/reverse_geocoder/library/pool_allocator.cpp
new file mode 100644
index 0000000000..0d841f7db0
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/pool_allocator.cpp
@@ -0,0 +1,17 @@
+#include "memory.h"
+#include "pool_allocator.h"
+
+#include <util/generic/yexception.h>
+
+using namespace NReverseGeocoder;
+
+NReverseGeocoder::TPoolAllocator::TPoolAllocator(size_t poolSize) {
+ Ptr_ = new char[poolSize];
+ Size_ = poolSize;
+ Setup(Ptr_, Size_);
+}
+
+NReverseGeocoder::TPoolAllocator::~TPoolAllocator() {
+ if (Ptr_)
+ delete[] Ptr_;
+}
diff --git a/library/cpp/reverse_geocoder/library/pool_allocator.h b/library/cpp/reverse_geocoder/library/pool_allocator.h
new file mode 100644
index 0000000000..f98bbcd3c1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/pool_allocator.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "block_allocator.h"
+
+#include <util/generic/utility.h>
+#include <util/generic/noncopyable.h>
+
+namespace NReverseGeocoder {
+ class TPoolAllocator: public TBlockAllocator {
+ public:
+ TPoolAllocator()
+ : Ptr_(nullptr)
+ , Size_(0)
+ {
+ }
+
+ TPoolAllocator(TPoolAllocator&& a)
+ : TBlockAllocator(std::forward<TBlockAllocator>(a))
+ , Ptr_(nullptr)
+ , Size_(0)
+ {
+ DoSwap(Ptr_, a.Ptr_);
+ DoSwap(Size_, a.Size_);
+ }
+
+ TPoolAllocator& operator=(TPoolAllocator&& a) {
+ TBlockAllocator::operator=(std::forward<TBlockAllocator>(a));
+ DoSwap(Ptr_, a.Ptr_);
+ DoSwap(Size_, a.Size_);
+ return *this;
+ }
+
+ explicit TPoolAllocator(size_t poolSize);
+
+ ~TPoolAllocator() override;
+
+ private:
+ char* Ptr_;
+ size_t Size_;
+ };
+
+}
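A small sketch showing that TPoolAllocator owns its arena and is movable via the swap-based move operations above. Only the constructors and move operations visible in this header (plus the MB constant from memory.h) are used; the actual allocation interface lives in TBlockAllocator, which is not shown here, so it is deliberately left out.

    #include <library/cpp/reverse_geocoder/library/memory.h>
    #include <library/cpp/reverse_geocoder/library/pool_allocator.h>

    #include <util/generic/vector.h>

    #include <utility>

    void PoolAllocatorSketch() {
        using namespace NReverseGeocoder;
        TPoolAllocator pool(1 * MB);      // owns a 1 MB arena
        TVector<TPoolAllocator> pools;
        pools.push_back(std::move(pool)); // ownership of the arena transfers via the swap-based move
    }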
diff --git a/library/cpp/reverse_geocoder/library/system.h b/library/cpp/reverse_geocoder/library/system.h
new file mode 100644
index 0000000000..499fb2bd91
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/system.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#define SYSTEM_ENDIAN_FLAG (htonl(337))
diff --git a/library/cpp/reverse_geocoder/library/unaligned_iter.cpp b/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
new file mode 100644
index 0000000000..0322b677dc
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/unaligned_iter.cpp
@@ -0,0 +1 @@
+#include "unaligned_iter.h"
diff --git a/library/cpp/reverse_geocoder/library/unaligned_iter.h b/library/cpp/reverse_geocoder/library/unaligned_iter.h
new file mode 100644
index 0000000000..827a3e2fd2
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/unaligned_iter.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include <util/system/unaligned_mem.h>
+#include <iterator>
+
+namespace NReverseGeocoder {
+ /**
+ * Random-access iterator over a read-only memory range
+ * of trivially copyable items that may not be properly aligned.
+ *
+ * Dereferencing returns a copy of the item, not a reference,
+ * so make sure that sizeof(T) is small enough.
+ *
+ * The iterator is useful with LowerBound/UpperBound-style STL algorithms.
+ */
+ template <class T>
+ class TUnalignedIter: public std::iterator<std::random_access_iterator_tag, T> {
+ public:
+ using TSelf = TUnalignedIter<T>;
+
+ explicit TUnalignedIter(const T* ptr)
+ : Ptr(ptr)
+ {
+ }
+
+ T operator*() const {
+ return ReadUnaligned<T>(Ptr);
+ }
+
+ bool operator==(TSelf other) const {
+ return Ptr == other.Ptr;
+ }
+
+ bool operator<(TSelf other) const {
+ return Ptr < other.Ptr;
+ }
+
+ TSelf operator+(ptrdiff_t delta) const {
+ return TSelf{Ptr + delta};
+ }
+
+ ptrdiff_t operator-(TSelf other) const {
+ return Ptr - other.Ptr;
+ }
+
+ TSelf& operator+=(ptrdiff_t delta) {
+ Ptr += delta;
+ return *this;
+ }
+
+ TSelf& operator++() {
+ ++Ptr;
+ return *this;
+ }
+
+ private:
+ const T* Ptr;
+ };
+
+ template <class T>
+ TUnalignedIter<T> UnalignedIter(const T* ptr) {
+ return TUnalignedIter<T>(ptr);
+ }
+}
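A sketch of the LowerBound/UpperBound use case mentioned in the comment above: binary search over a sorted run of ui32 values that starts at an arbitrary byte offset. The helper name and buffer layout are invented for illustration; std::lower_bound is used here, and the LowerBound wrapper from util should behave the same way.

    #include <library/cpp/reverse_geocoder/library/unaligned_iter.h>

    #include <util/system/types.h>

    #include <algorithm>

    // Looks up `value` among `count` sorted ui32 items starting at `data`;
    // `data` does not have to be 4-byte aligned.
    bool ContainsUnaligned(const char* data, size_t count, ui32 value) {
        using namespace NReverseGeocoder;
        const ui32* begin = reinterpret_cast<const ui32*>(data);
        auto first = UnalignedIter(begin);
        auto last = UnalignedIter(begin + count);
        auto it = std::lower_bound(first, last, value);
        return !(it == last) && *it == value;
    }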
diff --git a/library/cpp/reverse_geocoder/library/ya.make b/library/cpp/reverse_geocoder/library/ya.make
new file mode 100644
index 0000000000..ec2eb205a8
--- /dev/null
+++ b/library/cpp/reverse_geocoder/library/ya.make
@@ -0,0 +1,11 @@
+LIBRARY()
+
+SRCS(
+ block_allocator.cpp
+ fs.cpp
+ log.cpp
+ pool_allocator.cpp
+ unaligned_iter.cpp
+)
+
+END()
diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.darwin-x86_64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..8d1df0fdf8
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,56 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-reverse_geocoder-proto)
+target_link_libraries(cpp-reverse_geocoder-proto PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-reverse_geocoder-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto
+)
+target_proto_addincls(cpp-reverse_geocoder-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-reverse_geocoder-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.linux-aarch64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..b53c1692ee
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,57 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-reverse_geocoder-proto)
+target_link_libraries(cpp-reverse_geocoder-proto PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-reverse_geocoder-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto
+)
+target_proto_addincls(cpp-reverse_geocoder-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-reverse_geocoder-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.linux-x86_64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..b53c1692ee
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,57 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-reverse_geocoder-proto)
+target_link_libraries(cpp-reverse_geocoder-proto PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-reverse_geocoder-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto
+)
+target_proto_addincls(cpp-reverse_geocoder-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-reverse_geocoder-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/reverse_geocoder/proto/CMakeLists.windows-x86_64.txt b/library/cpp/reverse_geocoder/proto/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..8d1df0fdf8
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,56 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+get_built_tool_path(
+ TOOL_protoc_bin
+ TOOL_protoc_dependency
+ contrib/tools/protoc/bin
+ protoc
+)
+get_built_tool_path(
+ TOOL_cpp_styleguide_bin
+ TOOL_cpp_styleguide_dependency
+ contrib/tools/protoc/plugins/cpp_styleguide
+ cpp_styleguide
+)
+
+add_library(cpp-reverse_geocoder-proto)
+target_link_libraries(cpp-reverse_geocoder-proto PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ contrib-libs-protobuf
+)
+target_proto_messages(cpp-reverse_geocoder-proto PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/geo_data.proto
+ ${CMAKE_SOURCE_DIR}/library/cpp/reverse_geocoder/proto/region.proto
+)
+target_proto_addincls(cpp-reverse_geocoder-proto
+ ./
+ ${CMAKE_SOURCE_DIR}/
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_SOURCE_DIR}/contrib/libs/protobuf/src
+)
+target_proto_outs(cpp-reverse_geocoder-proto
+ --cpp_out=${CMAKE_BINARY_DIR}/
+ --cpp_styleguide_out=${CMAKE_BINARY_DIR}/
+)
diff --git a/library/cpp/reverse_geocoder/proto/geo_data.proto b/library/cpp/reverse_geocoder/proto/geo_data.proto
new file mode 100644
index 0000000000..00ecb48bec
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/geo_data.proto
@@ -0,0 +1,42 @@
+package NReverseGeocoder.NProto;
+
+message TGeoData {
+ required uint64 Magic = 1;
+ required uint64 Version = 2;
+ optional uint64 Points = 3;
+ optional uint64 PointsNumber = 4;
+ optional uint64 PointsCrc32 = 5;
+ optional uint64 Edges = 6;
+ optional uint64 EdgesNumber = 7;
+ optional uint64 EdgesCrc32 = 8;
+ optional uint64 EdgeRefs = 9;
+ optional uint64 EdgeRefsNumber = 10;
+ optional uint64 EdgeRefsCrc32 = 11;
+ optional uint64 Parts = 12;
+ optional uint64 PartsNumber = 13;
+ optional uint64 PartsCrc32 = 14;
+ optional uint64 Polygons = 15;
+ optional uint64 PolygonsNumber = 16;
+ optional uint64 PolygonsCrc32 = 17;
+ optional uint64 PolygonRefs = 18;
+ optional uint64 PolygonRefsNumber = 19;
+ optional uint64 PolygonRefsCrc32 = 20;
+ optional uint64 Boxes = 21;
+ optional uint64 BoxesNumber = 22;
+ optional uint64 BoxesCrc32 = 23;
+ optional uint64 Blobs = 24;
+ optional uint64 BlobsNumber = 25;
+ optional uint64 BlobsCrc32 = 26;
+ optional uint64 Kvs = 27;
+ optional uint64 KvsNumber = 28;
+ optional uint64 KvsCrc32 = 29;
+ optional uint64 Regions = 30;
+ optional uint64 RegionsNumber = 31;
+ optional uint64 RegionsCrc32 = 32;
+ optional uint64 RawPolygons = 33;
+ optional uint64 RawPolygonsNumber = 34;
+ optional uint64 RawPolygonsCrc32 = 35;
+ optional uint64 RawEdgeRefs = 36;
+ optional uint64 RawEdgeRefsNumber = 37;
+ optional uint64 RawEdgeRefsCrc32 = 38;
+};
diff --git a/library/cpp/reverse_geocoder/proto/region.proto b/library/cpp/reverse_geocoder/proto/region.proto
new file mode 100644
index 0000000000..b782331628
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/region.proto
@@ -0,0 +1,32 @@
+package NReverseGeocoder.NProto;
+
+message TLocation {
+ required double Lat = 1;
+ required double Lon = 2;
+}
+
+message TPolygon {
+ required uint64 PolygonId = 1;
+ repeated TLocation Locations = 2;
+
+ enum EType {
+ TYPE_UNKNOWN = 0;
+ TYPE_INNER = 1;
+ TYPE_OUTER = 2;
+ }
+
+ required EType Type = 3;
+}
+
+message TKv {
+ required string K = 1;
+ required string V = 2;
+}
+
+message TRegion {
+ required uint64 RegionId = 1;
+ optional uint64 ParentId = 2;
+ repeated TPolygon Polygons = 3;
+ repeated TKv Kvs = 4;
+ repeated string Blobs = 5;
+}
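For orientation, a hedged sketch of filling a TRegion message from application code. The generated header path and the accessor names below follow the stock protoc C++ naming (lower-cased field names); the cpp_styleguide plugin used by this build adds further accessors but should not remove these, so treat the exact spellings as an assumption.

    #include <library/cpp/reverse_geocoder/proto/region.pb.h>

    NReverseGeocoder::NProto::TRegion MakeSampleRegion() {
        NReverseGeocoder::NProto::TRegion region;
        region.set_regionid(213);

        // One outer polygon with a single sample vertex.
        auto* polygon = region.add_polygons();
        polygon->set_polygonid(1);
        polygon->set_type(NReverseGeocoder::NProto::TPolygon::TYPE_OUTER);
        auto* p0 = polygon->add_locations();
        p0->set_lat(55.75);
        p0->set_lon(37.62);

        // Arbitrary key/value payload attached to the region.
        auto* kv = region.add_kvs();
        kv->set_k("name");
        kv->set_v("sample");

        return region;
    }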
diff --git a/library/cpp/reverse_geocoder/proto/ya.make b/library/cpp/reverse_geocoder/proto/ya.make
new file mode 100644
index 0000000000..b6f7156210
--- /dev/null
+++ b/library/cpp/reverse_geocoder/proto/ya.make
@@ -0,0 +1,10 @@
+PROTO_LIBRARY()
+
+SRCS(
+ geo_data.proto
+ region.proto
+)
+
+EXCLUDE_TAGS(GO_PROTO)
+
+END()
diff --git a/library/cpp/robots_txt/CMakeLists.darwin-x86_64.txt b/library/cpp/robots_txt/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..408bf12f04
--- /dev/null
+++ b/library/cpp/robots_txt/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,26 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+add_subdirectory(robotstxtcfg)
+
+add_library(library-cpp-robots_txt)
+target_link_libraries(library-cpp-robots_txt PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-robots_txt-robotstxtcfg
+ library-cpp-case_insensitive_string
+ library-cpp-charset
+ cpp-string_utils-url
+ library-cpp-uri
+)
+target_sources(library-cpp-robots_txt PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp
+)
diff --git a/library/cpp/robots_txt/CMakeLists.linux-aarch64.txt b/library/cpp/robots_txt/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..73a209cbbe
--- /dev/null
+++ b/library/cpp/robots_txt/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,27 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+add_subdirectory(robotstxtcfg)
+
+add_library(library-cpp-robots_txt)
+target_link_libraries(library-cpp-robots_txt PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-robots_txt-robotstxtcfg
+ library-cpp-case_insensitive_string
+ library-cpp-charset
+ cpp-string_utils-url
+ library-cpp-uri
+)
+target_sources(library-cpp-robots_txt PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp
+)
diff --git a/library/cpp/robots_txt/CMakeLists.linux-x86_64.txt b/library/cpp/robots_txt/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..73a209cbbe
--- /dev/null
+++ b/library/cpp/robots_txt/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,27 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+add_subdirectory(robotstxtcfg)
+
+add_library(library-cpp-robots_txt)
+target_link_libraries(library-cpp-robots_txt PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ cpp-robots_txt-robotstxtcfg
+ library-cpp-case_insensitive_string
+ library-cpp-charset
+ cpp-string_utils-url
+ library-cpp-uri
+)
+target_sources(library-cpp-robots_txt PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp
+)
diff --git a/library/cpp/robots_txt/CMakeLists.txt b/library/cpp/robots_txt/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/robots_txt/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/robots_txt/CMakeLists.windows-x86_64.txt b/library/cpp/robots_txt/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..408bf12f04
--- /dev/null
+++ b/library/cpp/robots_txt/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,26 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+add_subdirectory(robotstxtcfg)
+
+add_library(library-cpp-robots_txt)
+target_link_libraries(library-cpp-robots_txt PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ cpp-robots_txt-robotstxtcfg
+ library-cpp-case_insensitive_string
+ library-cpp-charset
+ cpp-string_utils-url
+ library-cpp-uri
+)
+target_sources(library-cpp-robots_txt PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robots_txt_parser.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/rules_handler.cpp
+)
diff --git a/library/cpp/robots_txt/constants.h b/library/cpp/robots_txt/constants.h
new file mode 100644
index 0000000000..e5e2a57e18
--- /dev/null
+++ b/library/cpp/robots_txt/constants.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <util/generic/size_literals.h>
+#include <util/system/defaults.h>
+
+
+constexpr auto robots_max = 500_KB;
+constexpr auto max_rules_count = 10'000;
+constexpr auto max_rule_length = 10_KB;
diff --git a/library/cpp/robots_txt/prefix_tree.cpp b/library/cpp/robots_txt/prefix_tree.cpp
new file mode 100644
index 0000000000..f7b1848a43
--- /dev/null
+++ b/library/cpp/robots_txt/prefix_tree.cpp
@@ -0,0 +1,172 @@
+#include <cstring>
+#include <algorithm>
+
+#include "prefix_tree.h"
+
+TPrefixTreeNodeElement::TPrefixTreeNodeElement()
+ : Key(nullptr)
+ , KeyLen(0)
+ , Val(-1)
+ , Index(-1)
+{
+}
+
+TPrefixTreeNodeElement::TPrefixTreeNodeElement(const char* key, i32 keyLen = 0, i32 val = -1, i32 index = -1)
+ : Key(key)
+ , KeyLen(keyLen)
+ , Val(val)
+ , Index(index)
+{
+}
+
+TPrefixTreeNode::TPrefixTreeNode()
+ : Elements()
+{
+}
+
+int TPrefixTreeNode::Find(char ch) const {
+ for (size_t i = 0; i < Elements.size(); ++i)
+ if (ch == *(Elements[i].Key))
+ return i;
+ return -1;
+}
+
+void TPrefixTreeNode::Set(const char* key, i32 keyLen, i32 val, i32 index) {
+ TPrefixTreeNodeElement element(key, keyLen, val, index);
+ int i = Find(*key);
+ if (i < 0)
+ Elements.push_back(element);
+ else
+ Elements[i] = element;
+}
+
+void TPrefixTreeNode::Dump(FILE* logFile) const {
+ if (!logFile)
+ logFile = stderr;
+ fprintf(logFile, "size=%" PRISZT "\n", Elements.size());
+ static char b[1234];
+ for (size_t i = 0; i < Elements.size(); ++i) {
+ strncpy(b, Elements[i].Key, Elements[i].KeyLen);
+ b[Elements[i].KeyLen] = 0;
+ fprintf(logFile, "{key=[%s]:%d, val=%d, index=%d}\n", b, Elements[i].KeyLen, Elements[i].Val, Elements[i].Index);
+ }
+}
+
+void TPrefixTree::Dump(FILE* logFile) const {
+ if (!logFile)
+ logFile = stderr;
+ fprintf(logFile, "%" PRISZT " nodes\n", Nodes.size());
+ for (size_t i = 0; i < Nodes.size(); ++i) {
+ fprintf(logFile, "%" PRISZT ": ", i);
+ Nodes[i].Dump(logFile);
+ fprintf(logFile, "\n");
+ }
+}
+
+TPrefixTree::TPrefixTree(int maxSize) {
+ Init(maxSize);
+}
+
+void TPrefixTree::Init(int maxSize) {
+ Nodes.clear();
+ Nodes.reserve(std::max(maxSize + 1, 1));
+ Nodes.push_back(TPrefixTreeNode());
+}
+
+void TPrefixTree::Clear() {
+ Nodes.clear();
+ Init(0);
+}
+
+void TPrefixTree::Add(const char* s, i32 index) {
+ AddInternal(s, Nodes[0], index);
+}
+
+void TPrefixTree::AddInternal(const char* s, TPrefixTreeNode& node, i32 index) {
+ if (!s || !*s)
+ return;
+
+ int i = node.Find(*s);
+ if (i >= 0) {
+ TPrefixTreeNodeElement& d = node.Elements[i];
+ const char* p = d.Key;
+ while (*s && (p - d.Key) < d.KeyLen && *s == *p)
+ ++s, ++p;
+
+ if (*s) {
+ if ((p - d.Key) < d.KeyLen) {
+ Nodes.push_back(TPrefixTreeNode());
+ Nodes.back().Set(p, d.KeyLen - (p - d.Key), d.Val, d.Index);
+ Nodes.back().Set(s, strlen(s), -1, index);
+
+ d.Val = Nodes.size() - 1;
+ d.KeyLen = p - d.Key;
+ d.Index = INDEX_BOUND;
+ } else {
+ if (d.Val != -1 && index < d.Index)
+ AddInternal(s, Nodes[d.Val], index);
+ }
+ } else {
+ if ((p - d.Key) < d.KeyLen) {
+ Nodes.push_back(TPrefixTreeNode());
+ Nodes.back().Set(p, d.KeyLen - (p - d.Key), d.Val, d.Index);
+ d.Val = Nodes.size() - 1;
+ d.KeyLen = p - d.Key;
+ d.Index = index;
+ } else {
+ d.Index = std::min(d.Index, index);
+ }
+ }
+ } else {
+ node.Set(s, strlen(s), -1, index);
+ }
+}
+
+int TPrefixTree::GetMemorySize() const {
+ int res = Nodes.capacity() * sizeof(TPrefixTreeNode);
+ for (size_t i = 0; i < Nodes.size(); ++i)
+ res += Nodes[i].Elements.capacity() * sizeof(TPrefixTreeNodeElement);
+ return res;
+}
+
+void TPrefixTree::Compress() {
+ Nodes.shrink_to_fit();
+ for (size_t i = 0; i < Nodes.size(); ++i)
+ Nodes[i].Elements.shrink_to_fit();
+}
+
+i32 TPrefixTree::MinPrefixIndex(const char* s) const {
+ if (!*s)
+ return -1;
+ int i = Nodes[0].Find(*s);
+ if (i < 0)
+ return -1;
+ const TPrefixTreeNodeElement* d = &Nodes[0].Elements[i];
+
+ const char* p = d->Key;
+ if (!p || !*p)
+ return -1;
+
+ i32 result = INDEX_BOUND;
+ i32 nodeIndex = 0;
+ while (*s == *p) {
+ if (++p - d->Key >= d->KeyLen)
+ result = std::min(result, d->Index);
+ if (!*++s)
+ break;
+
+ if (p - d->Key >= d->KeyLen) {
+ nodeIndex = d->Val;
+ if (nodeIndex == -1)
+ break;
+ i = Nodes[nodeIndex].Find(*s);
+ if (i < 0)
+ break;
+ d = &Nodes[nodeIndex].Elements[i];
+ p = d->Key;
+ if (!p || !*p)
+ break;
+ }
+ }
+ return result < INDEX_BOUND ? result : -1;
+}
diff --git a/library/cpp/robots_txt/prefix_tree.h b/library/cpp/robots_txt/prefix_tree.h
new file mode 100644
index 0000000000..5feafcb74d
--- /dev/null
+++ b/library/cpp/robots_txt/prefix_tree.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <util/generic/ptr.h>
+#include <util/generic/vector.h>
+#include <cstdio>
+#include <util/generic/noncopyable.h>
+
+struct TPrefixTreeNodeElement {
+ const char* Key;
+ i32 KeyLen;
+ i32 Val;
+ i32 Index;
+
+ TPrefixTreeNodeElement();
+ TPrefixTreeNodeElement(const char*, i32, i32, i32);
+};
+
+class TPrefixTreeNode {
+public:
+ TVector<TPrefixTreeNodeElement> Elements;
+ TPrefixTreeNode();
+
+ int Find(char) const;
+ void Set(const char*, i32, i32, i32);
+ void Dump(FILE*) const;
+};
+
+class TPrefixTree : TNonCopyable {
+private:
+ static const i32 INDEX_BOUND = 1 << 30;
+
+ TVector<TPrefixTreeNode> Nodes;
+
+public:
+ void Init(int);
+ TPrefixTree(int);
+
+ void Add(const char*, i32);
+ i32 MinPrefixIndex(const char*) const;
+ void Clear();
+ void Dump(FILE*) const;
+ int GetMemorySize() const;
+ void Compress();
+
+private:
+ void AddInternal(const char*, TPrefixTreeNode&, i32);
+};
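A small sketch of the TPrefixTree interface declared above. Add() stores raw pointers to the supplied strings, so they must outlive the tree (string literals are fine), and MinPrefixIndex() returns the smallest index among stored keys that are prefixes of the query, or -1. Keys are added longest-first with ascending indices, mirroring how the robots.txt rules handler feeds it; the sample paths are invented.

    #include <library/cpp/robots_txt/prefix_tree.h>
    #include <util/system/yassert.h>

    void PrefixTreeSketch() {
        TPrefixTree tree(3);
        tree.Add("/foo/bar", 0);
        tree.Add("/foo", 1);
        tree.Add("/img", 2);
        tree.Compress();

        Y_ASSERT(tree.MinPrefixIndex("/foo/bar/baz") == 0);   // both keys match, the smaller index wins
        Y_ASSERT(tree.MinPrefixIndex("/foo/x") == 1);         // only "/foo" is a prefix here
        Y_ASSERT(tree.MinPrefixIndex("/img/logo.png") == 2);
        Y_ASSERT(tree.MinPrefixIndex("/css/site.css") == -1); // no stored key is a prefix
    }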
diff --git a/library/cpp/robots_txt/prefix_tree_rules_handler.cpp b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
new file mode 100644
index 0000000000..8dd579d060
--- /dev/null
+++ b/library/cpp/robots_txt/prefix_tree_rules_handler.cpp
@@ -0,0 +1,706 @@
+#include "robots_txt.h"
+
+#include <util/digest/fnv.h>
+#include <util/system/tls.h>
+#include <util/generic/buffer.h>
+#include <util/generic/yexception.h>
+
+namespace {
+
+TString NormalizeRule(TStringBuf rule) {
+ TString result;
+ result.reserve(rule.size() + 1);
+
+ // remove consecutive '*'
+ for (auto c : rule) {
+ if (c != '*' || !result.EndsWith('*')) {
+ result.append(c);
+ }
+ }
+
+ if (rule == "*") {
+ result = "/*";
+ return result;
+ }
+
+ // unify suffix
+ if (result.EndsWith('$')) {
+ result.pop_back();
+ } else if (!result.EndsWith('*')) {
+ result.append('*');
+ }
+
+ return result;
+}
+
+// Prefix rules
+bool IsPrefixRule(TStringBuf rule) {
+ return rule.EndsWith('*') && !TStringBuf(rule.begin(), rule.end() - 1).Contains('*');
+}
+
+// Converts a rule to its internal representation:
+// prefix rules keep an uppercase type: ("/foo", 'D') -> ("/foo", 'D')
+// generic rules get a lowercase type: ("/*foo", 'D') -> ("/*foo*", 'd'), ("/*foo$", 'A') -> ("/*foo", 'a')
+// The uppercase/lowercase type is what distinguishes the two kinds later on.
+std::pair<TString, char> ConvertRule(TStringBuf rule, char type) {
+ switch (type) {
+ case 'H':
+ case 'S':
+ case 'C':
+ case 'P':
+ return {TString(rule), type};
+ case 'A':
+ case 'D':
+ break;
+ default:
+ return {{}, type};
+ }
+
+ auto result = NormalizeRule(rule);
+ if (IsPrefixRule(result)) {
+ result.pop_back(); // remove extra '*' from the end
+ } else {
+ type = tolower(type);
+ }
+
+ return {std::move(result), type};
+}
+
+} // namespace
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler(
+ TBotIdSet supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler(
+ std::initializer_list<ui32> supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(TBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeRobotsTxtRulesHandler(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(supportedBotIds, robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+bool TPrefixTreeRobotsTxtRulesHandler::Empty(const ui32 botId) const {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ return !botInfo || (botInfo->BufferPosition <= sizeof(botInfo->BufferPosition));
+}
+
+TRobotsTxtRulesIterator TPrefixTreeRobotsTxtRulesHandler::GetRulesIterator(const ui32 botId) const {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ if (!botInfo) {
+ return {};
+ }
+ return TRobotsTxtRulesIterator(botInfo->Buffer.Get() + sizeof(botInfo->BufferPosition), botInfo->Buffer.Get() + botInfo->BufferPosition);
+}
+
+size_t TPrefixTreeRobotsTxtRulesHandler::GetMemorySize() {
+ size_t allBotsSize = 0;
+ for (const auto& botInfo : BotIdToPrefixTreeBotInfo) {
+ if (!botInfo) {
+ continue;
+ }
+
+ allBotsSize += botInfo->PrefixRules.GetMemorySize()
+ + botInfo->BufferSize * sizeof(char)
+ + botInfo->ComplexRulesSize * sizeof(char**)
+ + botInfo->RulesSize * sizeof(char*) + (1 << 8);
+ }
+ return allBotsSize;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::ClearInternal(const ui32 botId) {
+ if (botId >= BotIdToPrefixTreeBotInfo.size()) {
+ return;
+ }
+ BotIdToPrefixTreeBotInfo[botId].Reset();
+ TRobotsTxtRulesHandlerBase::ClearInternal(botId);
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::OptimizeSize() {
+ ResetOptimized();
+
+ TMap<ui64, ui32> hashToBotId;
+ for (auto botId : LoadedBotIds) {
+ auto& botInfo = BotIdToPrefixTreeBotInfo[botId];
+ if (botInfo->BufferPosition <= sizeof(ui32)) {
+ botInfo.Reset();
+ LoadedBotIds.remove(botId);
+ continue;
+ }
+
+ ui64 hash = FnvHash<ui64>(botInfo->Buffer.Get(), botInfo->BufferPosition);
+ if (auto p = hashToBotId.FindPtr(hash)) {
+ OptimizedBotIdToStoredBotId[botId] = *p;
+ ClearInternal(botId);
+ botInfo.Reset();
+ } else {
+ hashToBotId[hash] = botId;
+ }
+ }
+
+ if (IsFullTotal()) {
+ DoAllowAll();
+ return false;
+ }
+
+ return true;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::Clear() {
+ for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId)
+ if (IsBotIdSupported(botId))
+ ClearInternal(botId);
+ TRobotsTxtRulesHandlerBase::Clear();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::ResizeBuffer(const ui32 botId, int newSize) {
+ auto& botInfo = GetInfo(botId);
+ TArrayHolder<char> newBuffer(new char[newSize]);
+ memcpy(newBuffer.Get(), botInfo.Buffer.Get(), std::min(botInfo.BufferSize, newSize));
+ botInfo.Buffer.Swap(newBuffer);
+ botInfo.BufferSize = newSize;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::AddRule(const ui32 botId, TStringBuf rule, char type) {
+ if (rule.empty() || rule.Contains('\0')) {
+ return true;
+ }
+
+ auto& botInfo = GetInfo(botId);
+
+ if (IsFull(botId, rule.size())) {
+ DoAllowAll();
+ return false;
+ }
+
+ auto [convertedRule, convertedType] = ConvertRule(rule, type);
+ const auto len = convertedRule.size() + 2; // 1 byte for convertedType and another for '\0'
+
+ if (auto newPos = botInfo.BufferPosition + len; newPos >= size_t(botInfo.BufferSize)) {
+ size_t newSize = botInfo.BufferSize;
+ while (newPos >= newSize)
+ newSize *= 2;
+ ResizeBuffer(botId, newSize);
+ }
+
+ auto out = botInfo.Buffer.Get() + botInfo.BufferPosition;
+ *out++ = convertedType;
+ strcpy(out, convertedRule.data());
+ botInfo.BufferPosition += len;
+
+ if (type == 'A' || type == 'D') {
+ botInfo.RulesPosition++;
+ }
+
+ return true;
+}
+
+const char* TPrefixTreeRobotsTxtRulesHandler::GetRule(const ui32 botId, const char* s, char type) const {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ if (!botInfo) {
+ return nullptr;
+ }
+
+ int m = botInfo->RulesPosition + 1;
+ int k = botInfo->PrefixRules.MinPrefixIndex(s);
+ if (k >= 0)
+ m = k;
+ char* rule;
+ int j;
+ for (int i = 0; i < botInfo->ComplexRulesPosition; ++i) {
+ rule = *botInfo->ComplexRules.Get()[i];
+ j = botInfo->ComplexRules.Get()[i] - botInfo->Rules.Get();
+ if (j >= m)
+ break;
+ if (CheckRule(s, rule)) {
+ m = j;
+ break;
+ }
+ }
+ if (m >= botInfo->RulesPosition)
+ return nullptr;
+ return toupper(*(botInfo->Rules.Get()[m] - 1)) == type ? botInfo->Rules.Get()[m] : nullptr;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll(const ui32 botId) const {
+ const auto id = GetMappedBotId(botId, false);
+ auto& botInfo = BotIdToPrefixTreeBotInfo[id ? *id : robotstxtcfg::id_anybot];
+ return botInfo && botInfo->AllowAll;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsAllowAll() const {
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId)
+ if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsAllowAll(botId)) {
+ return false;
+ }
+
+ return true;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll(const ui32 botId, bool useAny) const {
+ const auto id = GetMappedBotId(botId, false);
+ if (id) {
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[*id];
+ return botInfo && botInfo->DisallowAll;
+ }
+
+ auto& botInfo = BotIdToPrefixTreeBotInfo[robotstxtcfg::id_anybot];
+ return useAny && botInfo && botInfo->DisallowAll;
+}
+
+inline bool TPrefixTreeRobotsTxtRulesHandler::IsDisallowAll() const {
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId)
+ if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId))
+ return false;
+
+ return true;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::DoAllowAll() {
+ using robotstxtcfg::id_anybot;
+
+ // Drop all bots to default
+ SupportedBotIds.insert(id_anybot);
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (IsBotIdSupported(botId)) {
+ ClearInternal(botId);
+ OptimizedBotIdToStoredBotId[botId] = id_anybot;
+ LoadedBotIds.insert(botId);
+ }
+ }
+
+ // Initialize anybot with "allow all" rule
+ AddRule(id_anybot, "/", 'A');
+ GetInfo(id_anybot).AllowAll = true;
+ SaveRulesToBuffer();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::DoDisallowAll() {
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (!IsBotIdSupported(botId))
+ continue;
+ ClearInternal(botId);
+ if (botId == robotstxtcfg::id_anybot) {
+ auto& botInfo = GetInfo(botId);
+ AddRule(botId, "/", 'D');
+ botInfo.DisallowAll = true;
+ SaveRulesToBuffer();
+ } else {
+ OptimizedBotIdToStoredBotId[botId] = robotstxtcfg::id_anybot;
+ }
+ LoadedBotIds.insert(botId);
+ }
+}
+
+const char* TPrefixTreeRobotsTxtRulesHandler::IsDisallow(const ui32 botId, const char* s, bool useAny) const {
+ const auto id = GetMappedBotId(botId, useAny);
+ if (!id)
+ return nullptr;
+
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[*id];
+ if (botInfo && IsDisallowAll(*id, useAny)) {
+ int index = (const_cast<TPrefixTreeRobotsTxtRulesHandler*>(this))->FindRuleAll(*botInfo, 'D');
+ if (index < 0) { //o_O
+ return botInfo->Rules.Get()[0];
+ } else {
+ return botInfo->Rules.Get()[index];
+ }
+ }
+
+ return GetRule(*id, s, 'D');
+}
+
+const char* TPrefixTreeRobotsTxtRulesHandler::IsAllow(const ui32 botId, const char* s) const {
+ const auto id = GetMappedBotId(botId, true);
+ if (auto p = GetRule(*id, s, 'A'))
+ return p;
+ return GetRule(*id, s, 'D') ? nullptr : "/";
+}
+
+int TPrefixTreeRobotsTxtRulesHandler::StrLenWithoutStars(const char* s) {
+ int len = 0;
+
+ for (size_t index = 0; s[index]; ++index) {
+ if (s[index] != '*') {
+ ++len;
+ }
+ }
+
+ return len;
+}
+
+int TPrefixTreeRobotsTxtRulesHandler::TraceBuffer(const ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos) {
+ CheckBotIdValidity(botId);
+ auto& prefixBotInfo = GetInfo(botId);
+ TBotInfo& botInfo = BotIdToInfo[botId];
+
+ bool store = countRules >= 0;
+ if (store) {
+ prefixBotInfo.Rules.Reset(new char*[prefixBotInfo.RulesSize = countRules]);
+ }
+
+ int beg = -1, n = 0;
+ *((int*)prefixBotInfo.Buffer.Get()) = prefixBotInfo.BufferSize;
+ for (size_t i = sizeof(prefixBotInfo.BufferPosition); i < prefixBotInfo.BufferPosition; ++i)
+ if (prefixBotInfo.Buffer.Get()[i] == '\n' || prefixBotInfo.Buffer.Get()[i] == 0) {
+ if (beg < 0 || beg + 1 == (int)i)
+ continue;
+
+ char* s = prefixBotInfo.Buffer.Get() + beg;
+ if (store) {
+ switch (*s) {
+ case 'H':
+ HostDirective = s + 1;
+ break;
+ case 'S':
+ SiteMaps.insert(s + 1);
+ break;
+ case 'C':
+ ParseCrawlDelay(s + 1, botInfo.CrawlDelay);
+ break;
+ case 'P':
+ CleanParams.insert(s + 1);
+ break;
+ default:
+ prefixBotInfo.Rules.Get()[n] = s + 1;
+ (*ruleInfos).Get()[n].Len = StrLenWithoutStars(s + 1);
+ (*ruleInfos).Get()[n].Allow = toupper(*s) == 'A';
+
+ prefixBotInfo.HasAllow |= toupper(*s) == 'A';
+ prefixBotInfo.HasDisallow |= toupper(*s) == 'D';
+ break;
+ }
+ }
+ n += (*s != 'H' && *s != 'S' && *s != 'C' && *s != 'P');
+ beg = -1;
+ } else if (beg < 0)
+ beg = i;
+
+ return n;
+}
+
+int TPrefixTreeRobotsTxtRulesHandler::FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, const char neededType) {
+ static const char* all[] = {"*", "/", "*/", "/*", "*/*"};
+ for (int ruleNumber = prefixBotInfo.RulesSize - 1; ruleNumber >= 0; --ruleNumber) {
+ const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber];
+ char ruleType = *(curRule - 1);
+
+ if (strlen(curRule) > 3)
+ break;
+ if (neededType != ruleType)
+ continue;
+
+ for (size_t i = 0; i < sizeof(all) / sizeof(char*); ++i)
+ if (strcmp(all[i], curRule) == 0)
+ return ruleNumber;
+ }
+ return -1;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow) {
+ for (int ruleNumber = ruleAllAllow - 1; ruleNumber >= 0; --ruleNumber) {
+ const char* curRule = prefixBotInfo.Rules.Get()[ruleNumber];
+ char ruleType = *(curRule - 1);
+ if (tolower(ruleType) == 'd')
+ return true;
+ }
+ return false;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::CheckAllowDisallowAll(const ui32 botId, const bool checkDisallow) {
+ CheckBotIdValidity(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ if (botInfo.RulesSize == 0)
+ return !checkDisallow;
+ if (botInfo.RulesPosition <= 0)
+ return 0;
+
+ if (checkDisallow)
+ return !botInfo.HasAllow && FindRuleAll(botInfo, 'D') >= 0;
+ int ruleAllAllow = FindRuleAll(botInfo, 'A');
+ if (ruleAllAllow == -1)
+ return !botInfo.HasDisallow;
+ return !HasDisallowRulePrevAllowAll(botInfo, ruleAllAllow);
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::SortRules(
+ TPrefixTreeBotInfo& prefixBotInfo,
+ size_t count,
+ const TArrayHolder<TRuleInfo>* ruleInfos) {
+ TVector<size_t> indexes(count);
+ for (size_t index = 0; index < count; ++index)
+ indexes[index] = index;
+
+ TRulesSortFunc sortFunc(ruleInfos);
+ std::sort(indexes.begin(), indexes.end(), sortFunc);
+
+ TArrayHolder<char*> workingCopy;
+ workingCopy.Reset(new char*[count]);
+
+ for (size_t index = 0; index < count; ++index)
+ workingCopy.Get()[index] = prefixBotInfo.Rules.Get()[index];
+ for (size_t index = 0; index < count; ++index)
+ prefixBotInfo.Rules.Get()[index] = workingCopy.Get()[indexes[index]];
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::SaveRulesToBuffer() {
+ // Sitemaps, clean-params and the Host directive were removed from the prefix tree, so re-add them here.
+ for (const auto& sitemap: SiteMaps)
+ AddRule(robotstxtcfg::id_anybot, sitemap, 'S');
+ for (const auto& param : CleanParams)
+ AddRule(robotstxtcfg::id_anybot, param, 'P');
+ if (!HostDirective.empty())
+ AddRule(robotstxtcfg::id_anybot, HostDirective, 'H');
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::SaveRulesFromBuffer(const ui32 botId) {
+ CheckBotIdValidity(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ TArrayHolder<TRuleInfo> ruleInfos;
+
+ int n = TraceBuffer(botId, -1, nullptr), countPrefix = 0;
+ ruleInfos.Reset(new TRuleInfo[n]);
+ botInfo.RulesPosition = TraceBuffer(botId, n, &ruleInfos);
+ assert(botInfo.RulesPosition == n);
+
+ SortRules(botInfo, n, &ruleInfos);
+
+ botInfo.DisallowAll = CheckAllowDisallowAll(botId, true);
+ botInfo.AllowAll = CheckAllowDisallowAll(botId, false);
+
+ for (int i = 0; i < n; ++i)
+ countPrefix += !!isupper(*(botInfo.Rules.Get()[i] - 1));
+
+ botInfo.PrefixRules.Init(countPrefix);
+ botInfo.ComplexRules.Reset(new char**[botInfo.ComplexRulesSize = n - countPrefix]);
+ botInfo.ComplexRulesPosition = 0;
+
+ for (int i = 0; i < n; ++i) {
+ char* s = botInfo.Rules.Get()[i];
+ if (isupper(*(s - 1)))
+ botInfo.PrefixRules.Add(s, i);
+ else
+ botInfo.ComplexRules.Get()[botInfo.ComplexRulesPosition++] = &botInfo.Rules.Get()[i];
+ }
+ botInfo.PrefixRules.Compress();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::AfterParse(const ui32 botId) {
+ CheckBotIdValidity(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ ResizeBuffer(botId, botInfo.BufferPosition);
+ SaveRulesFromBuffer(botId);
+
+ if (botInfo.RulesPosition == 0) {
+ AddRule(botId, "/", 'A');
+ }
+}
+
+TPrefixTreeRobotsTxtRulesHandler::TPrefixTreeBotInfo& TPrefixTreeRobotsTxtRulesHandler::GetInfo(ui32 botId) {
+ Y_ENSURE(botId < robotstxtcfg::max_botid);
+ auto& res = BotIdToPrefixTreeBotInfo[botId];
+ if (!res) {
+ res = MakeHolder<TPrefixTreeBotInfo>();
+ }
+ return *res;
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::CheckRule(const char* s, const char* rule) {
+ const char* r = rule;
+ const char* s_end = s + strlen(s);
+ const char* r_end = r + strlen(r);
+ // assert( r && !strstr(r, "**") );
+ for (; *s; ++s) {
+ if ((s_end - s + 1) * 2 < (r_end - r))
+ return 0;
+ while (*r == '*')
+ ++r;
+
+ if (*s == *r) {
+ ++r;
+ } else {
+ while (r != rule && *r != '*')
+ --r;
+
+ if (*r != '*')
+ return 0;
+ if (*r == '*')
+ ++r;
+ if (*r == *s)
+ ++r;
+ }
+ }
+ return !*r || (!*(r + 1) && *r == '*');
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::IsFull(ui32 botId, size_t length) const {
+ Y_ENSURE(botId < robotstxtcfg::max_botid);
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[botId];
+ if (!botInfo) {
+ return false;
+ }
+
+ return (size_t(botInfo->RulesPosition) >= MaxRulesNumber) || (botInfo->BufferPosition + length + 300 > size_t(RobotsMaxSize));
+}
+
+bool TPrefixTreeRobotsTxtRulesHandler::IsFullTotal() const {
+ size_t allBotsRulesCount = 0;
+ size_t allBotsBufferSize = 0;
+
+ for (const auto& botInfo : BotIdToPrefixTreeBotInfo) {
+ if (botInfo) {
+ allBotsRulesCount += botInfo->RulesPosition;
+ allBotsBufferSize += botInfo->BufferPosition;
+ }
+ }
+
+ return (allBotsRulesCount >= MaxRulesNumber) || (allBotsBufferSize + 300 > size_t(RobotsMaxSize));
+}
+
+size_t TPrefixTreeRobotsTxtRulesHandler::GetPacked(const char*& data) const {
+ Y_STATIC_THREAD(TBuffer)
+ packedRepresentation;
+
+ // calculate the size needed for the packed data
+ size_t totalPackedSize = sizeof(ui32); // num of botids
+ ui32 numOfSupportedBots = 0;
+
+ for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (!IsBotIdSupported(botId)) {
+ continue;
+ }
+
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ // botId + packedDataSize + packedData
+ totalPackedSize += sizeof(ui32) + (botInfo ? botInfo->BufferPosition : sizeof(ui32));
+ ++numOfSupportedBots;
+ }
+
+ ((TBuffer&)packedRepresentation).Reserve(totalPackedSize);
+
+ // fill packed data
+ char* packedPtr = ((TBuffer&)packedRepresentation).Data();
+
+ *((ui32*)packedPtr) = numOfSupportedBots;
+ packedPtr += sizeof(ui32);
+
+ for (size_t botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (!IsBotIdSupported(botId)) {
+ continue;
+ }
+
+ const auto& botInfo = BotIdToPrefixTreeBotInfo[GetNotOptimizedBotId(botId)];
+ memcpy(packedPtr, &botId, sizeof(ui32));
+ packedPtr += sizeof(ui32);
+
+ if (botInfo) {
+ *((ui32*)botInfo->Buffer.Get()) = botInfo->BufferPosition;
+ memcpy(packedPtr, botInfo->Buffer.Get(), botInfo->BufferPosition);
+ packedPtr += botInfo->BufferPosition;
+ } else {
+ // In the absence of bot info we serialize only the size of its buffer; that size is sizeof(ui32) == 4, since an empty buffer holds just its own 4-byte size field.
+ ui32 emptyBufferPosition = sizeof(ui32);
+ memcpy(packedPtr, &emptyBufferPosition, sizeof(ui32));
+ packedPtr += sizeof(ui32);
+ }
+ }
+
+ data = ((TBuffer&)packedRepresentation).Data();
+ return totalPackedSize;
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::LoadPacked(const char* botsData, const char* botsDataEnd) {
+ Clear();
+
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ ui32 numOfBots = *((ui32*)botsData);
+ botsData += sizeof(ui32);
+
+ for (ui32 botIndex = 0; botIndex < numOfBots; ++botIndex) {
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ ui32 botId = 0;
+ memcpy(&botId, botsData, sizeof(ui32));
+ botsData += sizeof(ui32);
+
+ // skip bot ids that are not supported for now
+ if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId)) {
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ ui32 oneBotPackedSize = 0;
+ memcpy(&oneBotPackedSize, botsData, sizeof(ui32));
+ botsData += oneBotPackedSize;
+
+ continue;
+ }
+
+ //SupportedBotIds.insert(botId);
+
+ auto& botInfo = GetInfo(botId);
+
+ if (Y_UNLIKELY(botsDataEnd != nullptr && botsData >= botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ static_assert(sizeof(botInfo.BufferSize) == sizeof(ui32), "BufferSize must be 4 bytes");
+ static_assert(sizeof(botInfo.BufferPosition) == sizeof(ui32), "BufferPosition must be 4 bytes");
+
+ memcpy(&botInfo.BufferSize, botsData, sizeof(ui32));
+ memcpy(&botInfo.BufferPosition, botsData, sizeof(ui32));
+
+ if (Y_UNLIKELY(botsDataEnd != nullptr && (botsData + botInfo.BufferSize) > botsDataEnd)) {
+ ythrow yexception() << "Buffer overflow";
+ }
+
+ botInfo.Buffer.Reset(new char[botInfo.BufferSize]);
+ memcpy(botInfo.Buffer.Get(), botsData, botInfo.BufferSize);
+ SaveRulesFromBuffer(botId);
+
+ if (botInfo.BufferSize > (int)sizeof(ui32)) { // empty robots data means there is no section for this bot
+ LoadedBotIds.insert(botId);
+ }
+
+ botsData += botInfo.BufferSize;
+ }
+
+ OptimizeSize();
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, FILE* dumpFile) {
+ if (!dumpFile)
+ dumpFile = stderr;
+ fprintf(dumpFile, "User-Agent: %s\n", robotstxtcfg::GetFullName(botId).data());
+ for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next())
+ fprintf(dumpFile, "%s: %s\n", DirTypeToName(it.GetRuleType()), it.GetInitialRule().data());
+}
+
+void TPrefixTreeRobotsTxtRulesHandler::Dump(const ui32 botId, IOutputStream& out) {
+ out << "User-Agent: " << robotstxtcfg::GetFullName(botId) << Endl;
+ for (TRobotsTxtRulesIterator it = GetRulesIterator(botId); it.HasRule(); it.Next())
+ out << DirTypeToName(it.GetRuleType()) << ": " << it.GetInitialRule() << Endl;
+}
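A sketch of the public TPrefixTreeRobotsTxtRulesHandler surface end to end. Handlers are normally filled by TRobotsTxtRulesHandlerBase::ParseRules together with a TRobotsTxtParser, which is not shown in this file, so the sketch only exercises DoDisallowAll() and the GetPacked()/LoadPacked() round trip; the bot id constant is assumed to be reachable through robots_txt.h.

    #include <library/cpp/robots_txt/robots_txt.h>
    #include <util/system/yassert.h>

    void RobotsRoundTripSketch() {
        TRobotsTxt source;       // TRobotsTxt is an alias for TPrefixTreeRobotsTxtRulesHandler
        source.DoDisallowAll();  // equivalent to a "Disallow: /" section for every supported bot

        // Serialize into the handler's packed representation ...
        const char* packed = nullptr;
        const size_t size = source.GetPacked(packed);

        // ... and restore it into a fresh handler.
        TRobotsTxt restored;
        restored.LoadPacked(packed, packed + size);

        Y_ASSERT(restored.IsDisallowAll());
        Y_ASSERT(restored.IsDisallow(robotstxtcfg::id_yandexbot, "/any/path") != nullptr);
    }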
diff --git a/library/cpp/robots_txt/robots_txt.h b/library/cpp/robots_txt/robots_txt.h
new file mode 100644
index 0000000000..5ee48fb14f
--- /dev/null
+++ b/library/cpp/robots_txt/robots_txt.h
@@ -0,0 +1,605 @@
+#pragma once
+
+#include "constants.h"
+#include "robots_txt_parser.h"
+#include "prefix_tree.h"
+#include "robotstxtcfg.h"
+
+#include <util/generic/noncopyable.h>
+#include <util/generic/map.h>
+#include <util/generic/maybe.h>
+#include <util/generic/ptr.h>
+#include <util/generic/set.h>
+
+#include <array>
+#include <utility>
+
+
+enum EDirectiveType {
+ USER_AGENT = 1,
+ DISALLOW = 2,
+ ALLOW = 3,
+ HOST = 4,
+ SITEMAP = 5,
+ CRAWL_DELAY = 6,
+ CLEAN_PARAM = 7,
+ UNKNOWN = 9,
+};
+
+enum EFormatErrorType {
+ ERROR_RULE_NOT_SLASH = 1,
+ ERROR_ASTERISK_MULTI = 2,
+ ERROR_HOST_MULTI = 3,
+ ERROR_ROBOTS_HUGE = 4,
+ ERROR_RULE_BEFORE_USER_AGENT = 5,
+ ERROR_RULE_HUGE = 6,
+ ERROR_HOST_FORMAT = 7,
+ ERROR_TRASH = 8,
+ ERROR_SITEMAP_FORMAT = 9,
+ ERROR_CRAWL_DELAY_FORMAT = 10,
+ ERROR_CRAWL_DELAY_MULTI = 11,
+ ERROR_CLEAN_PARAM_FORMAT = 12,
+
+ WARNING_EMPTY_RULE = 30,
+ WARNING_SUSPECT_SYMBOL = 31,
+ WARNING_UNKNOWN_FIELD = 33,
+ WARNING_UPPER_REGISTER = 34,
+ WARNING_SITEMAP = 35,
+};
+
+class TRobotsTxtRulesIterator {
+private:
+ const char* Begin = nullptr;
+ const char* End = nullptr;
+
+public:
+ TRobotsTxtRulesIterator() = default;
+ TRobotsTxtRulesIterator(const char* begin, const char* end);
+ void Next();
+ bool HasRule() const;
+ const char* GetRule() const;
+ TString GetInitialRule() const; // unlike GetRule(), it neither omits trailing '$' nor adds redundant '*'
+ EDirectiveType GetRuleType() const;
+
+ static EDirectiveType CharToDirType(char ch);
+};
+
+class TRobotsTxtRulesHandlerBase {
+public:
+ typedef TVector<std::pair<EFormatErrorType, int>> TErrorVector;
+
+ TRobotsTxtRulesHandlerBase(
+ TBotIdSet supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot);
+
+ TRobotsTxtRulesHandlerBase(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot);
+
+ virtual ~TRobotsTxtRulesHandlerBase();
+
+ int GetCrawlDelay(ui32 botId, bool* realInfo = nullptr) const;
+ int GetMinCrawlDelay(int defaultCrawlDelay = -1) const;
+ bool IsHandlingErrors() const;
+ const TString& GetHostDirective() const;
+ const TVector<TString> GetSiteMaps() const;
+ const TVector<TString> GetCleanParams() const;
+ const TErrorVector& GetErrors() const;
+ TVector<int> GetAcceptedLines(ui32 botId = robotstxtcfg::id_yandexbot) const;
+
+ template <class THostHandler>
+ static int ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host = nullptr);
+ static inline void ClearAllExceptCrossSection(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId);
+ static int CheckHost(const char* host);
+ static int CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl);
+ static int CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler);
+ static int CheckAndNormCleanParam(TString& s);
+ static int ParseCrawlDelay(const char* value, int& crawlDelay);
+ static EDirectiveType NameToDirType(const char* d);
+ static const char* DirTypeToName(EDirectiveType t);
+
+ void SetErrorsHandling(bool handleErrors);
+ void SetHostDirective(const char* hostDirective);
+ void SetCrawlDelay(ui32 botId, int crawlDelay);
+ void AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection);
+ void AddSiteMap(const char* sitemap);
+ void AddCleanParam(const char* cleanParam);
+ bool AddRuleWithErrorCheck(ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser);
+ int OnHost(ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler);
+
+ virtual void Clear();
+ virtual bool IsAllowAll(ui32 botId) const = 0;
+ virtual bool IsAllowAll() const = 0;
+ virtual bool IsDisallowAll(ui32 botId, bool useAny = true) const = 0;
+ virtual bool IsDisallowAll() const = 0;
+ virtual const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const = 0;
+ virtual const char* IsAllow(ui32 botId, const char* s) const = 0;
+ virtual TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const = 0;
+ virtual void Dump(ui32 botId, FILE* logFile) = 0;
+ virtual void Dump(ui32 botId, IOutputStream& out) = 0;
+ virtual bool Empty(ui32 botId) const = 0;
+ virtual void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) = 0;
+ virtual size_t GetPacked(const char*& data) const = 0;
+ virtual void AfterParse(ui32 botId) = 0;
+ virtual void DoAllowAll() = 0;
+ virtual void DoDisallowAll() = 0;
+ bool IsBotIdLoaded(ui32 botId) const;
+ bool IsBotIdSupported(ui32 botId) const;
+ ui32 GetNotOptimizedBotId(ui32 botId) const;
+ TMaybe<ui32> GetMappedBotId(ui32 botId, bool useAny = true) const;
+
+protected:
+ void CheckBotIdValidity(ui32 botId) const;
+ virtual bool OptimizeSize() = 0;
+
+private:
+ bool HandleErrors;
+
+protected:
+ struct TBotInfo {
+ int CrawlDelay;
+
+ TBotInfo()
+ : CrawlDelay(-1)
+ {
+ }
+ };
+
+ TBotIdSet LoadedBotIds;
+ TSet<TString> SiteMaps;
+ TSet<TString> CleanParams;
+ TString HostDirective;
+ TErrorVector Errors;
+ typedef std::pair<ui32, ui32> TBotIdAcceptedLine;
+ TVector<TBotIdAcceptedLine> AcceptedLines;
+ TVector<ui32> CrossSectionAcceptedLines;
+
+ TVector<TBotInfo> BotIdToInfo;
+ int CrawlDelay;
+ size_t RobotsMaxSize;
+ size_t MaxRulesNumber;
+ bool SaveDataForAnyBot;
+
+ TBotIdSet SupportedBotIds;
+ std::array<ui8, robotstxtcfg::max_botid> OptimizedBotIdToStoredBotId;
+
+ virtual bool IsFull(ui32 botId, size_t length) const = 0;
+ virtual bool IsFullTotal() const = 0;
+ virtual bool AddRule(ui32 botId, TStringBuf rule, char type) = 0;
+ //parts of ParseRules
+ inline static void CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines);
+ inline static void CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk);
+ inline static bool CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber);
+ inline static bool CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber);
+ inline static bool CheckSupportedBots(const TBotIdSet& currentBotIds, TBotIdSet& wasRuleForBot, const TBotIdSet& isSupportedBot);
+ inline static bool CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 lineNumber);
+ inline static bool ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host);
+ inline static bool ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value);
+ inline static bool AddRules(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const char* value,
+ char type,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot);
+
+ inline static bool ProcessCrawlDelay(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot,
+ const char* value);
+
+ inline static void ProcessUserAgent(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ TBotIdSet& wasRuleForBot,
+ TBotIdSet& isSupportedBot,
+ TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength,
+ const char* value);
+
+ bool CheckRobot(
+ const char* userAgent,
+ TBotIdSet& botIds,
+ const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength = nullptr) const;
+
+ virtual void ClearInternal(ui32 botId);
+
+ void AddError(EFormatErrorType type, int line);
+
+ void ResetOptimized() noexcept;
+};
+
+class TPrefixTreeRobotsTxtRulesHandler: public TRobotsTxtRulesHandlerBase, TNonCopyable {
+private:
+ static const int INIT_BUFFER_SIZE = 1 << 6;
+
+ struct TRuleInfo {
+ size_t Len;
+ bool Allow;
+ };
+
+ bool IsFull(ui32 botId, size_t length) const override;
+ bool IsFullTotal() const override;
+ bool AddRule(ui32 botId, TStringBuf rule, char type) override;
+ const char* GetRule(ui32 botId, const char* s, char type) const;
+ void ResizeBuffer(ui32 botId, int newSize);
+ void SaveRulesFromBuffer(ui32 botId);
+ int TraceBuffer(ui32 botId, int countRules, const TArrayHolder<TRuleInfo>* ruleInfos);
+ bool CheckAllowDisallowAll(ui32 botId, bool checkDisallow);
+ void SaveRulesToBuffer();
+ int StrLenWithoutStars(const char* s);
+
+protected:
+ class TRulesSortFunc {
+ private:
+ const TArrayHolder<TRuleInfo>* RuleInfos;
+
+ public:
+ TRulesSortFunc(const TArrayHolder<TRuleInfo>* ruleInfos)
+ : RuleInfos(ruleInfos)
+ {
+ }
+ bool operator()(const size_t& lhs, const size_t& rhs) {
+ const TRuleInfo& left = (*RuleInfos).Get()[lhs];
+ const TRuleInfo& right = (*RuleInfos).Get()[rhs];
+ return (left.Len == right.Len) ? left.Allow && !right.Allow : left.Len > right.Len;
+ }
+ };
+
+ struct TPrefixTreeBotInfo {
+ bool DisallowAll = false;
+ bool AllowAll = false;
+ bool HasDisallow = false;
+ bool HasAllow = false;
+
+ TArrayHolder<char> Buffer{new char[INIT_BUFFER_SIZE]};
+ ui32 BufferPosition = sizeof(BufferPosition);
+ int BufferSize = INIT_BUFFER_SIZE;
+
+ TArrayHolder<char*> Rules = nullptr;
+ int RulesPosition = 0;
+ int RulesSize = 0;
+
+ TArrayHolder<char**> ComplexRules = nullptr;
+ int ComplexRulesPosition = 0;
+ int ComplexRulesSize = 0;
+
+ TPrefixTree PrefixRules {0};
+ };
+
+ std::array<THolder<TPrefixTreeBotInfo>, robotstxtcfg::max_botid> BotIdToPrefixTreeBotInfo;
+
+ TPrefixTreeBotInfo& GetInfo(ui32 botId);
+ static bool CheckRule(const char* s, const char* rule);
+ void ClearInternal(ui32 botId) override;
+ bool OptimizeSize() override;
+
+private:
+ void SortRules(TPrefixTreeBotInfo& prefixBotInfo, size_t count, const TArrayHolder<TRuleInfo>* ruleInfos);
+ bool HasDisallowRulePrevAllowAll(const TPrefixTreeBotInfo& prefixBotInfo, int ruleAllAllow);
+ int FindRuleAll(const TPrefixTreeBotInfo& prefixBotInfo, char neededType);
+
+public:
+ TPrefixTreeRobotsTxtRulesHandler(
+ TBotIdSet supportedBotIds = robotstxtcfg::defaultSupportedBotIds,
+ int robotsMaxSize = robots_max,
+ int maxRulesCount = -1,
+ bool saveDataForAnyBot = true);
+
+ TPrefixTreeRobotsTxtRulesHandler(
+ std::initializer_list<ui32> supportedBotIds,
+ int robotsMaxSize = robots_max,
+ int maxRulesCount = -1,
+ bool saveDataForAnyBot = true);
+
+ TPrefixTreeRobotsTxtRulesHandler(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize = robots_max,
+ int maxRulesCount = -1,
+ bool saveDataForAnyBot = true);
+
+ void Clear() override;
+ void AfterParse(ui32 botId) override;
+ bool IsAllowAll(ui32 botId) const override;
+ bool IsAllowAll() const override;
+ bool IsDisallowAll(ui32 botId, bool useAny = true) const override;
+ bool IsDisallowAll() const override;
+ const char* IsDisallow(ui32 botId, const char* s, bool useAny = true) const override;
+ const char* IsAllow(ui32 botId, const char* s) const override;
+ TRobotsTxtRulesIterator GetRulesIterator(ui32 botId) const override;
+ void DoAllowAll() override;
+ void DoDisallowAll() override;
+ bool Empty(ui32 botId) const override;
+
+ void LoadPacked(const char* botsData, const char* botsDataEnd = nullptr) override;
+ size_t GetPacked(const char*& data) const override;
+ void Dump(ui32 botId, FILE* logFile) override;
+ void Dump(ui32 botId, IOutputStream& out) override;
+ size_t GetMemorySize();
+};
+
+using TRobotsTxt = TPrefixTreeRobotsTxtRulesHandler;
+
+void TRobotsTxtRulesHandlerBase::ClearAllExceptCrossSection(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, ui32 botId) {
+ rulesHandler->ClearInternal(botId);
+ if (botId == robotstxtcfg::id_anybot) {
+        // sitemaps, clean-params and the Host directive were removed from the prefix tree above, so re-add them here
+ for (const auto& sitemap : rulesHandler->SiteMaps) {
+ rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, sitemap, 'S', parser);
+ }
+ for (const auto& param : rulesHandler->CleanParams) {
+ rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, param, 'P', parser);
+ }
+ if (!rulesHandler->HostDirective.empty()) {
+ rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, rulesHandler->HostDirective, 'H', parser);
+ }
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::CheckRobotsLines(TRobotsTxtRulesHandlerBase* rulesHandler, TVector<int>& nonRobotsLines) {
+ if (rulesHandler->IsHandlingErrors()) {
+ for (size_t i = 0; i < nonRobotsLines.size(); ++i)
+ rulesHandler->AddError(ERROR_TRASH, nonRobotsLines[i]);
+ nonRobotsLines.clear();
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::CheckAsterisk(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber, bool& wasAsterisk) {
+ if (strcmp(value, "*") == 0) {
+ if (wasAsterisk)
+ rulesHandler->AddError(ERROR_ASTERISK_MULTI, lineNumber);
+ wasAsterisk = true;
+ }
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckWasUserAgent(TRobotsTxtRulesHandlerBase* rulesHandler, bool wasUserAgent, bool& ruleBeforeUserAgent, bool& wasRule, ui32 lineNumber) {
+ if (wasUserAgent) {
+ wasRule = true;
+ return false;
+ }
+ if (!ruleBeforeUserAgent) {
+ ruleBeforeUserAgent = true;
+ rulesHandler->AddError(ERROR_RULE_BEFORE_USER_AGENT, lineNumber);
+ }
+ return true;
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckRuleNotSlash(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, ui32 lineNumber) {
+ if (*value && *value != '/' && *value != '*') {
+ rulesHandler->AddError(ERROR_RULE_NOT_SLASH, lineNumber);
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckSupportedBots(
+ const TBotIdSet& currentBotIds,
+ TBotIdSet& wasRuleForBot,
+ const TBotIdSet& isSupportedBot)
+{
+ bool hasAtLeastOneSupportedBot = false;
+ for (ui32 currentBotId : currentBotIds) {
+ wasRuleForBot.insert(currentBotId);
+ hasAtLeastOneSupportedBot = hasAtLeastOneSupportedBot || isSupportedBot.contains(currentBotId);
+ }
+ return hasAtLeastOneSupportedBot;
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckEmptyRule(TRobotsTxtRulesHandlerBase* rulesHandler, const char* value, EDirectiveType& type, ui32 lineNumber) {
+ if (value && strlen(value) == 0) {
+ rulesHandler->AddError(WARNING_EMPTY_RULE, lineNumber);
+ type = type == ALLOW ? DISALLOW : ALLOW;
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::AddRules(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const char* value,
+ char type,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot)
+{
+ for (ui32 currentBotId : currentBotIds) {
+ if (!isSupportedBot.contains(currentBotId))
+ continue;
+ if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, type, parser))
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::ProcessSitemap(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, const char* value, const char* host) {
+ TString modifiedUrl;
+ if (!CheckSitemapUrl(value, host, modifiedUrl))
+ rulesHandler->AddError(ERROR_SITEMAP_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->AddSiteMap(modifiedUrl.data());
+ if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, modifiedUrl.data(), 'S', parser))
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::ProcessCleanParam(TRobotsTxtRulesHandlerBase* rulesHandler, TRobotsTxtParser& parser, TString& value) {
+ if (!CheckAndNormCleanParam(value))
+ rulesHandler->AddError(ERROR_CLEAN_PARAM_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->AddCleanParam(value.data());
+ if (!rulesHandler->AddRuleWithErrorCheck(robotstxtcfg::id_anybot, value.data(), 'P', parser))
+ return true;
+ }
+ return false;
+}
+
+bool TRobotsTxtRulesHandlerBase::ProcessCrawlDelay(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ const TBotIdSet& isSupportedBot,
+ const char* value) {
+ for (ui32 currentBotId : currentBotIds) {
+ if (!isSupportedBot.contains(currentBotId))
+ continue;
+ if (rulesHandler->BotIdToInfo[currentBotId].CrawlDelay >= 0) {
+ rulesHandler->AddError(ERROR_CRAWL_DELAY_MULTI, parser.GetLineNumber());
+ break;
+ }
+ int crawlDelay = -1;
+ if (!ParseCrawlDelay(value, crawlDelay))
+ rulesHandler->AddError(ERROR_CRAWL_DELAY_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->SetCrawlDelay(currentBotId, crawlDelay);
+ if (!rulesHandler->AddRuleWithErrorCheck(currentBotId, value, 'C', parser))
+ return true;
+ }
+ }
+ return false;
+}
+
+void TRobotsTxtRulesHandlerBase::ProcessUserAgent(
+ TRobotsTxtRulesHandlerBase* rulesHandler,
+ TRobotsTxtParser& parser,
+ const TBotIdSet& currentBotIds,
+ TBotIdSet& wasSupportedBot,
+ TBotIdSet& isSupportedBot,
+ TVector<ui32>& botIdToMaxAppropriateUserAgentNameLength,
+ const char* value)
+{
+ ui32 userAgentNameLength = (ui32)strlen(value);
+
+ for (ui32 currentBotId : currentBotIds) {
+ bool userAgentNameLonger = userAgentNameLength > botIdToMaxAppropriateUserAgentNameLength[currentBotId];
+ bool userAgentNameSame = userAgentNameLength == botIdToMaxAppropriateUserAgentNameLength[currentBotId];
+
+ if (!wasSupportedBot.contains(currentBotId) || userAgentNameLonger)
+ ClearAllExceptCrossSection(parser, rulesHandler, currentBotId);
+
+ wasSupportedBot.insert(currentBotId);
+ if (userAgentNameLonger || userAgentNameSame) {
+ isSupportedBot.insert(currentBotId); // Allow multiple blocks for the same user agent
+ }
+ botIdToMaxAppropriateUserAgentNameLength[currentBotId] = Max(userAgentNameLength, botIdToMaxAppropriateUserAgentNameLength[currentBotId]);
+ }
+}
+
+template <class THostHandler>
+int TRobotsTxtRulesHandlerBase::ParseRules(TRobotsTxtParser& parser, TRobotsTxtRulesHandlerBase* rulesHandler, THostHandler* hostHandler, const char* host) {
+ rulesHandler->Clear();
+
+ TBotIdSet wasSupportedBot;
+ TBotIdSet wasRuleForBot;
+ bool wasAsterisk = false;
+ TVector<int> nonRobotsLines;
+ TVector<ui32> botIdToMaxAppropriateUserAgentNameLength(robotstxtcfg::max_botid, 0);
+ static char all[] = "/";
+ EDirectiveType prevType = USER_AGENT;
+ while (parser.HasRecord()) {
+ TRobotsTxtRulesRecord record = parser.NextRecord();
+ bool wasUserAgent = false;
+ bool isRobotsRecordUseful = false;
+ TBotIdSet isSupportedBot;
+ TBotIdSet currentBotIds;
+ TString field;
+ TString value;
+ bool ruleBeforeUserAgent = false;
+ int ret = 0;
+ bool wasRule = false;
+ bool wasBlank = false;
+ while (record.NextPair(field, value, isRobotsRecordUseful && rulesHandler->IsHandlingErrors(), nonRobotsLines, &wasBlank)) {
+ CheckRobotsLines(rulesHandler, nonRobotsLines);
+ EDirectiveType type = NameToDirType(field.data());
+ EDirectiveType typeBeforeChange = type;
+
+ if ((prevType != type || wasBlank) && type == USER_AGENT) {
+ currentBotIds.clear();
+ }
+ prevType = type;
+
+ switch (type) {
+ case USER_AGENT:
+ if (wasUserAgent && wasRule) {
+ wasRule = false;
+ currentBotIds.clear();
+ isSupportedBot.clear();
+ }
+ wasUserAgent = true;
+ value.to_lower();
+ CheckAsterisk(rulesHandler, value.data(), parser.GetLineNumber(), wasAsterisk);
+ isRobotsRecordUseful = rulesHandler->CheckRobot(value.data(), currentBotIds, &botIdToMaxAppropriateUserAgentNameLength);
+ if (isRobotsRecordUseful)
+ ProcessUserAgent(rulesHandler, parser, currentBotIds, wasSupportedBot, isSupportedBot, botIdToMaxAppropriateUserAgentNameLength, value.data());
+ break;
+
+ case DISALLOW:
+ case ALLOW:
+ if (CheckWasUserAgent(rulesHandler, wasUserAgent, ruleBeforeUserAgent, wasRule, parser.GetLineNumber()))
+ break;
+ if (CheckRuleNotSlash(rulesHandler, value.data(), parser.GetLineNumber()))
+ break;
+ CheckRule(value.data(), parser.GetLineNumber(), rulesHandler);
+ if (!CheckSupportedBots(currentBotIds, wasRuleForBot, isSupportedBot)) {
+ break;
+ }
+ if (CheckEmptyRule(rulesHandler, value.data(), type, parser.GetLineNumber())) {
+ value = all;
+ if (typeBeforeChange == ALLOW)
+ continue;
+ }
+
+ if (AddRules(rulesHandler, parser, value.data(), type == ALLOW ? 'A' : 'D', currentBotIds, isSupportedBot))
+ return 2;
+ break;
+
+ case HOST:
+ value.to_lower();
+ ret = hostHandler->OnHost(robotstxtcfg::id_anybot, parser, value.data(), rulesHandler);
+ if (ret)
+ return ret;
+ break;
+
+ case SITEMAP:
+ if (ProcessSitemap(rulesHandler, parser, value.data(), host))
+ return 2;
+ break;
+
+ case CLEAN_PARAM:
+ if (ProcessCleanParam(rulesHandler, parser, value))
+ return 2;
+ break;
+
+ case CRAWL_DELAY:
+ if (ProcessCrawlDelay(rulesHandler, parser, currentBotIds, isSupportedBot, value.data()))
+ return 2;
+ break;
+
+ default:
+ rulesHandler->AddError(WARNING_UNKNOWN_FIELD, parser.GetLineNumber());
+ break;
+ }
+ bool isCrossSection = type == SITEMAP || type == HOST || type == CLEAN_PARAM;
+ if (rulesHandler->IsHandlingErrors() && (isRobotsRecordUseful || isCrossSection))
+ rulesHandler->AddAcceptedLine(parser.GetLineNumber(), currentBotIds, isCrossSection);
+ }
+ }
+
+ for (auto botId : wasSupportedBot) {
+ rulesHandler->LoadedBotIds.insert(botId);
+ if (rulesHandler->IsBotIdSupported(botId))
+ rulesHandler->AfterParse(botId);
+ }
+
+ if (!rulesHandler->OptimizeSize()) {
+ return 2;
+ }
+
+ return 1;
+}
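
For context, a minimal usage sketch of the handler declared above (illustrative only, not part of the commit). It assumes the umbrella header robots_txt.h aggregates these classes, that ParseRules is a static member as its out-of-class definition suggests, and that the handler itself can serve as the host handler because the base class provides OnHost with the expected signature.

    // Sketch, not committed code: parse a robots.txt body and query one path.
    #include <library/cpp/robots_txt/robots_txt.h>
    #include <util/stream/output.h>
    #include <util/stream/str.h>

    void ExampleParse() {
        TString text =
            "User-Agent: *\n"
            "Disallow: /private\n";
        TStringInput input(text);
        TRobotsTxtParser parser(input);

        TRobotsTxt handler({robotstxtcfg::id_yandexbot});
        TRobotsTxtRulesHandlerBase::ParseRules(parser, &handler, &handler, "example.com");

        // IsDisallow returns the matching rule, or nullptr if the path is allowed.
        if (handler.IsDisallow(robotstxtcfg::id_yandexbot, "/private/page.html")) {
            Cout << "disallowed" << Endl;
        }
    }
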
diff --git a/library/cpp/robots_txt/robots_txt_parser.cpp b/library/cpp/robots_txt/robots_txt_parser.cpp
new file mode 100644
index 0000000000..8e2fe6073d
--- /dev/null
+++ b/library/cpp/robots_txt/robots_txt_parser.cpp
@@ -0,0 +1,116 @@
+#include "robots_txt_parser.h"
+#include <util/generic/string.h>
+#include <util/stream/output.h>
+
+TRobotsTxtParser::TRobotsTxtParser(IInputStream& inputStream)
+ : InputStream(inputStream)
+ , LineNumber(0)
+ , IsLastSymbolCR(false)
+{
+}
+
+int TRobotsTxtParser::GetLineNumber() {
+ return LineNumber;
+}
+
+const char* TRobotsTxtParser::ReadLine() {
+ Line = "";
+ char c;
+
+ if (IsLastSymbolCR) {
+ if (!InputStream.ReadChar(c))
+ return nullptr;
+ if (c != '\n')
+ Line.append(c);
+ }
+
+ bool hasMoreSymbols;
+ while (hasMoreSymbols = InputStream.ReadChar(c)) {
+ if (c == '\r') {
+ IsLastSymbolCR = true;
+ break;
+ } else {
+ IsLastSymbolCR = false;
+ if (c == '\n')
+ break;
+ Line.append(c);
+ }
+ }
+ if (!hasMoreSymbols && Line.empty())
+ return nullptr;
+
+ // BOM UTF-8: EF BB BF
+ if (0 == LineNumber && Line.size() >= 3 && Line[0] == '\xEF' && Line[1] == '\xBB' && Line[2] == '\xBF')
+ Line = Line.substr(3, Line.size() - 3);
+
+ ++LineNumber;
+ int i = Line.find('#');
+ if (i == 0)
+ Line = "";
+ else if (i > 0)
+ Line = Line.substr(0, i);
+ return Line.data();
+}
+
+bool TRobotsTxtParser::IsBlankLine(const char* s) {
+ for (const char* p = s; *p; ++p)
+ if (!isspace(*p))
+ return 0;
+ return 1;
+}
+
+char* TRobotsTxtParser::Trim(char* s) {
+ while (isspace(*s))
+ ++s;
+ char* p = s + strlen(s) - 1;
+ while (s < p && isspace(*p))
+ --p;
+ *(p + 1) = 0;
+ return s;
+}
+
+inline bool TRobotsTxtParser::IsRobotsLine(const char* s) {
+ return strchr(s, ':');
+}
+
+bool TRobotsTxtParser::HasRecord() {
+ while (!IsRobotsLine(Line.data()))
+ if (!ReadLine())
+ return 0;
+ return 1;
+}
+
+TRobotsTxtRulesRecord TRobotsTxtParser::NextRecord() {
+ return TRobotsTxtRulesRecord(*this);
+}
+
+TRobotsTxtRulesRecord::TRobotsTxtRulesRecord(TRobotsTxtParser& parser)
+ : Parser(parser)
+{
+}
+
+bool TRobotsTxtRulesRecord::NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank) {
+ if (wasBlank) {
+ *wasBlank = false;
+ }
+ while (!Parser.IsRobotsLine(Parser.Line.data())) {
+ if (!Parser.ReadLine())
+ return 0;
+ if (Parser.IsBlankLine(Parser.Line.data())) {
+ if (wasBlank) {
+ *wasBlank = true;
+ }
+ continue;
+ }
+ if (handleErrors && !Parser.IsRobotsLine(Parser.Line.data()))
+ nonRobotsLines.push_back(Parser.GetLineNumber());
+ }
+
+ char* s = strchr(Parser.Line.begin(), ':');
+ *s = 0;
+ char* p = s + 1;
+
+ field = TRobotsTxtParser::Trim(strlwr(Parser.Line.begin()));
+ value = TRobotsTxtParser::Trim(p);
+ return 1;
+}
diff --git a/library/cpp/robots_txt/robots_txt_parser.h b/library/cpp/robots_txt/robots_txt_parser.h
new file mode 100644
index 0000000000..8032d0d20b
--- /dev/null
+++ b/library/cpp/robots_txt/robots_txt_parser.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <algorithm>
+#include <util/generic/string.h>
+#include <util/generic/vector.h>
+#include <util/stream/input.h>
+
+class TRobotsTxtParser;
+
+class TRobotsTxtRulesRecord {
+private:
+ TRobotsTxtParser& Parser;
+
+public:
+ TRobotsTxtRulesRecord(TRobotsTxtParser& parser);
+ bool NextPair(TString& field, TString& value, bool handleErrors, TVector<int>& nonRobotsLines, bool* wasBlank = nullptr);
+};
+
+class TRobotsTxtParser {
+ friend class TRobotsTxtRulesRecord;
+
+private:
+ IInputStream& InputStream;
+ TString Line;
+ int LineNumber;
+ bool IsLastSymbolCR;
+
+ const char* ReadLine();
+ static bool IsBlankLine(const char*);
+ static bool IsRobotsLine(const char*);
+
+public:
+ static char* Trim(char*);
+ TRobotsTxtParser(IInputStream& inputStream);
+ bool HasRecord();
+ TRobotsTxtRulesRecord NextRecord();
+ int GetLineNumber();
+};
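
A short sketch of the record/pair loop these two classes provide (illustrative only): HasRecord skips ahead to the next "field: value" line, NextRecord wraps the parser, and NextPair yields the field (lowercased) and value (trimmed) until the record ends.

    // Sketch, not committed code: dump every field/value pair from a robots.txt stream.
    #include <library/cpp/robots_txt/robots_txt_parser.h>
    #include <util/stream/output.h>
    #include <util/stream/str.h>

    void DumpPairs() {
        TString text = "User-Agent: *\r\nDisallow: /tmp\n# comment\nAllow: /tmp/ok\n";
        TStringInput input(text);
        TRobotsTxtParser parser(input);

        TVector<int> nonRobotsLines;
        TString field, value;
        while (parser.HasRecord()) {
            TRobotsTxtRulesRecord record = parser.NextRecord();
            while (record.NextPair(field, value, /*handleErrors=*/ false, nonRobotsLines)) {
                Cout << field << " = " << value << Endl; // e.g. "user-agent = *"
            }
        }
    }
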
diff --git a/library/cpp/robots_txt/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg.h
new file mode 100644
index 0000000000..5ca1682a0c
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#include <library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h>
diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.darwin-x86_64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..09cfd4b3f1
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-robots_txt-robotstxtcfg)
+target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-case_insensitive_string
+)
+target_sources(cpp-robots_txt-robotstxtcfg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
+)
diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-aarch64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..6fe7e7a7ad
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,21 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-robots_txt-robotstxtcfg)
+target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-case_insensitive_string
+)
+target_sources(cpp-robots_txt-robotstxtcfg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
+)
diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-x86_64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..6fe7e7a7ad
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,21 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-robots_txt-robotstxtcfg)
+target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-case_insensitive_string
+)
+target_sources(cpp-robots_txt-robotstxtcfg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
+)
diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/robots_txt/robotstxtcfg/CMakeLists.windows-x86_64.txt b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..09cfd4b3f1
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(cpp-robots_txt-robotstxtcfg)
+target_link_libraries(cpp-robots_txt-robotstxtcfg PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-case_insensitive_string
+)
+target_sources(cpp-robots_txt-robotstxtcfg PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
+ ${CMAKE_SOURCE_DIR}/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
+)
diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
new file mode 100644
index 0000000000..aec668582c
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.cpp
@@ -0,0 +1,2 @@
+#include "bot_id_set.h"
+// header compile test
diff --git a/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h
new file mode 100644
index 0000000000..08aaa68a50
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/bot_id_set.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include "user_agents.h"
+
+#include <bitset>
+
+
+/// Simple bitset-based set of bot ids, meant to keep memory use and lookups cheap
+class TBotIdSet
+{
+public:
+ using TData = std::bitset<robotstxtcfg::max_botid>;
+
+ constexpr TBotIdSet() noexcept = default;
+ constexpr TBotIdSet(const TBotIdSet&) noexcept = default;
+ constexpr TBotIdSet(TBotIdSet&&) noexcept = default;
+ constexpr TBotIdSet& operator = (const TBotIdSet&) noexcept = default;
+ constexpr TBotIdSet& operator = (TBotIdSet&&) noexcept = default;
+
+ TBotIdSet(std::initializer_list<ui32> botIds) {
+ for (auto id : botIds) {
+ insert(id);
+ }
+ }
+
+ static TBotIdSet All() noexcept {
+ TBotIdSet res;
+ res.Bots.set();
+ return res;
+ }
+
+ constexpr bool contains(ui32 botId) const noexcept {
+ return (botId < Bots.size()) && Bots[botId];
+ }
+
+ bool insert(ui32 botId) noexcept {
+ if (botId >= Bots.size() || Bots[botId]) {
+ return false;
+ }
+ Bots[botId] = true;
+ return true;
+ }
+
+ bool remove(ui32 botId) noexcept {
+ if (botId >= Bots.size() || !Bots[botId]) {
+ return false;
+ }
+ Bots[botId] = false;
+ return true;
+ }
+
+ void clear() noexcept {
+ Bots.reset();
+ }
+
+ size_t size() const noexcept {
+ return Bots.count();
+ }
+
+ bool empty() const noexcept {
+ return Bots.none();
+ }
+
+ bool operator==(const TBotIdSet& rhs) const noexcept = default;
+
+ TBotIdSet operator&(TBotIdSet rhs) const noexcept {
+ rhs.Bots &= Bots;
+ return rhs;
+ }
+
+ TBotIdSet operator|(TBotIdSet rhs) const noexcept {
+ rhs.Bots |= Bots;
+ return rhs;
+ }
+
+ TBotIdSet operator~() const noexcept {
+ TBotIdSet result;
+ result.Bots = ~Bots;
+ return result;
+ }
+
+ class iterator
+ {
+ public:
+ auto operator * () const noexcept {
+ return BotId;
+ }
+
+        iterator& operator ++ () noexcept {
+            // advance to the next set bit, stopping at Bots.size() (== end) without indexing past the last bit
+            while (BotId < Bots.size()) {
+                ++BotId;
+                if (BotId == Bots.size() || Bots[BotId]) {
+                    break;
+                }
+            }
+            return *this;
+        }
+
+ bool operator == (const iterator& rhs) const noexcept {
+ return (&Bots == &rhs.Bots) && (BotId == rhs.BotId);
+ }
+
+ bool operator != (const iterator& rhs) const noexcept {
+ return !(*this == rhs);
+ }
+
+ private:
+ friend class TBotIdSet;
+ iterator(const TData& bots, ui32 botId)
+ : Bots(bots)
+ , BotId(botId)
+ {
+ while (BotId < Bots.size() && !Bots[BotId]) {
+ ++BotId;
+ }
+ }
+
+ private:
+ const TData& Bots;
+ ui32 BotId;
+ };
+
+ iterator begin() const noexcept {
+ return {Bots, robotstxtcfg::id_anybot};
+ }
+
+ iterator end() const noexcept {
+ return {Bots, robotstxtcfg::max_botid};
+ }
+
+private:
+ TData Bots {};
+};
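
A small sketch of how the set behaves (illustrative only): it is a fixed-size bitset over all bot ids, so set algebra and membership tests are cheap and iteration visits set bits in increasing id order.

    // Sketch, not committed code.
    #include <library/cpp/robots_txt/robotstxtcfg/bot_id_set.h>
    #include <util/stream/output.h>

    void BotIdSetDemo() {
        TBotIdSet media = {robotstxtcfg::id_yandeximagesbot, robotstxtcfg::id_yandexvideobot};
        TBotIdSet rest = ~media;

        Cout << (media.contains(robotstxtcfg::id_googlebot) ? "yes" : "no") << Endl; // no
        Cout << (media & TBotIdSet::All()).size() << Endl;                           // 2
        Cout << rest.size() << Endl;                                                 // max_botid - 2

        for (ui32 botId : media) {          // visits ids in increasing order
            Cout << botId << Endl;          // 3, then 18
        }
    }
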
diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
new file mode 100644
index 0000000000..c5652b81c5
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.cpp
@@ -0,0 +1,2 @@
+#include "robotstxtcfg.h"
+// header compile test
diff --git a/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h
new file mode 100644
index 0000000000..2cf9430d7c
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/robotstxtcfg.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "bot_id_set.h"
+
+
+namespace robotstxtcfg {
+
+static const TBotIdSet defaultSupportedBotIds = {id_defbot};
+static const TBotIdSet allSupportedBotIds = TBotIdSet::All();
+
+} // namespace robotstxtcfg
diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
new file mode 100644
index 0000000000..60b353a427
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.cpp
@@ -0,0 +1,2 @@
+#include "user_agents.h"
+// header compile test
diff --git a/library/cpp/robots_txt/robotstxtcfg/user_agents.h b/library/cpp/robots_txt/robotstxtcfg/user_agents.h
new file mode 100644
index 0000000000..59245d07cb
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/user_agents.h
@@ -0,0 +1,303 @@
+#pragma once
+
+#include <library/cpp/case_insensitive_string/case_insensitive_string.h>
+
+
+namespace robotstxtcfg {
+ // robots.txt agents and identifiers
+
+ enum EBots : ui32 {
+ id_anybot = 0,
+ id_yandexbot = 1,
+ id_yandexmediabot = 2,
+ id_yandeximagesbot = 3,
+ id_googlebot = 4,
+ id_yandexbotmirr = 5,
+ id_yahooslurp = 6,
+ id_msnbot = 7,
+ id_yandexcatalogbot = 8,
+ id_yandexdirectbot = 9,
+ id_yandexblogsbot = 10,
+ id_yandexnewsbot = 11,
+ id_yandexpagechk = 12,
+ id_yandexmetrikabot = 13,
+ id_yandexbrowser = 14,
+ id_yandexmarketbot = 15,
+ id_yandexcalendarbot = 16,
+ id_yandexwebmasterbot = 17,
+ id_yandexvideobot = 18,
+ id_yandeximageresizerbot = 19,
+ id_yandexadnetbot = 20,
+ id_yandexpartnerbot = 21,
+ id_yandexdirectdbot = 22,
+ id_yandextravelbot = 23,
+ id_yandexmobilebot = 24,
+ id_yandexrcabot = 25,
+ id_yandexdirectdynbot = 26,
+ id_yandexmobilebot_ed = 27,
+ id_yandexaccessibilitybot = 28,
+ id_baidubot = 29,
+ id_yandexscreenshotbot = 30,
+ id_yandexmetrikayabs = 31,
+ id_yandexvideoparserbot = 32,
+ id_yandexnewsbot4 = 33,
+ id_yandexmarketbot2 = 34,
+ id_yandexmedianabot = 35,
+ id_yandexsearchshopbot = 36,
+ id_yandexontodbbot = 37,
+ id_yandexontodbapibot = 38,
+ id_yandexampbot = 39,
+ id_yandexvideohosting = 40,
+ id_yandexmediaselling = 41,
+ id_yandexverticals = 42,
+ id_yandexturbobot = 43,
+ id_yandexzenbot = 44,
+ id_yandextrackerbot = 45,
+ id_yandexmetrikabot4 = 46,
+ id_yandexmobilescreenshotbot = 47,
+ id_yandexfaviconsbot = 48,
+ id_yandexrenderresourcesbot = 49,
+ id_yandexactivity = 50,
+ max_botid
+ };
+
+ static const ui32 id_defbot = id_yandexbot;
+
+ struct TBotInfo {
+ TCaseInsensitiveStringBuf ReqPrefix;
+ TCaseInsensitiveStringBuf FullName;
+ TStringBuf FromField = {};
+ TStringBuf UserAgent = {};
+ TStringBuf RotorUserAgent = {};
+ bool ExplicitDisallow = false;
+ };
+
+ static constexpr TStringBuf UserAgentFrom("support@search.yandex.ru");
+
+ static constexpr TBotInfo BotInfoArr[] = {
+ {"*", "*"},
+ {"Yandex", "YandexBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexMedia/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexImages/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Google", "GoogleBot"},
+ {"Yandex", "YandexBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexBot/3.0; MirrorDetector; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Slurp", "Slurp"},
+ {"msn", "msnbot"},
+ {"Yandex", "YandexCatalog/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+
+ {"Yandex", "YandexBlogs/0.99", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexNews/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexNews/3.0; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexPagechecker/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexPagechecker/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexMetrika/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMetrika/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexBrowser/1.0", UserAgentFrom,
+ "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5",
+ "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) YaBrowser/1.0.1084.5402 Chrome/19.0.1084.5409 Safari/536.5",
+ false},
+ {"Yandex", "YandexMarket/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"YandexCalendar", "YandexCalendar/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexCalendar/1.0 +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"Yandex", "YandexWebmaster/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexVideo/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexImageResizer/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+
+ {"YandexDirect", "YandexDirect/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexPartner", "YandexPartner/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexPartner/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YaDirectFetcher", "YaDirectFetcher/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YaDirectFetcher/1.0; Dyatel; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"Yandex", "YandexTravel/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexTravel/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"Yandex", "YandexBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ false},
+ {"YandexRCA", "YandexRCA/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexRCA/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexDirectDyn", "YandexDirectDyn/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexDirectDyn/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexMobileBot", "YandexMobileBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)",
+ true},
+ {"YandexAccessibilityBot", "YandexAccessibilityBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexAccessibilityBot/3.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"Baidu", "Baiduspider"},
+
+ {"YandexScreenshotBot", "YandexScreenshotBot/3.0", UserAgentFrom,
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexScreenshotBot/3.0; +http://yandex.com/bots)",
+ true},
+ {"YandexMetrika", "YandexMetrika/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01)",
+ "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots yabs01) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexVideoParser", "YandexVideoParser/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVideoParser/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVideoParser/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"Yandex", "YandexNews/4.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexNews/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexMarket", "YandexMarket/2.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMarket/2.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexMedianaBot", "YandexMedianaBot/1.0", UserAgentFrom,
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 (compatible; YandexMedianaBot/1.0; +http://yandex.com/bots)",
+ true},
+ {"YandexSearchShop", "YandexSearchShop/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexSearchShop/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"Yandex", "YandexOntoDB/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexOntoDB/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ false},
+ {"YandexOntoDBAPI", "YandexOntoDBAPI/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexOntoDBAPI/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"Yandex-AMPHTML", "Yandex-AMPHTML", UserAgentFrom,
+ "Mozilla/5.0 (compatible; Yandex-AMPHTML; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; Yandex-AMPHTML; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+
+ {"YandexVideoHosting", "YandexVideoHosting/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVideoHosting/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexMediaSelling", "YandexMediaSelling/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMediaSelling/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexVerticals", "YandexVerticals/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexVerticals/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexTurbo", "YandexTurbo/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexTurbo/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexZenRss", "YandexZenRss/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexZenRss/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexTracker", "YandexTracker/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexTracker/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexMetrika", "YandexMetrika/4.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexMetrika/4.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexMetrika/4.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexMobileScreenShotBot", "YandexMobileScreenShotBot/1.0", UserAgentFrom,
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/11.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
+ true},
+ {"YandexFavicons", "YandexFavicons/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexRenderResourcesBot", "YandexRenderResourcesBot/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexRenderResourcesBot/1.0; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexRenderResourcesBot/1.0; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true},
+ {"YandexActivity", "YandexActivity/1.0", UserAgentFrom,
+ "Mozilla/5.0 (compatible; YandexActivity; robot; +http://yandex.com/bots)",
+ "Mozilla/5.0 (compatible; YandexActivity; robot; +http://yandex.com/bots) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0",
+ true}
+ };
+
+ static_assert(std::size(BotInfoArr) == max_botid);
+
+ constexpr auto GetReqPrefix(ui32 botId) {
+ return BotInfoArr[botId].ReqPrefix;
+ }
+
+ constexpr auto GetFullName(ui32 botId) {
+ return BotInfoArr[botId].FullName;
+ }
+
+ constexpr auto GetFromField(ui32 botId) {
+ return BotInfoArr[botId].FromField;
+ }
+
+ constexpr auto GetUserAgent(ui32 botId) {
+ return BotInfoArr[botId].UserAgent;
+ }
+
+ constexpr auto GetRotorUserAgent(ui32 botId) {
+ return BotInfoArr[botId].RotorUserAgent;
+ }
+
+ constexpr bool IsExplicitDisallow(ui32 botId) {
+ return BotInfoArr[botId].ExplicitDisallow;
+ }
+
+ constexpr bool IsYandexBotId(ui32 botId) {
+ return !BotInfoArr[botId].UserAgent.empty();
+ }
+
+} // namespace robotstxtcfg
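
The table above is indexed by the EBots enumerators, and every accessor is constexpr, so bot properties can be checked at compile time; a short sketch (illustrative only).

    // Sketch, not committed code: compile-time checks against the table above.
    #include <library/cpp/robots_txt/robotstxtcfg/user_agents.h>

    static_assert(robotstxtcfg::IsYandexBotId(robotstxtcfg::id_yandexbot));
    static_assert(!robotstxtcfg::IsYandexBotId(robotstxtcfg::id_anybot));        // the "*" entry has no UserAgent
    static_assert(robotstxtcfg::IsExplicitDisallow(robotstxtcfg::id_yandexdirectbot));
    static_assert(!robotstxtcfg::IsExplicitDisallow(robotstxtcfg::id_yandexbot));

    // At runtime the same table yields the request prefix and full User-Agent strings, e.g.
    // robotstxtcfg::GetUserAgent(robotstxtcfg::id_yandexbot) is
    // "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)".
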
diff --git a/library/cpp/robots_txt/robotstxtcfg/ya.make b/library/cpp/robots_txt/robotstxtcfg/ya.make
new file mode 100644
index 0000000000..61c731be42
--- /dev/null
+++ b/library/cpp/robots_txt/robotstxtcfg/ya.make
@@ -0,0 +1,13 @@
+LIBRARY()
+
+SRCS(
+ bot_id_set.cpp
+ robotstxtcfg.cpp
+ user_agents.cpp
+)
+
+PEERDIR(
+ library/cpp/case_insensitive_string
+)
+
+END()
diff --git a/library/cpp/robots_txt/rules_handler.cpp b/library/cpp/robots_txt/rules_handler.cpp
new file mode 100644
index 0000000000..4297db9d21
--- /dev/null
+++ b/library/cpp/robots_txt/rules_handler.cpp
@@ -0,0 +1,514 @@
+#include "robots_txt.h"
+#include "constants.h"
+
+#include <library/cpp/uri/http_url.h>
+#include <library/cpp/charset/ci_string.h>
+#include <library/cpp/string_utils/url/url.h>
+#include <util/system/maxlen.h>
+#include <util/generic/yexception.h>
+#include <util/generic/algorithm.h>
+
+
+namespace {
+
+TBotIdSet ConvertBotIdSet(const TSet<ui32>& botIds) noexcept {
+ TBotIdSet result;
+ for (auto id : botIds) {
+ result.insert(id);
+ }
+ return result;
+}
+
+} // namespace
+
+TRobotsTxtRulesIterator::TRobotsTxtRulesIterator(const char* begin, const char* end)
+ : Begin(begin)
+ , End(end)
+{
+}
+
+void TRobotsTxtRulesIterator::Next() {
+ while (Begin < End && *Begin)
+ ++Begin;
+ while (Begin < End && !isalpha(*Begin))
+ ++Begin;
+}
+
+bool TRobotsTxtRulesIterator::HasRule() const {
+ return Begin < End;
+}
+
+const char* TRobotsTxtRulesIterator::GetRule() const {
+ return Begin + 1;
+}
+
+TString TRobotsTxtRulesIterator::GetInitialRule() const {
+ auto begin = Begin + 1;
+ TStringBuf rule(begin, strlen(begin));
+
+ switch (*Begin) {
+ case 'a':
+ case 'd':
+ return rule.EndsWith('*') ? TString(rule.Chop(1)) : TString::Join(rule, '$');
+ default:
+ return TString(rule);
+ }
+}
+
+EDirectiveType TRobotsTxtRulesIterator::GetRuleType() const {
+ return CharToDirType(*Begin);
+}
+
+EDirectiveType TRobotsTxtRulesIterator::CharToDirType(char ch) {
+ switch (toupper(ch)) {
+ case 'A':
+ return ALLOW;
+ case 'C':
+ return CRAWL_DELAY;
+ case 'D':
+ return DISALLOW;
+ case 'H':
+ return HOST;
+ case 'P':
+ return CLEAN_PARAM;
+ case 'S':
+ return SITEMAP;
+ }
+ return UNKNOWN;
+}
+
+TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase(
+ TBotIdSet supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : HandleErrors(false)
+ , SiteMaps()
+ , CleanParams()
+ , HostDirective("")
+ , Errors()
+ , AcceptedLines()
+ , CrossSectionAcceptedLines()
+ , BotIdToInfo(robotstxtcfg::max_botid)
+ , RobotsMaxSize(robotsMaxSize)
+ , MaxRulesNumber(maxRulesNumber)
+ , SaveDataForAnyBot(saveDataForAnyBot)
+ , SupportedBotIds(supportedBotIds)
+{
+ Y_ENSURE(!supportedBotIds.empty());
+
+ if (RobotsMaxSize <= 0)
+ RobotsMaxSize = robots_max;
+ if (MaxRulesNumber <= 0)
+ MaxRulesNumber = max_rules_count;
+
+ ResetOptimized();
+}
+
+TRobotsTxtRulesHandlerBase::TRobotsTxtRulesHandlerBase(
+ const TSet<ui32>& supportedBotIds,
+ int robotsMaxSize,
+ int maxRulesNumber,
+ bool saveDataForAnyBot)
+ : TRobotsTxtRulesHandlerBase(ConvertBotIdSet(supportedBotIds), robotsMaxSize, maxRulesNumber, saveDataForAnyBot)
+{}
+
+TRobotsTxtRulesHandlerBase::~TRobotsTxtRulesHandlerBase() = default;
+
+void TRobotsTxtRulesHandlerBase::CheckBotIdValidity(const ui32 botId) const {
+ if (botId >= robotstxtcfg::max_botid || !IsBotIdSupported(botId))
+ ythrow yexception() << "robots.txt parser requested for invalid or unsupported botId = " << botId << Endl;
+}
+
+int TRobotsTxtRulesHandlerBase::GetCrawlDelay(const ui32 botId, bool* realInfo) const {
+ const auto id = GetMappedBotId(botId, false);
+ if (realInfo)
+ *realInfo = bool(id);
+ return BotIdToInfo[id.GetOrElse(robotstxtcfg::id_anybot)].CrawlDelay;
+}
+
+int TRobotsTxtRulesHandlerBase::GetMinCrawlDelay(int defaultCrawlDelay) const {
+ int res = INT_MAX;
+ bool useDefault = false;
+ for (ui32 botId = 0; botId < robotstxtcfg::max_botid; ++botId) {
+ if (robotstxtcfg::IsYandexBotId(botId) && IsBotIdSupported(botId) && !IsDisallowAll(botId)) {
+ bool realInfo;
+ int curCrawlDelay = GetCrawlDelay(botId, &realInfo);
+ if (realInfo) {
+ if (curCrawlDelay == -1) {
+ useDefault = true;
+ } else {
+ res = Min(res, curCrawlDelay);
+ }
+ }
+ }
+ }
+
+ if (useDefault && defaultCrawlDelay < res) {
+ return -1;
+ }
+
+ if (res == INT_MAX) {
+ res = GetCrawlDelay(robotstxtcfg::id_anybot);
+ }
+
+ return res;
+}
+
+void TRobotsTxtRulesHandlerBase::SetCrawlDelay(const ui32 botId, int crawlDelay) {
+ CheckBotIdValidity(botId);
+ BotIdToInfo[botId].CrawlDelay = crawlDelay;
+}
+
+const TVector<TString> TRobotsTxtRulesHandlerBase::GetSiteMaps() const {
+ return TVector<TString>(SiteMaps.begin(), SiteMaps.end());
+}
+
+void TRobotsTxtRulesHandlerBase::AddSiteMap(const char* sitemap) {
+ SiteMaps.insert(sitemap);
+}
+
+const TVector<TString> TRobotsTxtRulesHandlerBase::GetCleanParams() const {
+ return TVector<TString>(CleanParams.begin(), CleanParams.end());
+}
+
+void TRobotsTxtRulesHandlerBase::AddCleanParam(const char* cleanParam) {
+ CleanParams.insert(cleanParam);
+}
+
+const TString& TRobotsTxtRulesHandlerBase::GetHostDirective() const {
+ return HostDirective;
+}
+
+void TRobotsTxtRulesHandlerBase::SetHostDirective(const char* hostDirective) {
+ HostDirective = hostDirective;
+}
+
+const TRobotsTxtRulesHandlerBase::TErrorVector& TRobotsTxtRulesHandlerBase::GetErrors() const {
+ return Errors;
+}
+
+TVector<int> TRobotsTxtRulesHandlerBase::GetAcceptedLines(const ui32 botId) const {
+ TVector<int> ret;
+ for (size_t i = 0; i < CrossSectionAcceptedLines.size(); ++i)
+ ret.push_back(CrossSectionAcceptedLines[i]);
+
+ bool hasLinesForBotId = false;
+ for (size_t i = 0; i < AcceptedLines.size(); ++i) {
+ if (AcceptedLines[i].first == botId) {
+ hasLinesForBotId = true;
+ break;
+ }
+ }
+
+ for (size_t i = 0; i < AcceptedLines.size(); ++i) {
+ if (hasLinesForBotId && AcceptedLines[i].first == botId) {
+ ret.push_back(AcceptedLines[i].second);
+ } else if (!hasLinesForBotId && AcceptedLines[i].first == robotstxtcfg::id_anybot) {
+ ret.push_back(AcceptedLines[i].second);
+ }
+ }
+
+ Sort(ret.begin(), ret.end());
+
+ return ret;
+}
+
+void TRobotsTxtRulesHandlerBase::AddAcceptedLine(ui32 line, const TBotIdSet& botIds, bool isCrossSection) {
+ if (isCrossSection) {
+ CrossSectionAcceptedLines.push_back(line);
+ return;
+ }
+
+ for (auto botId : botIds) {
+ AcceptedLines.push_back(TBotIdAcceptedLine(botId, line));
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::SetErrorsHandling(bool handleErrors) {
+ HandleErrors = handleErrors;
+}
+
+bool TRobotsTxtRulesHandlerBase::IsHandlingErrors() const {
+ return HandleErrors;
+}
+
+EDirectiveType TRobotsTxtRulesHandlerBase::NameToDirType(const char* d) {
+ if (!strcmp("disallow", d))
+ return DISALLOW;
+ if (!strcmp("allow", d))
+ return ALLOW;
+ if (!strcmp("user-agent", d))
+ return USER_AGENT;
+ if (!strcmp("host", d))
+ return HOST;
+ if (!strcmp("sitemap", d))
+ return SITEMAP;
+ if (!strcmp("clean-param", d))
+ return CLEAN_PARAM;
+ if (!strcmp("crawl-delay", d))
+ return CRAWL_DELAY;
+ return UNKNOWN;
+}
+
+const char* TRobotsTxtRulesHandlerBase::DirTypeToName(EDirectiveType t) {
+ static const char* name[] = {"Allow", "Crawl-Delay", "Disallow", "Host", "Clean-Param", "Sitemap", "User-Agent", "Unknown"};
+ switch (t) {
+ case ALLOW:
+ return name[0];
+ case CRAWL_DELAY:
+ return name[1];
+ case DISALLOW:
+ return name[2];
+ case HOST:
+ return name[3];
+ case CLEAN_PARAM:
+ return name[4];
+ case SITEMAP:
+ return name[5];
+ case USER_AGENT:
+ return name[6];
+ case UNKNOWN:
+ return name[7];
+ }
+ return name[7];
+}
+
+bool TRobotsTxtRulesHandlerBase::CheckRobot(
+ const char* userAgent,
+ TBotIdSet& botIds,
+ const TVector<ui32>* botIdToMaxAppropriateUserAgentNameLength) const
+{
+ TCaseInsensitiveStringBuf agent(userAgent);
+
+ for (size_t botIndex = 0; botIndex < robotstxtcfg::max_botid; ++botIndex) {
+ if (!IsBotIdSupported(botIndex))
+ continue;
+
+ bool hasRequiredAgentNamePrefix = agent.StartsWith(robotstxtcfg::GetReqPrefix(botIndex));
+ bool isContainedInFullName = robotstxtcfg::GetFullName(botIndex).StartsWith(agent);
+ bool wasMoreImportantAgent = false;
+ if (botIdToMaxAppropriateUserAgentNameLength)
+ wasMoreImportantAgent = agent.size() < (*botIdToMaxAppropriateUserAgentNameLength)[botIndex];
+
+ if (hasRequiredAgentNamePrefix && isContainedInFullName && !wasMoreImportantAgent) {
+ botIds.insert(botIndex);
+ }
+ }
+
+ return !botIds.empty();
+}
+
+int TRobotsTxtRulesHandlerBase::CheckRule(const char* value, int line, TRobotsTxtRulesHandlerBase* rulesHandler) {
+ if (!rulesHandler->IsHandlingErrors())
+ return 0;
+
+ if (auto len = strlen(value); len > max_rule_length) {
+ rulesHandler->AddError(ERROR_RULE_HUGE, line);
+ }
+
+ bool upper = false, suspect = false;
+ for (const char* r = value; *r; ++r) {
+ if (!upper && isupper(*r))
+ upper = true;
+ if (!suspect && !isalnum(*r) && !strchr("/_?=.-*%&~[]:;@", *r) && (*(r + 1) || *r != '$'))
+ suspect = true;
+ }
+ if (suspect)
+ rulesHandler->AddError(WARNING_SUSPECT_SYMBOL, line);
+ if (upper)
+ rulesHandler->AddError(WARNING_UPPER_REGISTER, line);
+ return suspect || upper;
+}
+
+void TRobotsTxtRulesHandlerBase::AddError(EFormatErrorType type, int line) {
+ if (!HandleErrors)
+ return;
+ Errors.push_back(std::make_pair(type, line));
+}
+
+void TRobotsTxtRulesHandlerBase::ResetOptimized() noexcept {
+ for (ui32 i = 0; i < OptimizedBotIdToStoredBotId.size(); ++i) {
+ OptimizedBotIdToStoredBotId[i] = i; // by default, every bot maps to itself
+ }
+}
+
+void TRobotsTxtRulesHandlerBase::Clear() {
+ SiteMaps.clear();
+ CleanParams.clear();
+ HostDirective = "";
+ if (HandleErrors) {
+ AcceptedLines.clear();
+ CrossSectionAcceptedLines.clear();
+ Errors.clear();
+ }
+
+ for (size_t botId = 0; botId < BotIdToInfo.size(); ++botId) {
+ BotIdToInfo[botId].CrawlDelay = -1;
+ }
+
+ LoadedBotIds.clear();
+}
+
+void TRobotsTxtRulesHandlerBase::ClearInternal(const ui32 botId) {
+ CheckBotIdValidity(botId);
+ BotIdToInfo[botId].CrawlDelay = -1;
+
+ TVector<TBotIdAcceptedLine> newAcceptedLines;
+ for (size_t i = 0; i < AcceptedLines.size(); ++i)
+ if (AcceptedLines[i].first != botId)
+ newAcceptedLines.push_back(AcceptedLines[i]);
+
+ AcceptedLines.swap(newAcceptedLines);
+}
+
+int TRobotsTxtRulesHandlerBase::CheckHost(const char* host) {
+ THttpURL parsed;
+ TString copyHost = host;
+
+ if (GetHttpPrefixSize(copyHost) == 0) {
+ copyHost = TString("http://") + copyHost;
+ }
+
+ return parsed.Parse(copyHost.data(), THttpURL::FeaturesRobot) == THttpURL::ParsedOK && parsed.GetField(THttpURL::FieldHost) != TString("");
+}
+
+int TRobotsTxtRulesHandlerBase::CheckSitemapUrl(const char* url, const char* host, TString& modifiedUrl) {
+ if (host != nullptr && strlen(url) > 0 && url[0] == '/') {
+ modifiedUrl = TString(host) + url;
+ } else {
+ modifiedUrl = url;
+ }
+
+ url = modifiedUrl.data();
+
+ if (strlen(url) >= URL_MAX - 8)
+ return 0;
+ THttpURL parsed;
+ if (parsed.Parse(url, THttpURL::FeaturesRobot) || !parsed.IsValidAbs())
+ return 0;
+ if (parsed.GetScheme() != THttpURL::SchemeHTTP && parsed.GetScheme() != THttpURL::SchemeHTTPS)
+ return 0;
+ return CheckHost(parsed.PrintS(THttpURL::FlagHostPort).data());
+}
+
+// value is a space-separated pair: the clean-params (joined by '&') and a path prefix
+int TRobotsTxtRulesHandlerBase::CheckAndNormCleanParam(TString& value) {
+ if (value.find(' ') == TString::npos) {
+ value.push_back(' ');
+ }
+
+ const char* s = value.data();
+ if (!s || !*s || strlen(s) > URL_MAX / 2 - 9)
+ return 0;
+ const char* p = s;
+ while (*p && !isspace(*p))
+ ++p;
+ for (; s != p; ++s) {
+        // only the following non-alphanumeric symbols are allowed
+ if (!isalnum(*s) && !strchr("+-=_&%[]{}():.", *s))
+ return 0;
+        // clean-params for a prefix are separated by '&'; '&&' is invalid syntax
+ if (*s == '&' && *(s + 1) == '&')
+ return 0;
+ }
+ const char* pathPrefix = p + 1;
+ while (isspace(*p))
+ ++p;
+ char r[URL_MAX];
+ char* pr = r;
+ for (; *p; ++p) {
+ if (!isalnum(*p) && !strchr(".-/*_,;:%", *p))
+ return 0;
+ if (*p == '*')
+ *pr++ = '.';
+ if (*p == '.')
+ *pr++ = '\\';
+ *pr++ = *p;
+ }
+ *pr++ = '.';
+ *pr++ = '*';
+ *pr = 0;
+ TString params = value.substr(0, pathPrefix - value.data());
+ value = params + r;
+ return 1;
+}
+
+int TRobotsTxtRulesHandlerBase::ParseCrawlDelay(const char* value, int& crawlDelay) {
+ static const int MAX_CRAWL_DELAY = 1 << 10;
+ int val = 0;
+ const char* p = value;
+ for (; isdigit(*p); ++p) {
+ val = val * 10 + *p - '0';
+ if (val > MAX_CRAWL_DELAY)
+ return 0;
+ }
+ if (*p) {
+ if (*p++ != '.')
+ return 0;
+ if (strspn(p, "1234567890") != strlen(p))
+ return 0;
+ }
+ for (const char* s = p; s - p < 3; ++s)
+ val = val * 10 + (s < p + strlen(p) ? *s - '0' : 0);
+ crawlDelay = val;
+ return 1;
+}
+
+bool TRobotsTxtRulesHandlerBase::AddRuleWithErrorCheck(const ui32 botId, TStringBuf rule, char type, TRobotsTxtParser& parser) {
+ if (!IsBotIdSupported(botId))
+ return true;
+
+ if (!AddRule(botId, rule, type)) {
+ AddError(ERROR_ROBOTS_HUGE, parser.GetLineNumber());
+ AfterParse(botId);
+ return false;
+ }
+ return true;
+}
+
+int TRobotsTxtRulesHandlerBase::OnHost(const ui32 botId, TRobotsTxtParser& parser, const char* value, TRobotsTxtRulesHandlerBase*& rulesHandler) {
+    // Temporary hack to repack robots.txt correctly from the new format to the old one.
+    // Remove it once robot-stable-2010-10-17 is deployed in production.
+ if (!IsBotIdSupported(botId))
+ return 0;
+ // end of hack
+
+ if (rulesHandler->HostDirective != "")
+ rulesHandler->AddError(ERROR_HOST_MULTI, parser.GetLineNumber());
+ else {
+ if (!CheckHost(value))
+ rulesHandler->AddError(ERROR_HOST_FORMAT, parser.GetLineNumber());
+ else {
+ rulesHandler->SetHostDirective(value);
+ if (!rulesHandler->AddRuleWithErrorCheck(botId, value, 'H', parser))
+ return 2;
+ }
+ }
+ return 0;
+}
+
+bool TRobotsTxtRulesHandlerBase::IsBotIdLoaded(const ui32 botId) const {
+ return LoadedBotIds.contains(botId);
+}
+
+bool TRobotsTxtRulesHandlerBase::IsBotIdSupported(const ui32 botId) const {
+ return (SaveDataForAnyBot && botId == robotstxtcfg::id_anybot) || SupportedBotIds.contains(botId);
+}
+
+ui32 TRobotsTxtRulesHandlerBase::GetNotOptimizedBotId(const ui32 botId) const {
+ return (botId < OptimizedBotIdToStoredBotId.size())
+ ? OptimizedBotIdToStoredBotId[botId]
+ : botId;
+}
+
+TMaybe<ui32> TRobotsTxtRulesHandlerBase::GetMappedBotId(ui32 botId, bool useAny) const {
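+ // Resolution order: translate an optimized id back to its stored id, validate it,
+ // and return it only if rules for that bot were actually loaded; otherwise fall
+ // back to robotstxtcfg::id_anybot when useAny is set, or return an empty TMaybe.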
+ botId = GetNotOptimizedBotId(botId);
+ CheckBotIdValidity(botId);
+ if (IsBotIdLoaded(botId))
+ return botId;
+ if (useAny)
+ return robotstxtcfg::id_anybot;
+ return {};
+}
diff --git a/library/cpp/robots_txt/ya.make b/library/cpp/robots_txt/ya.make
new file mode 100644
index 0000000000..c12b57ea04
--- /dev/null
+++ b/library/cpp/robots_txt/ya.make
@@ -0,0 +1,18 @@
+LIBRARY()
+
+SRCS(
+ prefix_tree.cpp
+ prefix_tree_rules_handler.cpp
+ robots_txt_parser.cpp
+ rules_handler.cpp
+)
+
+PEERDIR(
+ library/cpp/robots_txt/robotstxtcfg
+ library/cpp/case_insensitive_string
+ library/cpp/charset
+ library/cpp/string_utils/url
+ library/cpp/uri
+)
+
+END()
diff --git a/library/cpp/yconf/CMakeLists.darwin-x86_64.txt b/library/cpp/yconf/CMakeLists.darwin-x86_64.txt
new file mode 100644
index 0000000000..4e5bbf836d
--- /dev/null
+++ b/library/cpp/yconf/CMakeLists.darwin-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-yconf)
+target_link_libraries(library-cpp-yconf PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ library-cpp-logger
+)
+target_sources(library-cpp-yconf PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp
+)
diff --git a/library/cpp/yconf/CMakeLists.linux-aarch64.txt b/library/cpp/yconf/CMakeLists.linux-aarch64.txt
new file mode 100644
index 0000000000..8ddf881133
--- /dev/null
+++ b/library/cpp/yconf/CMakeLists.linux-aarch64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-yconf)
+target_link_libraries(library-cpp-yconf PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ library-cpp-logger
+)
+target_sources(library-cpp-yconf PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp
+)
diff --git a/library/cpp/yconf/CMakeLists.linux-x86_64.txt b/library/cpp/yconf/CMakeLists.linux-x86_64.txt
new file mode 100644
index 0000000000..8ddf881133
--- /dev/null
+++ b/library/cpp/yconf/CMakeLists.linux-x86_64.txt
@@ -0,0 +1,20 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-yconf)
+target_link_libraries(library-cpp-yconf PUBLIC
+ contrib-libs-linux-headers
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ library-cpp-logger
+)
+target_sources(library-cpp-yconf PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp
+)
diff --git a/library/cpp/yconf/CMakeLists.txt b/library/cpp/yconf/CMakeLists.txt
new file mode 100644
index 0000000000..f8b31df0c1
--- /dev/null
+++ b/library/cpp/yconf/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-aarch64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ include(CMakeLists.darwin-x86_64.txt)
+elseif (WIN32 AND CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" AND NOT HAVE_CUDA)
+ include(CMakeLists.windows-x86_64.txt)
+elseif (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT HAVE_CUDA)
+ include(CMakeLists.linux-x86_64.txt)
+endif()
diff --git a/library/cpp/yconf/CMakeLists.windows-x86_64.txt b/library/cpp/yconf/CMakeLists.windows-x86_64.txt
new file mode 100644
index 0000000000..4e5bbf836d
--- /dev/null
+++ b/library/cpp/yconf/CMakeLists.windows-x86_64.txt
@@ -0,0 +1,19 @@
+
+# This file was generated by the build system used internally in the Yandex monorepo.
+# Only simple modifications are allowed (adding source-files to targets, adding simple properties
+# like target_include_directories). These modifications will be ported to original
+# ya.make files by maintainers. Any complex modifications which can't be ported back to the
+# original buildsystem will not be accepted.
+
+
+
+add_library(library-cpp-yconf)
+target_link_libraries(library-cpp-yconf PUBLIC
+ contrib-libs-cxxsupp
+ yutil
+ library-cpp-charset
+ library-cpp-logger
+)
+target_sources(library-cpp-yconf PRIVATE
+ ${CMAKE_SOURCE_DIR}/library/cpp/yconf/conf.cpp
+)