diff options
author | heretic <heretic@yandex-team.com> | 2022-09-01 11:18:57 +0300 |
---|---|---|
committer | heretic <heretic@yandex-team.com> | 2022-09-01 11:18:57 +0300 |
commit | 8393683e8cb62468ccace14fa3379e3a4fbdde73 (patch) | |
tree | 4f2d32a77665019c9491d34dbe1cc5e605bb220c /contrib | |
parent | 836e587fc927c87149f8f0b2676d2587e6a79111 (diff) | |
download | ydb-8393683e8cb62468ccace14fa3379e3a4fbdde73.tar.gz |
add apache arrow python
Diffstat (limited to 'contrib')
39 files changed, 11481 insertions, 3 deletions
diff --git a/contrib/libs/apache/arrow/CMakeLists.txt b/contrib/libs/apache/arrow/CMakeLists.txt index eb1eebea7e..bae344e8b2 100644 --- a/contrib/libs/apache/arrow/CMakeLists.txt +++ b/contrib/libs/apache/arrow/CMakeLists.txt @@ -38,6 +38,7 @@ target_include_directories(libs-apache-arrow PRIVATE ${CMAKE_SOURCE_DIR}/contrib/libs/apache/orc/c++/include ${CMAKE_SOURCE_DIR}/contrib/libs/flatbuffers/include ${CMAKE_SOURCE_DIR}/contrib/libs/lz4 + ${CMAKE_SOURCE_DIR}/contrib/libs/rapidjson/include ${CMAKE_SOURCE_DIR}/contrib/libs/re2 ${CMAKE_SOURCE_DIR}/contrib/libs/utf8proc ${CMAKE_SOURCE_DIR}/contrib/libs/zstd/include @@ -50,6 +51,7 @@ target_link_libraries(libs-apache-arrow PUBLIC libs-brotli-enc contrib-libs-double-conversion contrib-libs-lz4 + contrib-libs-rapidjson contrib-libs-re2 contrib-libs-snappy contrib-libs-utf8proc @@ -146,6 +148,11 @@ target_sources(libs-apache-arrow PRIVATE ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/device.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/extension_type.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/filesystem.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/localfs.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/mockfs.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/path_util.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/util_internal.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc @@ -157,11 +164,20 @@ target_sources(libs-apache-arrow PRIVATE ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc 
${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/chunked_builder.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/chunker.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/converter.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/object_parser.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/object_writer.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/options.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/parser.cc + ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/json/reader.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc ${CMAKE_SOURCE_DIR}/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/filesystem.cc b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/filesystem.cc new file mode 100644 index 0000000000..4f44e24ba6 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/filesystem.cc @@ -0,0 +1,761 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <sstream> +#include <utility> + +#include "arrow/util/config.h" + +#include "arrow/filesystem/filesystem.h" +#ifdef ARROW_HDFS +#error #include "arrow/filesystem/hdfs.h" +#endif +#ifdef ARROW_S3 +#error #include "arrow/filesystem/s3fs.h" +#endif +#include "arrow/filesystem/localfs.h" +#include "arrow/filesystem/mockfs.h" +#include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/util_internal.h" +#include "arrow/io/slow.h" +#include "arrow/io/util_internal.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/util/parallel.h" +#include "arrow/util/uri.h" +#include "arrow/util/vector.h" +#include "arrow/util/windows_fixup.h" + +namespace arrow { + +using internal::checked_pointer_cast; +using internal::TaskHints; +using internal::Uri; +using io::internal::SubmitIO; + +namespace fs { + +using internal::ConcatAbstractPath; +using internal::EnsureTrailingSlash; +using internal::GetAbstractPathParent; +using internal::kSep; +using internal::RemoveLeadingSlash; +using internal::RemoveTrailingSlash; +using internal::ToSlashes; + +std::string ToString(FileType ftype) { + switch (ftype) { + case FileType::NotFound: + return "not-found"; + case FileType::Unknown: + return "unknown"; + case FileType::File: + 
return "file"; + case FileType::Directory: + return "directory"; + default: + ARROW_LOG(FATAL) << "Invalid FileType value: " << static_cast<int>(ftype); + return "???"; + } +} + +// For googletest +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType ftype) { +#define FILE_TYPE_CASE(value_name) \ + case FileType::value_name: \ + os << "FileType::" ARROW_STRINGIFY(value_name); \ + break; + + switch (ftype) { + FILE_TYPE_CASE(NotFound) + FILE_TYPE_CASE(Unknown) + FILE_TYPE_CASE(File) + FILE_TYPE_CASE(Directory) + default: + ARROW_LOG(FATAL) << "Invalid FileType value: " << static_cast<int>(ftype); + } + +#undef FILE_TYPE_CASE + return os; +} + +std::string FileInfo::base_name() const { + return internal::GetAbstractPathParent(path_).second; +} + +std::string FileInfo::dir_name() const { + return internal::GetAbstractPathParent(path_).first; +} + +// Debug helper +std::string FileInfo::ToString() const { + std::stringstream os; + os << *this; + return os.str(); +} + +std::ostream& operator<<(std::ostream& os, const FileInfo& info) { + return os << "FileInfo(" << info.type() << ", " << info.path() << ")"; +} + +std::string FileInfo::extension() const { + return internal::GetAbstractPathExtension(path_); +} + +////////////////////////////////////////////////////////////////////////// +// FileSystem default method implementations + +FileSystem::~FileSystem() {} + +Result<std::string> FileSystem::NormalizePath(std::string path) { return path; } + +Result<std::vector<FileInfo>> FileSystem::GetFileInfo( + const std::vector<std::string>& paths) { + std::vector<FileInfo> res; + res.reserve(paths.size()); + for (const auto& path : paths) { + ARROW_ASSIGN_OR_RAISE(FileInfo info, GetFileInfo(path)); + res.push_back(std::move(info)); + } + return res; +} + +namespace { + +template <typename DeferredFunc> +auto FileSystemDefer(FileSystem* fs, bool synchronous, DeferredFunc&& func) + -> decltype(DeferNotOk( + fs->io_context().executor()->Submit(func, 
std::shared_ptr<FileSystem>{}))) { + auto self = fs->shared_from_this(); + if (synchronous) { + return std::forward<DeferredFunc>(func)(std::move(self)); + } + return DeferNotOk(io::internal::SubmitIO( + fs->io_context(), std::forward<DeferredFunc>(func), std::move(self))); +} + +} // namespace + +Future<std::vector<FileInfo>> FileSystem::GetFileInfoAsync( + const std::vector<std::string>& paths) { + return FileSystemDefer( + this, default_async_is_sync_, + [paths](std::shared_ptr<FileSystem> self) { return self->GetFileInfo(paths); }); +} + +FileInfoGenerator FileSystem::GetFileInfoGenerator(const FileSelector& select) { + auto fut = FileSystemDefer( + this, default_async_is_sync_, + [select](std::shared_ptr<FileSystem> self) { return self->GetFileInfo(select); }); + return MakeSingleFutureGenerator(std::move(fut)); +} + +Status FileSystem::DeleteFiles(const std::vector<std::string>& paths) { + Status st = Status::OK(); + for (const auto& path : paths) { + st &= DeleteFile(path); + } + return st; +} + +namespace { + +Status ValidateInputFileInfo(const FileInfo& info) { + if (info.type() == FileType::NotFound) { + return internal::PathNotFound(info.path()); + } + if (info.type() != FileType::File && info.type() != FileType::Unknown) { + return internal::NotAFile(info.path()); + } + return Status::OK(); +} + +} // namespace + +Result<std::shared_ptr<io::InputStream>> FileSystem::OpenInputStream( + const FileInfo& info) { + RETURN_NOT_OK(ValidateInputFileInfo(info)); + return OpenInputStream(info.path()); +} + +Result<std::shared_ptr<io::RandomAccessFile>> FileSystem::OpenInputFile( + const FileInfo& info) { + RETURN_NOT_OK(ValidateInputFileInfo(info)); + return OpenInputFile(info.path()); +} + +Future<std::shared_ptr<io::InputStream>> FileSystem::OpenInputStreamAsync( + const std::string& path) { + return FileSystemDefer( + this, default_async_is_sync_, + [path](std::shared_ptr<FileSystem> self) { return self->OpenInputStream(path); }); +} + 
+Future<std::shared_ptr<io::InputStream>> FileSystem::OpenInputStreamAsync( + const FileInfo& info) { + RETURN_NOT_OK(ValidateInputFileInfo(info)); + return FileSystemDefer( + this, default_async_is_sync_, + [info](std::shared_ptr<FileSystem> self) { return self->OpenInputStream(info); }); +} + +Future<std::shared_ptr<io::RandomAccessFile>> FileSystem::OpenInputFileAsync( + const std::string& path) { + return FileSystemDefer( + this, default_async_is_sync_, + [path](std::shared_ptr<FileSystem> self) { return self->OpenInputFile(path); }); +} + +Future<std::shared_ptr<io::RandomAccessFile>> FileSystem::OpenInputFileAsync( + const FileInfo& info) { + RETURN_NOT_OK(ValidateInputFileInfo(info)); + return FileSystemDefer( + this, default_async_is_sync_, + [info](std::shared_ptr<FileSystem> self) { return self->OpenInputFile(info); }); +} + +Result<std::shared_ptr<io::OutputStream>> FileSystem::OpenOutputStream( + const std::string& path) { + return OpenOutputStream(path, std::shared_ptr<const KeyValueMetadata>{}); +} + +Result<std::shared_ptr<io::OutputStream>> FileSystem::OpenAppendStream( + const std::string& path) { + return OpenAppendStream(path, std::shared_ptr<const KeyValueMetadata>{}); +} + +////////////////////////////////////////////////////////////////////////// +// SubTreeFileSystem implementation + +SubTreeFileSystem::SubTreeFileSystem(const std::string& base_path, + std::shared_ptr<FileSystem> base_fs) + : FileSystem(base_fs->io_context()), + base_path_(NormalizeBasePath(base_path, base_fs).ValueOrDie()), + base_fs_(base_fs) {} + +SubTreeFileSystem::~SubTreeFileSystem() {} + +Result<std::string> SubTreeFileSystem::NormalizeBasePath( + std::string base_path, const std::shared_ptr<FileSystem>& base_fs) { + ARROW_ASSIGN_OR_RAISE(base_path, base_fs->NormalizePath(std::move(base_path))); + return EnsureTrailingSlash(std::move(base_path)); +} + +bool SubTreeFileSystem::Equals(const FileSystem& other) const { + if (this == &other) { + return true; + } + if 
(other.type_name() != type_name()) { + return false; + } + const auto& subfs = ::arrow::internal::checked_cast<const SubTreeFileSystem&>(other); + return base_path_ == subfs.base_path_ && base_fs_->Equals(subfs.base_fs_); +} + +std::string SubTreeFileSystem::PrependBase(const std::string& s) const { + if (s.empty()) { + return base_path_; + } else { + return ConcatAbstractPath(base_path_, s); + } +} + +Status SubTreeFileSystem::PrependBaseNonEmpty(std::string* s) const { + if (s->empty()) { + return Status::IOError("Empty path"); + } else { + *s = ConcatAbstractPath(base_path_, *s); + return Status::OK(); + } +} + +Result<std::string> SubTreeFileSystem::StripBase(const std::string& s) const { + auto len = base_path_.length(); + // Note base_path_ ends with a slash (if not empty) + if (s.length() >= len && s.substr(0, len) == base_path_) { + return s.substr(len); + } else { + return Status::UnknownError("Underlying filesystem returned path '", s, + "', which is not a subpath of '", base_path_, "'"); + } +} + +Status SubTreeFileSystem::FixInfo(FileInfo* info) const { + ARROW_ASSIGN_OR_RAISE(auto fixed_path, StripBase(info->path())); + info->set_path(std::move(fixed_path)); + return Status::OK(); +} + +Result<std::string> SubTreeFileSystem::NormalizePath(std::string path) { + ARROW_ASSIGN_OR_RAISE(auto normalized, base_fs_->NormalizePath(PrependBase(path))); + return StripBase(std::move(normalized)); +} + +Result<FileInfo> SubTreeFileSystem::GetFileInfo(const std::string& path) { + ARROW_ASSIGN_OR_RAISE(FileInfo info, base_fs_->GetFileInfo(PrependBase(path))); + RETURN_NOT_OK(FixInfo(&info)); + return info; +} + +Result<std::vector<FileInfo>> SubTreeFileSystem::GetFileInfo(const FileSelector& select) { + auto selector = select; + selector.base_dir = PrependBase(selector.base_dir); + ARROW_ASSIGN_OR_RAISE(auto infos, base_fs_->GetFileInfo(selector)); + for (auto& info : infos) { + RETURN_NOT_OK(FixInfo(&info)); + } + return infos; +} + +FileInfoGenerator 
SubTreeFileSystem::GetFileInfoGenerator(const FileSelector& select) { + auto selector = select; + selector.base_dir = PrependBase(selector.base_dir); + auto gen = base_fs_->GetFileInfoGenerator(selector); + + auto self = checked_pointer_cast<SubTreeFileSystem>(shared_from_this()); + + std::function<Result<std::vector<FileInfo>>(const std::vector<FileInfo>& infos)> + fix_infos = [self](std::vector<FileInfo> infos) -> Result<std::vector<FileInfo>> { + for (auto& info : infos) { + RETURN_NOT_OK(self->FixInfo(&info)); + } + return infos; + }; + return MakeMappedGenerator(gen, fix_infos); +} + +Status SubTreeFileSystem::CreateDir(const std::string& path, bool recursive) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->CreateDir(s, recursive); +} + +Status SubTreeFileSystem::DeleteDir(const std::string& path) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->DeleteDir(s); +} + +Status SubTreeFileSystem::DeleteDirContents(const std::string& path) { + if (internal::IsEmptyPath(path)) { + return internal::InvalidDeleteDirContents(path); + } + auto s = PrependBase(path); + return base_fs_->DeleteDirContents(s); +} + +Status SubTreeFileSystem::DeleteRootDirContents() { + if (base_path_.empty()) { + return base_fs_->DeleteRootDirContents(); + } else { + return base_fs_->DeleteDirContents(base_path_); + } +} + +Status SubTreeFileSystem::DeleteFile(const std::string& path) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->DeleteFile(s); +} + +Status SubTreeFileSystem::Move(const std::string& src, const std::string& dest) { + auto s = src; + auto d = dest; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + RETURN_NOT_OK(PrependBaseNonEmpty(&d)); + return base_fs_->Move(s, d); +} + +Status SubTreeFileSystem::CopyFile(const std::string& src, const std::string& dest) { + auto s = src; + auto d = dest; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + RETURN_NOT_OK(PrependBaseNonEmpty(&d)); + return 
base_fs_->CopyFile(s, d); +} + +Result<std::shared_ptr<io::InputStream>> SubTreeFileSystem::OpenInputStream( + const std::string& path) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->OpenInputStream(s); +} + +Result<std::shared_ptr<io::InputStream>> SubTreeFileSystem::OpenInputStream( + const FileInfo& info) { + auto s = info.path(); + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + FileInfo new_info(info); + new_info.set_path(std::move(s)); + return base_fs_->OpenInputStream(new_info); +} + +Future<std::shared_ptr<io::InputStream>> SubTreeFileSystem::OpenInputStreamAsync( + const std::string& path) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->OpenInputStreamAsync(s); +} + +Future<std::shared_ptr<io::InputStream>> SubTreeFileSystem::OpenInputStreamAsync( + const FileInfo& info) { + auto s = info.path(); + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + FileInfo new_info(info); + new_info.set_path(std::move(s)); + return base_fs_->OpenInputStreamAsync(new_info); +} + +Result<std::shared_ptr<io::RandomAccessFile>> SubTreeFileSystem::OpenInputFile( + const std::string& path) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->OpenInputFile(s); +} + +Result<std::shared_ptr<io::RandomAccessFile>> SubTreeFileSystem::OpenInputFile( + const FileInfo& info) { + auto s = info.path(); + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + FileInfo new_info(info); + new_info.set_path(std::move(s)); + return base_fs_->OpenInputFile(new_info); +} + +Future<std::shared_ptr<io::RandomAccessFile>> SubTreeFileSystem::OpenInputFileAsync( + const std::string& path) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->OpenInputFileAsync(s); +} + +Future<std::shared_ptr<io::RandomAccessFile>> SubTreeFileSystem::OpenInputFileAsync( + const FileInfo& info) { + auto s = info.path(); + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + FileInfo new_info(info); + new_info.set_path(std::move(s)); + 
return base_fs_->OpenInputFileAsync(new_info); +} + +Result<std::shared_ptr<io::OutputStream>> SubTreeFileSystem::OpenOutputStream( + const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->OpenOutputStream(s, metadata); +} + +Result<std::shared_ptr<io::OutputStream>> SubTreeFileSystem::OpenAppendStream( + const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) { + auto s = path; + RETURN_NOT_OK(PrependBaseNonEmpty(&s)); + return base_fs_->OpenAppendStream(s, metadata); +} + +////////////////////////////////////////////////////////////////////////// +// SlowFileSystem implementation + +SlowFileSystem::SlowFileSystem(std::shared_ptr<FileSystem> base_fs, + std::shared_ptr<io::LatencyGenerator> latencies) + : FileSystem(base_fs->io_context()), base_fs_(base_fs), latencies_(latencies) {} + +SlowFileSystem::SlowFileSystem(std::shared_ptr<FileSystem> base_fs, + double average_latency) + : FileSystem(base_fs->io_context()), + base_fs_(base_fs), + latencies_(io::LatencyGenerator::Make(average_latency)) {} + +SlowFileSystem::SlowFileSystem(std::shared_ptr<FileSystem> base_fs, + double average_latency, int32_t seed) + : FileSystem(base_fs->io_context()), + base_fs_(base_fs), + latencies_(io::LatencyGenerator::Make(average_latency, seed)) {} + +bool SlowFileSystem::Equals(const FileSystem& other) const { return this == &other; } + +Result<FileInfo> SlowFileSystem::GetFileInfo(const std::string& path) { + latencies_->Sleep(); + return base_fs_->GetFileInfo(path); +} + +Result<std::vector<FileInfo>> SlowFileSystem::GetFileInfo(const FileSelector& selector) { + latencies_->Sleep(); + return base_fs_->GetFileInfo(selector); +} + +Status SlowFileSystem::CreateDir(const std::string& path, bool recursive) { + latencies_->Sleep(); + return base_fs_->CreateDir(path, recursive); +} + +Status SlowFileSystem::DeleteDir(const std::string& path) { + 
latencies_->Sleep(); + return base_fs_->DeleteDir(path); +} + +Status SlowFileSystem::DeleteDirContents(const std::string& path) { + latencies_->Sleep(); + return base_fs_->DeleteDirContents(path); +} + +Status SlowFileSystem::DeleteRootDirContents() { + latencies_->Sleep(); + return base_fs_->DeleteRootDirContents(); +} + +Status SlowFileSystem::DeleteFile(const std::string& path) { + latencies_->Sleep(); + return base_fs_->DeleteFile(path); +} + +Status SlowFileSystem::Move(const std::string& src, const std::string& dest) { + latencies_->Sleep(); + return base_fs_->Move(src, dest); +} + +Status SlowFileSystem::CopyFile(const std::string& src, const std::string& dest) { + latencies_->Sleep(); + return base_fs_->CopyFile(src, dest); +} + +Result<std::shared_ptr<io::InputStream>> SlowFileSystem::OpenInputStream( + const std::string& path) { + latencies_->Sleep(); + ARROW_ASSIGN_OR_RAISE(auto stream, base_fs_->OpenInputStream(path)); + return std::make_shared<io::SlowInputStream>(stream, latencies_); +} + +Result<std::shared_ptr<io::InputStream>> SlowFileSystem::OpenInputStream( + const FileInfo& info) { + latencies_->Sleep(); + ARROW_ASSIGN_OR_RAISE(auto stream, base_fs_->OpenInputStream(info)); + return std::make_shared<io::SlowInputStream>(stream, latencies_); +} + +Result<std::shared_ptr<io::RandomAccessFile>> SlowFileSystem::OpenInputFile( + const std::string& path) { + latencies_->Sleep(); + ARROW_ASSIGN_OR_RAISE(auto file, base_fs_->OpenInputFile(path)); + return std::make_shared<io::SlowRandomAccessFile>(file, latencies_); +} + +Result<std::shared_ptr<io::RandomAccessFile>> SlowFileSystem::OpenInputFile( + const FileInfo& info) { + latencies_->Sleep(); + ARROW_ASSIGN_OR_RAISE(auto file, base_fs_->OpenInputFile(info)); + return std::make_shared<io::SlowRandomAccessFile>(file, latencies_); +} + +Result<std::shared_ptr<io::OutputStream>> SlowFileSystem::OpenOutputStream( + const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) { + 
latencies_->Sleep(); + // XXX Should we have a SlowOutputStream that waits on Flush() and Close()? + return base_fs_->OpenOutputStream(path, metadata); +} + +Result<std::shared_ptr<io::OutputStream>> SlowFileSystem::OpenAppendStream( + const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) { + latencies_->Sleep(); + return base_fs_->OpenAppendStream(path, metadata); +} + +Status CopyFiles(const std::vector<FileLocator>& sources, + const std::vector<FileLocator>& destinations, + const io::IOContext& io_context, int64_t chunk_size, bool use_threads) { + if (sources.size() != destinations.size()) { + return Status::Invalid("Trying to copy ", sources.size(), " files into ", + destinations.size(), " paths."); + } + + auto copy_one_file = [&](int i) { + if (sources[i].filesystem->Equals(destinations[i].filesystem)) { + return sources[i].filesystem->CopyFile(sources[i].path, destinations[i].path); + } + + ARROW_ASSIGN_OR_RAISE(auto source, + sources[i].filesystem->OpenInputStream(sources[i].path)); + ARROW_ASSIGN_OR_RAISE(const auto metadata, source->ReadMetadata()); + + ARROW_ASSIGN_OR_RAISE(auto destination, destinations[i].filesystem->OpenOutputStream( + destinations[i].path, metadata)); + RETURN_NOT_OK(internal::CopyStream(source, destination, chunk_size, io_context)); + return destination->Close(); + }; + + return ::arrow::internal::OptionalParallelFor( + use_threads, static_cast<int>(sources.size()), std::move(copy_one_file), + io_context.executor()); +} + +Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs, + const FileSelector& source_sel, + const std::shared_ptr<FileSystem>& destination_fs, + const std::string& destination_base_dir, const io::IOContext& io_context, + int64_t chunk_size, bool use_threads) { + ARROW_ASSIGN_OR_RAISE(auto source_infos, source_fs->GetFileInfo(source_sel)); + if (source_infos.empty()) { + return Status::OK(); + } + + std::vector<FileLocator> sources, destinations; + std::vector<std::string> dirs; + 
+ for (const FileInfo& source_info : source_infos) { + auto relative = internal::RemoveAncestor(source_sel.base_dir, source_info.path()); + if (!relative.has_value()) { + return Status::Invalid("GetFileInfo() yielded path '", source_info.path(), + "', which is outside base dir '", source_sel.base_dir, "'"); + } + + auto destination_path = + internal::ConcatAbstractPath(destination_base_dir, relative->to_string()); + + if (source_info.IsDirectory()) { + dirs.push_back(destination_path); + } else if (source_info.IsFile()) { + sources.push_back({source_fs, source_info.path()}); + destinations.push_back({destination_fs, destination_path}); + } + } + + auto create_one_dir = [&](int i) { return destination_fs->CreateDir(dirs[i]); }; + + dirs = internal::MinimalCreateDirSet(std::move(dirs)); + RETURN_NOT_OK(::arrow::internal::OptionalParallelFor( + use_threads, static_cast<int>(dirs.size()), std::move(create_one_dir), + io_context.executor())); + + return CopyFiles(sources, destinations, io_context, chunk_size, use_threads); +} + +namespace { + +Result<Uri> ParseFileSystemUri(const std::string& uri_string) { + Uri uri; + auto status = uri.Parse(uri_string); + if (!status.ok()) { +#ifdef _WIN32 + // Could be a "file:..." URI with backslashes instead of regular slashes. 
+ RETURN_NOT_OK(uri.Parse(ToSlashes(uri_string))); + if (uri.scheme() != "file") { + return status; + } +#else + return status; +#endif + } + return std::move(uri); +} + +Result<std::shared_ptr<FileSystem>> FileSystemFromUriReal(const Uri& uri, + const std::string& uri_string, + const io::IOContext& io_context, + std::string* out_path) { + const auto scheme = uri.scheme(); + + if (scheme == "file") { + std::string path; + ARROW_ASSIGN_OR_RAISE(auto options, LocalFileSystemOptions::FromUri(uri, &path)); + if (out_path != nullptr) { + *out_path = path; + } + return std::make_shared<LocalFileSystem>(options, io_context); + } + if (scheme == "hdfs" || scheme == "viewfs") { +#ifdef ARROW_HDFS + ARROW_ASSIGN_OR_RAISE(auto options, HdfsOptions::FromUri(uri)); + if (out_path != nullptr) { + *out_path = uri.path(); + } + ARROW_ASSIGN_OR_RAISE(auto hdfs, HadoopFileSystem::Make(options, io_context)); + return hdfs; +#else + return Status::NotImplemented("Got HDFS URI but Arrow compiled without HDFS support"); +#endif + } + if (scheme == "s3") { +#ifdef ARROW_S3 + RETURN_NOT_OK(EnsureS3Initialized()); + ARROW_ASSIGN_OR_RAISE(auto options, S3Options::FromUri(uri, out_path)); + ARROW_ASSIGN_OR_RAISE(auto s3fs, S3FileSystem::Make(options, io_context)); + return s3fs; +#else + return Status::NotImplemented("Got S3 URI but Arrow compiled without S3 support"); +#endif + } + + if (scheme == "mock") { + // MockFileSystem does not have an absolute / relative path distinction, + // normalize path by removing leading slash. 
+ if (out_path != nullptr) { + *out_path = std::string(RemoveLeadingSlash(uri.path())); + } + return std::make_shared<internal::MockFileSystem>(internal::CurrentTimePoint(), + io_context); + } + + return Status::Invalid("Unrecognized filesystem type in URI: ", uri_string); +} + +} // namespace + +Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri_string, + std::string* out_path) { + return FileSystemFromUri(uri_string, io::default_io_context(), out_path); +} + +Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri_string, + const io::IOContext& io_context, + std::string* out_path) { + ARROW_ASSIGN_OR_RAISE(auto fsuri, ParseFileSystemUri(uri_string)); + return FileSystemFromUriReal(fsuri, uri_string, io_context, out_path); +} + +Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(const std::string& uri_string, + std::string* out_path) { + return FileSystemFromUriOrPath(uri_string, io::default_io_context(), out_path); +} + +Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath( + const std::string& uri_string, const io::IOContext& io_context, + std::string* out_path) { + if (internal::DetectAbsolutePath(uri_string)) { + // Normalize path separators + if (out_path != nullptr) { + *out_path = ToSlashes(uri_string); + } + return std::make_shared<LocalFileSystem>(); + } + return FileSystemFromUri(uri_string, io_context, out_path); +} + +Status FileSystemFromUri(const std::string& uri, std::shared_ptr<FileSystem>* out_fs, + std::string* out_path) { + return FileSystemFromUri(uri, out_path).Value(out_fs); +} + +Status Initialize(const FileSystemGlobalOptions& options) { + internal::global_options = options; + return Status::OK(); +} + +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/filesystem.h b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/filesystem.h new file mode 100644 index 0000000000..c739471c72 --- /dev/null +++ 
b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/filesystem.h @@ -0,0 +1,532 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <chrono> +#include <cstdint> +#include <functional> +#include <iosfwd> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "arrow/filesystem/type_fwd.h" +#include "arrow/io/interfaces.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compare.h" +#include "arrow/util/macros.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" +#include "arrow/util/windows_fixup.h" + +namespace arrow { +namespace fs { + +// A system clock time point expressed as a 64-bit (or more) number of +// nanoseconds since the epoch. 
+using TimePoint = + std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>; + +ARROW_EXPORT std::string ToString(FileType); + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType); + +static const int64_t kNoSize = -1; +static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1)); + +/// \brief FileSystem entry info +struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> { + FileInfo() = default; + FileInfo(FileInfo&&) = default; + FileInfo& operator=(FileInfo&&) = default; + FileInfo(const FileInfo&) = default; + FileInfo& operator=(const FileInfo&) = default; + + explicit FileInfo(std::string path, FileType type = FileType::Unknown) + : path_(std::move(path)), type_(type) {} + + /// The file type + FileType type() const { return type_; } + void set_type(FileType type) { type_ = type; } + + /// The full file path in the filesystem + const std::string& path() const { return path_; } + void set_path(std::string path) { path_ = std::move(path); } + + /// The file base name (component after the last directory separator) + std::string base_name() const; + + // The directory base name (component before the file base name). + std::string dir_name() const; + + /// The size in bytes, if available + /// + /// Only regular files are guaranteed to have a size. 
+ int64_t size() const { return size_; } + void set_size(int64_t size) { size_ = size; } + + /// The file extension (excluding the dot) + std::string extension() const; + + /// The time of last modification, if available + TimePoint mtime() const { return mtime_; } + void set_mtime(TimePoint mtime) { mtime_ = mtime; } + + bool IsFile() const { return type_ == FileType::File; } + bool IsDirectory() const { return type_ == FileType::Directory; } + + bool Equals(const FileInfo& other) const { + return type() == other.type() && path() == other.path() && size() == other.size() && + mtime() == other.mtime(); + } + + std::string ToString() const; + + /// Function object implementing less-than comparison and hashing by + /// path, to support sorting infos, using them as keys, and other + /// interactions with the STL. + struct ByPath { + bool operator()(const FileInfo& l, const FileInfo& r) const { + return l.path() < r.path(); + } + + size_t operator()(const FileInfo& i) const { + return std::hash<std::string>{}(i.path()); + } + }; + + protected: + std::string path_; + FileType type_ = FileType::Unknown; + int64_t size_ = kNoSize; + TimePoint mtime_ = kNoTime; +}; + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&); + +/// \brief File selector for filesystem APIs +struct ARROW_EXPORT FileSelector { + /// The directory in which to select files. + /// If the path exists but doesn't point to a directory, this should be an error. + std::string base_dir; + /// The behavior if `base_dir` isn't found in the filesystem. If false, + /// an error is returned. If true, an empty selection is returned. + bool allow_not_found; + /// Whether to recurse into subdirectories. + bool recursive; + /// The maximum number of subdirectories to recurse into. 
+ int32_t max_recursion; + + FileSelector() : allow_not_found(false), recursive(false), max_recursion(INT32_MAX) {} +}; + +/// \brief FileSystem, path pair +struct ARROW_EXPORT FileLocator { + std::shared_ptr<FileSystem> filesystem; + std::string path; +}; + +using FileInfoVector = std::vector<FileInfo>; +using FileInfoGenerator = std::function<Future<FileInfoVector>()>; + +} // namespace fs + +template <> +struct IterationTraits<fs::FileInfoVector> { + static fs::FileInfoVector End() { return {}; } + static bool IsEnd(const fs::FileInfoVector& val) { return val.empty(); } +}; + +namespace fs { + +/// \brief Abstract file system API +class ARROW_EXPORT FileSystem : public std::enable_shared_from_this<FileSystem> { + public: + virtual ~FileSystem(); + + virtual std::string type_name() const = 0; + + /// EXPERIMENTAL: The IOContext associated with this filesystem. + const io::IOContext& io_context() const { return io_context_; } + + /// Normalize path for the given filesystem + /// + /// The default implementation of this method is a no-op, but subclasses + /// may allow normalizing irregular path forms (such as Windows local paths). + virtual Result<std::string> NormalizePath(std::string path); + + virtual bool Equals(const FileSystem& other) const = 0; + + virtual bool Equals(const std::shared_ptr<FileSystem>& other) const { + return Equals(*other); + } + + /// Get info for the given target. + /// + /// Any symlink is automatically dereferenced, recursively. + /// A nonexistent or unreachable file returns an Ok status and + /// has a FileType of value NotFound. An error status indicates + /// a truly exceptional condition (low-level I/O error, etc.). + virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0; + /// Same, for many targets at once. + virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>& paths); + /// Same, according to a selector. 
+ /// + /// The selector's base directory will not be part of the results, even if + /// it exists. + /// If it doesn't exist, see `FileSelector::allow_not_found`. + virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0; + + /// EXPERIMENTAL: async version of GetFileInfo + virtual Future<FileInfoVector> GetFileInfoAsync(const std::vector<std::string>& paths); + + /// EXPERIMENTAL: streaming async version of GetFileInfo + /// + /// The returned generator is not async-reentrant, i.e. you need to wait for + /// the returned future to complete before calling the generator again. + virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select); + + /// Create a directory and subdirectories. + /// + /// This function succeeds if the directory already exists. + virtual Status CreateDir(const std::string& path, bool recursive = true) = 0; + + /// Delete a directory and its contents, recursively. + virtual Status DeleteDir(const std::string& path) = 0; + + /// Delete a directory's contents, recursively. + /// + /// Like DeleteDir, but doesn't delete the directory itself. + /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents. + virtual Status DeleteDirContents(const std::string& path) = 0; + + /// EXPERIMENTAL: Delete the root directory's contents, recursively. + /// + /// Implementations may decide to raise an error if this operation is + /// too dangerous. + // NOTE: may decide to remove this if it's deemed not useful + virtual Status DeleteRootDirContents() = 0; + + /// Delete a file. + virtual Status DeleteFile(const std::string& path) = 0; + /// Delete many files. + /// + /// The default implementation issues individual delete operations in sequence. + virtual Status DeleteFiles(const std::vector<std::string>& paths); + + /// Move / rename a file or directory. 
+ /// + /// If the destination exists: + /// - if it is a non-empty directory, an error is returned + /// - otherwise, if it has the same type as the source, it is replaced + /// - otherwise, behavior is unspecified (implementation-dependent). + virtual Status Move(const std::string& src, const std::string& dest) = 0; + + /// Copy a file. + /// + /// If the destination exists and is a directory, an error is returned. + /// Otherwise, it is replaced. + virtual Status CopyFile(const std::string& src, const std::string& dest) = 0; + + /// Open an input stream for sequential reading. + virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream( + const std::string& path) = 0; + /// Open an input stream for sequential reading. + /// + /// This override assumes the given FileInfo validly represents the file's + /// characteristics, and may optimize access depending on them (for example + /// avoid querying the file size or its existence). + virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info); + + /// Open an input file for random access reading. + virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const std::string& path) = 0; + /// Open an input file for random access reading. + /// + /// This override assumes the given FileInfo validly represents the file's + /// characteristics, and may optimize access depending on them (for example + /// avoid querying the file size or its existence). 
+ virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const FileInfo& info); + + /// EXPERIMENTAL: async version of OpenInputStream + virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const std::string& path); + /// EXPERIMENTAL: async version of OpenInputStream + virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const FileInfo& info); + + /// EXPERIMENTAL: async version of OpenInputFile + virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const std::string& path); + /// EXPERIMENTAL: async version of OpenInputFile + virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const FileInfo& info); + + /// Open an output stream for sequential writing. + /// + /// If the target already exists, existing data is truncated. + virtual Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) = 0; + Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(const std::string& path); + + /// Open an output stream for appending. + /// + /// If the target doesn't exist, a new empty file is created. + virtual Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) = 0; + Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const std::string& path); + + protected: + explicit FileSystem(const io::IOContext& io_context = io::default_io_context()) + : io_context_(io_context) {} + + io::IOContext io_context_; + // Whether metadata operations (such as GetFileInfo or OpenInputStream) + // are cheap enough that the default async variants don't bother with + // a thread pool. + bool default_async_is_sync_ = true; +}; + +/// \brief A FileSystem implementation that delegates to another +/// implementation after prepending a fixed base path. 
+/// +/// This is useful to expose a logical view of a subtree of a filesystem, +/// for example a directory in a LocalFileSystem. +/// This works on abstract paths, i.e. paths using forward slashes and +/// and a single root "/". Windows paths are not guaranteed to work. +/// This makes no security guarantee. For example, symlinks may allow to +/// "escape" the subtree and access other parts of the underlying filesystem. +class ARROW_EXPORT SubTreeFileSystem : public FileSystem { + public: + // This constructor may abort if base_path is invalid. + explicit SubTreeFileSystem(const std::string& base_path, + std::shared_ptr<FileSystem> base_fs); + ~SubTreeFileSystem() override; + + std::string type_name() const override { return "subtree"; } + std::string base_path() const { return base_path_; } + std::shared_ptr<FileSystem> base_fs() const { return base_fs_; } + + Result<std::string> NormalizePath(std::string path) override; + + bool Equals(const FileSystem& other) const override; + + /// \cond FALSE + using FileSystem::GetFileInfo; + /// \endcond + Result<FileInfo> GetFileInfo(const std::string& path) override; + Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; + + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result<std::shared_ptr<io::InputStream>> OpenInputStream( + const std::string& path) override; + Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const 
std::string& path) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const FileInfo& info) override; + + Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const std::string& path) override; + Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const FileInfo& info) override; + Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const std::string& path) override; + Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const FileInfo& info) override; + + Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override; + Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override; + + protected: + SubTreeFileSystem() {} + + const std::string base_path_; + std::shared_ptr<FileSystem> base_fs_; + + std::string PrependBase(const std::string& s) const; + Status PrependBaseNonEmpty(std::string* s) const; + Result<std::string> StripBase(const std::string& s) const; + Status FixInfo(FileInfo* info) const; + + static Result<std::string> NormalizeBasePath( + std::string base_path, const std::shared_ptr<FileSystem>& base_fs); +}; + +/// \brief A FileSystem implementation that delegates to another +/// implementation but inserts latencies at various points. 
+class ARROW_EXPORT SlowFileSystem : public FileSystem { + public: + SlowFileSystem(std::shared_ptr<FileSystem> base_fs, + std::shared_ptr<io::LatencyGenerator> latencies); + SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency); + SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency, + int32_t seed); + + std::string type_name() const override { return "slow"; } + bool Equals(const FileSystem& other) const override; + + using FileSystem::GetFileInfo; + Result<FileInfo> GetFileInfo(const std::string& path) override; + Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result<std::shared_ptr<io::InputStream>> OpenInputStream( + const std::string& path) override; + Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const std::string& path) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const FileInfo& info) override; + Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override; + Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override; + + protected: + std::shared_ptr<FileSystem> base_fs_; + std::shared_ptr<io::LatencyGenerator> latencies_; +}; + +/// \defgroup filesystem-factories Functions for creating FileSystem instances 
+/// +/// @{ + +/// \brief Create a new FileSystem by URI +/// +/// Recognized schemes are "file", "mock", "hdfs" and "s3fs". +/// +/// \param[in] uri a URI-based path, ex: file:///some/local/path +/// \param[out] out_path (optional) Path inside the filesystem. +/// \return out_fs FileSystem instance. +ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri, + std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI with a custom IO context +/// +/// Recognized schemes are "file", "mock", "hdfs" and "s3fs". +/// +/// \param[in] uri a URI-based path, ex: file:///some/local/path +/// \param[in] io_context an IOContext which will be associated with the filesystem +/// \param[out] out_path (optional) Path inside the filesystem. +/// \return out_fs FileSystem instance. +ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri, + const io::IOContext& io_context, + std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI +/// +/// Same as FileSystemFromUri, but in addition also recognize non-URIs +/// and treat them as local filesystem paths. Only absolute local filesystem +/// paths are allowed. +ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath( + const std::string& uri, std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI with a custom IO context +/// +/// Same as FileSystemFromUri, but in addition also recognize non-URIs +/// and treat them as local filesystem paths. Only absolute local filesystem +/// paths are allowed. 
+ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath( + const std::string& uri, const io::IOContext& io_context, + std::string* out_path = NULLPTR); + +/// @} + +/// \brief Copy files, including from one FileSystem to another +/// +/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile +/// will be used, otherwise the file will be opened as a stream in both FileSystems and +/// chunks copied from the source to the destination. No directories will be created. +ARROW_EXPORT +Status CopyFiles(const std::vector<FileLocator>& sources, + const std::vector<FileLocator>& destinations, + const io::IOContext& io_context = io::default_io_context(), + int64_t chunk_size = 1024 * 1024, bool use_threads = true); + +/// \brief Copy selected files, including from one FileSystem to another +/// +/// Directories will be created under the destination base directory as needed. +ARROW_EXPORT +Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs, + const FileSelector& source_sel, + const std::shared_ptr<FileSystem>& destination_fs, + const std::string& destination_base_dir, + const io::IOContext& io_context = io::default_io_context(), + int64_t chunk_size = 1024 * 1024, bool use_threads = true); + +struct FileSystemGlobalOptions { + /// Path to a single PEM file holding all TLS CA certificates + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_file_path; + + /// Path to a directory holding TLS CA certificates in individual PEM files + /// named along the OpenSSL "hashed" format. + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_dir_path; +}; + +/// Experimental: optional global initialization routine +/// +/// This is for environments (such as manylinux) where the path +/// to TLS CA certificates needs to be configured at runtime. 
+ARROW_EXPORT +Status Initialize(const FileSystemGlobalOptions& options); + +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/localfs.cc b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/localfs.cc new file mode 100644 index 0000000000..775fd746aa --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/localfs.cc @@ -0,0 +1,448 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <chrono> +#include <cstring> +#include <sstream> +#include <utility> + +#ifdef _WIN32 +#include "arrow/util/windows_compatibility.h" +#else +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/stat.h> +#endif + +#include "arrow/filesystem/localfs.h" +#include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/util_internal.h" +#include "arrow/io/file.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/uri.h" +#include "arrow/util/windows_fixup.h" + +namespace arrow { +namespace fs { + +using ::arrow::internal::IOErrorFromErrno; +#ifdef _WIN32 +using ::arrow::internal::IOErrorFromWinError; +#endif +using ::arrow::internal::NativePathString; +using ::arrow::internal::PlatformFilename; + +namespace internal { + +#ifdef _WIN32 +static bool IsDriveLetter(char c) { + // Can't use locale-dependent functions from the C/C++ stdlib + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +} +#endif + +bool DetectAbsolutePath(const std::string& s) { + // Is it a /-prefixed local path? + if (s.length() >= 1 && s[0] == '/') { + return true; + } +#ifdef _WIN32 + // Is it a \-prefixed local path? + if (s.length() >= 1 && s[0] == '\\') { + return true; + } + // Does it start with a drive letter in addition to being /- or \-prefixed, + // e.g. "C:\..."? + if (s.length() >= 3 && s[1] == ':' && (s[2] == '/' || s[2] == '\\') && + IsDriveLetter(s[0])) { + return true; + } +#endif + return false; +} + +} // namespace internal + +namespace { + +#ifdef _WIN32 + +std::string NativeToString(const NativePathString& ns) { + PlatformFilename fn(ns); + return fn.ToString(); +} + +TimePoint ToTimePoint(FILETIME ft) { + // Hundreds of nanoseconds between January 1, 1601 (UTC) and the Unix epoch. 
+ static constexpr int64_t kFileTimeEpoch = 11644473600LL * 10000000; + + int64_t hundreds = (static_cast<int64_t>(ft.dwHighDateTime) << 32) + ft.dwLowDateTime - + kFileTimeEpoch; // hundreds of ns since Unix epoch + std::chrono::nanoseconds ns_count(100 * hundreds); + return TimePoint(std::chrono::duration_cast<TimePoint::duration>(ns_count)); +} + +FileInfo FileInformationToFileInfo(const BY_HANDLE_FILE_INFORMATION& information) { + FileInfo info; + if (information.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { + info.set_type(FileType::Directory); + info.set_size(kNoSize); + } else { + // Regular file + info.set_type(FileType::File); + info.set_size((static_cast<int64_t>(information.nFileSizeHigh) << 32) + + information.nFileSizeLow); + } + info.set_mtime(ToTimePoint(information.ftLastWriteTime)); + return info; +} + +Result<FileInfo> StatFile(const std::wstring& path) { + HANDLE h; + std::string bytes_path = NativeToString(path); + FileInfo info; + + /* Inspired by CPython, see Modules/posixmodule.c */ + h = CreateFileW(path.c_str(), FILE_READ_ATTRIBUTES, /* desired access */ + 0, /* share mode */ + NULL, /* security attributes */ + OPEN_EXISTING, + /* FILE_FLAG_BACKUP_SEMANTICS is required to open a directory */ + FILE_ATTRIBUTE_NORMAL | FILE_FLAG_BACKUP_SEMANTICS, NULL); + + if (h == INVALID_HANDLE_VALUE) { + DWORD err = GetLastError(); + if (err == ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND) { + info.set_path(bytes_path); + info.set_type(FileType::NotFound); + info.set_mtime(kNoTime); + info.set_size(kNoSize); + return info; + } else { + return IOErrorFromWinError(GetLastError(), "Failed querying information for path '", + bytes_path, "'"); + } + } + BY_HANDLE_FILE_INFORMATION information; + if (!GetFileInformationByHandle(h, &information)) { + CloseHandle(h); + return IOErrorFromWinError(GetLastError(), "Failed querying information for path '", + bytes_path, "'"); + } + CloseHandle(h); + info = FileInformationToFileInfo(information); + 
info.set_path(bytes_path); + return info; +} + +#else // POSIX systems + +TimePoint ToTimePoint(const struct timespec& s) { + std::chrono::nanoseconds ns_count(static_cast<int64_t>(s.tv_sec) * 1000000000 + + static_cast<int64_t>(s.tv_nsec)); + return TimePoint(std::chrono::duration_cast<TimePoint::duration>(ns_count)); +} + +FileInfo StatToFileInfo(const struct stat& s) { + FileInfo info; + if (S_ISREG(s.st_mode)) { + info.set_type(FileType::File); + info.set_size(static_cast<int64_t>(s.st_size)); + } else if (S_ISDIR(s.st_mode)) { + info.set_type(FileType::Directory); + info.set_size(kNoSize); + } else { + info.set_type(FileType::Unknown); + info.set_size(kNoSize); + } +#ifdef __APPLE__ + // macOS doesn't use the POSIX-compliant spelling + info.set_mtime(ToTimePoint(s.st_mtimespec)); +#else + info.set_mtime(ToTimePoint(s.st_mtim)); +#endif + return info; +} + +Result<FileInfo> StatFile(const std::string& path) { + FileInfo info; + struct stat s; + int r = stat(path.c_str(), &s); + if (r == -1) { + if (errno == ENOENT || errno == ENOTDIR || errno == ELOOP) { + info.set_type(FileType::NotFound); + info.set_mtime(kNoTime); + info.set_size(kNoSize); + } else { + return IOErrorFromErrno(errno, "Failed stat()ing path '", path, "'"); + } + } else { + info = StatToFileInfo(s); + } + info.set_path(path); + return info; +} + +#endif + +Status StatSelector(const PlatformFilename& dir_fn, const FileSelector& select, + int32_t nesting_depth, std::vector<FileInfo>* out) { + auto result = ListDir(dir_fn); + if (!result.ok()) { + auto status = result.status(); + if (select.allow_not_found && status.IsIOError()) { + ARROW_ASSIGN_OR_RAISE(bool exists, FileExists(dir_fn)); + if (!exists) { + return Status::OK(); + } + } + return status; + } + + for (const auto& child_fn : *result) { + PlatformFilename full_fn = dir_fn.Join(child_fn); + ARROW_ASSIGN_OR_RAISE(FileInfo info, StatFile(full_fn.ToNative())); + if (info.type() != FileType::NotFound) { + out->push_back(std::move(info)); + } 
+ if (nesting_depth < select.max_recursion && select.recursive && + info.type() == FileType::Directory) { + RETURN_NOT_OK(StatSelector(full_fn, select, nesting_depth + 1, out)); + } + } + return Status::OK(); +} + +} // namespace + +LocalFileSystemOptions LocalFileSystemOptions::Defaults() { + return LocalFileSystemOptions(); +} + +bool LocalFileSystemOptions::Equals(const LocalFileSystemOptions& other) const { + return use_mmap == other.use_mmap; +} + +Result<LocalFileSystemOptions> LocalFileSystemOptions::FromUri( + const ::arrow::internal::Uri& uri, std::string* out_path) { + if (!uri.username().empty() || !uri.password().empty()) { + return Status::Invalid("Unsupported username or password in local URI: '", + uri.ToString(), "'"); + } + std::string path; + const auto host = uri.host(); + if (!host.empty()) { +#ifdef _WIN32 + std::stringstream ss; + ss << "//" << host << "/" << internal::RemoveLeadingSlash(uri.path()); + *out_path = ss.str(); +#else + return Status::Invalid("Unsupported hostname in non-Windows local URI: '", + uri.ToString(), "'"); +#endif + } else { + *out_path = uri.path(); + } + + // TODO handle use_mmap option + return LocalFileSystemOptions(); +} + +LocalFileSystem::LocalFileSystem(const io::IOContext& io_context) + : FileSystem(io_context), options_(LocalFileSystemOptions::Defaults()) {} + +LocalFileSystem::LocalFileSystem(const LocalFileSystemOptions& options, + const io::IOContext& io_context) + : FileSystem(io_context), options_(options) {} + +LocalFileSystem::~LocalFileSystem() {} + +Result<std::string> LocalFileSystem::NormalizePath(std::string path) { + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + return fn.ToString(); +} + +bool LocalFileSystem::Equals(const FileSystem& other) const { + if (other.type_name() != type_name()) { + return false; + } else { + const auto& localfs = ::arrow::internal::checked_cast<const LocalFileSystem&>(other); + return options_.Equals(localfs.options()); + } +} + 
+Result<FileInfo> LocalFileSystem::GetFileInfo(const std::string& path) { + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + return StatFile(fn.ToNative()); +} + +Result<std::vector<FileInfo>> LocalFileSystem::GetFileInfo(const FileSelector& select) { + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(select.base_dir)); + std::vector<FileInfo> results; + RETURN_NOT_OK(StatSelector(fn, select, 0, &results)); + return results; +} + +Status LocalFileSystem::CreateDir(const std::string& path, bool recursive) { + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + if (recursive) { + return ::arrow::internal::CreateDirTree(fn).status(); + } else { + return ::arrow::internal::CreateDir(fn).status(); + } +} + +Status LocalFileSystem::DeleteDir(const std::string& path) { + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + auto st = ::arrow::internal::DeleteDirTree(fn, /*allow_not_found=*/false).status(); + if (!st.ok()) { + // TODO Status::WithPrefix()? 
+ std::stringstream ss; + ss << "Cannot delete directory '" << path << "': " << st.message(); + return st.WithMessage(ss.str()); + } + return Status::OK(); +} + +Status LocalFileSystem::DeleteDirContents(const std::string& path) { + if (internal::IsEmptyPath(path)) { + return internal::InvalidDeleteDirContents(path); + } + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + auto st = ::arrow::internal::DeleteDirContents(fn, /*allow_not_found=*/false).status(); + if (!st.ok()) { + std::stringstream ss; + ss << "Cannot delete directory contents in '" << path << "': " << st.message(); + return st.WithMessage(ss.str()); + } + return Status::OK(); +} + +Status LocalFileSystem::DeleteRootDirContents() { + return Status::Invalid("LocalFileSystem::DeleteRootDirContents is strictly forbidden"); +} + +Status LocalFileSystem::DeleteFile(const std::string& path) { + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + return ::arrow::internal::DeleteFile(fn, /*allow_not_found=*/false).status(); +} + +Status LocalFileSystem::Move(const std::string& src, const std::string& dest) { + ARROW_ASSIGN_OR_RAISE(auto sfn, PlatformFilename::FromString(src)); + ARROW_ASSIGN_OR_RAISE(auto dfn, PlatformFilename::FromString(dest)); + +#ifdef _WIN32 + if (!MoveFileExW(sfn.ToNative().c_str(), dfn.ToNative().c_str(), + MOVEFILE_REPLACE_EXISTING)) { + return IOErrorFromWinError(GetLastError(), "Failed renaming '", sfn.ToString(), + "' to '", dfn.ToString(), "'"); + } +#else + if (rename(sfn.ToNative().c_str(), dfn.ToNative().c_str()) == -1) { + return IOErrorFromErrno(errno, "Failed renaming '", sfn.ToString(), "' to '", + dfn.ToString(), "'"); + } +#endif + return Status::OK(); +} + +Status LocalFileSystem::CopyFile(const std::string& src, const std::string& dest) { + ARROW_ASSIGN_OR_RAISE(auto sfn, PlatformFilename::FromString(src)); + ARROW_ASSIGN_OR_RAISE(auto dfn, PlatformFilename::FromString(dest)); + // XXX should we use fstat() to compare inodes? 
+ if (sfn.ToNative() == dfn.ToNative()) { + return Status::OK(); + } + +#ifdef _WIN32 + if (!CopyFileW(sfn.ToNative().c_str(), dfn.ToNative().c_str(), + FALSE /* bFailIfExists */)) { + return IOErrorFromWinError(GetLastError(), "Failed copying '", sfn.ToString(), + "' to '", dfn.ToString(), "'"); + } + return Status::OK(); +#else + ARROW_ASSIGN_OR_RAISE(auto is, OpenInputStream(src)); + ARROW_ASSIGN_OR_RAISE(auto os, OpenOutputStream(dest)); + RETURN_NOT_OK(internal::CopyStream(is, os, 1024 * 1024 /* chunk_size */, io_context())); + RETURN_NOT_OK(os->Close()); + return is->Close(); +#endif +} + +namespace { + +template <typename InputStreamType> +Result<std::shared_ptr<InputStreamType>> OpenInputStreamGeneric( + const std::string& path, const LocalFileSystemOptions& options, + const io::IOContext& io_context) { + if (options.use_mmap) { + return io::MemoryMappedFile::Open(path, io::FileMode::READ); + } else { + return io::ReadableFile::Open(path, io_context.pool()); + } +} + +} // namespace + +Result<std::shared_ptr<io::InputStream>> LocalFileSystem::OpenInputStream( + const std::string& path) { + return OpenInputStreamGeneric<io::InputStream>(path, options_, io_context()); +} + +Result<std::shared_ptr<io::RandomAccessFile>> LocalFileSystem::OpenInputFile( + const std::string& path) { + return OpenInputStreamGeneric<io::RandomAccessFile>(path, options_, io_context()); +} + +namespace { + +Result<std::shared_ptr<io::OutputStream>> OpenOutputStreamGeneric(const std::string& path, + bool truncate, + bool append) { + int fd; + bool write_only = true; + ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path)); + ARROW_ASSIGN_OR_RAISE( + fd, ::arrow::internal::FileOpenWritable(fn, write_only, truncate, append)); + auto maybe_stream = io::FileOutputStream::Open(fd); + if (!maybe_stream.ok()) { + ARROW_UNUSED(::arrow::internal::FileClose(fd)); + } + return maybe_stream; +} + +} // namespace + +Result<std::shared_ptr<io::OutputStream>> 
LocalFileSystem::OpenOutputStream( + const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) { + bool truncate = true; + bool append = false; + return OpenOutputStreamGeneric(path, truncate, append); +} + +Result<std::shared_ptr<io::OutputStream>> LocalFileSystem::OpenAppendStream( + const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) { + bool truncate = false; + bool append = true; + return OpenOutputStreamGeneric(path, truncate, append); +} + +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/localfs.h b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/localfs.h new file mode 100644 index 0000000000..f8e77aee59 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/localfs.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <memory> +#include <string> +#include <vector> + +#include "arrow/filesystem/filesystem.h" + +namespace arrow { +namespace internal { + +class Uri; + +} + +namespace fs { + +/// Options for the LocalFileSystem implementation. 
+struct ARROW_EXPORT LocalFileSystemOptions {
+  /// Whether OpenInputStream and OpenInputFile return a mmap'ed file,
+  /// or a regular one.
+  bool use_mmap = false;
+
+  /// \brief Initialize with defaults
+  static LocalFileSystemOptions Defaults();
+
+  /// \brief Compare two option sets.
+  // NOTE(review): defined in localfs.cc (not visible here); presumably
+  // compares `use_mmap` -- confirm.
+  bool Equals(const LocalFileSystemOptions& other) const;
+
+  /// \brief Build options from a parsed URI.
+  // NOTE(review): `out_path` is an output parameter; presumably it receives
+  // the filesystem path carried by the URI -- confirm against localfs.cc.
+  static Result<LocalFileSystemOptions> FromUri(const ::arrow::internal::Uri& uri,
+                                                std::string* out_path);
+};
+
+/// \brief A FileSystem implementation accessing files on the local machine.
+///
+/// This class handles only `/`-separated paths. If desired, conversion
+/// from Windows backslash-separated paths should be done by the caller.
+/// Details such as symlinks are abstracted away (symlinks are always
+/// followed, except when deleting an entry).
+class ARROW_EXPORT LocalFileSystem : public FileSystem {
+ public:
+  /// Construct a filesystem; the IO context defaults to io::default_io_context().
+  explicit LocalFileSystem(const io::IOContext& = io::default_io_context());
+  /// Same, with explicit options.
+  explicit LocalFileSystem(const LocalFileSystemOptions&,
+                           const io::IOContext& = io::default_io_context());
+  ~LocalFileSystem() override;
+
+  /// Always "local".
+  std::string type_name() const override { return "local"; }
+
+  Result<std::string> NormalizePath(std::string path) override;
+
+  bool Equals(const FileSystem& other) const override;
+
+  /// Return a copy of the options stored in this filesystem.
+  LocalFileSystemOptions options() const { return options_; }
+
+  /// \cond FALSE
+  using FileSystem::GetFileInfo;
+  /// \endcond
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive = true) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  /// Open `path` for writing.  The definition in localfs.cc forwards to the
+  /// shared output-stream helper with truncate=true, append=false; `metadata`
+  /// is accepted but not passed on to that helper.
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
+  /// Open `path` for appending.  The definition in localfs.cc forwards to the
+  /// shared output-stream helper with truncate=false, append=true; `metadata`
+  /// is accepted but not passed on to that helper.
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
+
+ protected:
+  // Options given at construction (see options()).
+  LocalFileSystemOptions options_;
+};
+
+namespace internal {
+
+// Return whether the string is detected as a local absolute path.
+ARROW_EXPORT
+bool DetectAbsolutePath(const std::string& s);
+
+}  // namespace internal
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/mockfs.cc b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/mockfs.cc
new file mode 100644
index 0000000000..14a38283b2
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/mockfs.cc
@@ -0,0 +1,780 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <iterator>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/filesystem/mockfs.h"
+#include "arrow/filesystem/path_util.h"
+#include "arrow/filesystem/util_internal.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/future.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/variant.h"
+#include "arrow/util/windows_fixup.h"
+
+namespace arrow {
+namespace fs {
+namespace internal {
+
+namespace {
+
+////////////////////////////////////////////////////////////////////////////
+// Filesystem structure
+
+class Entry;
+
+// An in-memory regular file: modification time, name, contents and optional
+// key-value metadata.  The constructor leaves `data` null; contents are
+// attached later (see MockFSOutputStream::Close / Abort).
+struct File {
+  TimePoint mtime;
+  std::string name;
+  std::shared_ptr<Buffer> data;
+  std::shared_ptr<const KeyValueMetadata> metadata;
+
+  File(TimePoint mtime, std::string name) : mtime(mtime), name(std::move(name)) {}
+
+  // Size of the contents in bytes; a file with no buffer counts as empty.
+  int64_t size() const { return data ? data->size() : 0; }
+
+  // Non-owning view of the contents (empty when no buffer is attached).
+  explicit operator util::string_view() const {
+    if (data) {
+      return util::string_view(*data);
+    } else {
+      return "";
+    }
+  }
+};
+
+// An in-memory directory: a name, a modification time and its children,
+// keyed (and therefore ordered) by child name.  Movable, not copyable.
+struct Directory {
+  std::string name;
+  TimePoint mtime;
+  std::map<std::string, std::unique_ptr<Entry>> entries;
+
+  Directory(std::string name, TimePoint mtime) : name(std::move(name)), mtime(mtime) {}
+  Directory(Directory&& other) noexcept
+      : name(std::move(other.name)),
+        mtime(other.mtime),
+        entries(std::move(other.entries)) {}
+
+  Directory& operator=(Directory&& other) noexcept {
+    name = std::move(other.name);
+    mtime = other.mtime;
+    entries = std::move(other.entries);
+    return *this;
+  }
+
+  // Return the child named `s`, or nullptr if there is none.
+  Entry* Find(const std::string& s) {
+    auto it = entries.find(s);
+    if (it != entries.end()) {
+      return it->second.get();
+    } else {
+      return nullptr;
+    }
+  }
+
+  // Insert `entry` under name `s` unless a child with that name already
+  // exists.  Returns true if the insertion took place.
+  bool CreateEntry(const std::string& s, std::unique_ptr<Entry> entry) {
+    DCHECK(!s.empty());
+    auto p = entries.emplace(s, std::move(entry));
+    return p.second;
+  }
+
+  // Insert `entry` under name `s`, replacing (and destroying) any existing
+  // child with that name.
+  void AssignEntry(const std::string& s, std::unique_ptr<Entry> entry) {
+    DCHECK(!s.empty());
+    entries[s] = std::move(entry);
+  }
+
+  // Remove the child named `s`.  Returns true if something was removed.
+  bool DeleteEntry(const std::string& s) { return entries.erase(s) > 0; }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Directory);
+};
+
+// A filesystem entry
+using EntryBase = util::Variant<std::nullptr_t, File, Directory>;
+
+// A filesystem entry holding either a File or a Directory.  Only those two
+// alternatives can be constructed through the explicit constructors below.
+class Entry : public EntryBase {
+ public:
+  Entry(Entry&&) = default;
+  Entry& operator=(Entry&&) = default;
+  explicit Entry(Directory&& v) : EntryBase(std::move(v)) {}
+  explicit Entry(File&& v) : EntryBase(std::move(v)) {}
+
+  bool is_dir() const { return util::holds_alternative<Directory>(*this); }
+
+  bool is_file() const { return util::holds_alternative<File>(*this); }
+
+  Directory& as_dir() { return util::get<Directory>(*this); }
+
+  File& as_file() { return util::get<File>(*this); }
+
+  // Get info for this entry. Note the path() property isn't set.
+  FileInfo GetInfo() {
+    FileInfo info;
+    if (is_dir()) {
+      Directory& dir = as_dir();
+      info.set_type(FileType::Directory);
+      info.set_mtime(dir.mtime);
+    } else {
+      DCHECK(is_file());
+      File& file = as_file();
+      info.set_type(FileType::File);
+      info.set_mtime(file.mtime);
+      info.set_size(file.size());
+    }
+    return info;
+  }
+
+  // Get info for this entry, knowing the parent path.
+  // The path is `base_path` joined with the entry's own name.
+  FileInfo GetInfo(const std::string& base_path) {
+    FileInfo info;
+    if (is_dir()) {
+      Directory& dir = as_dir();
+      info.set_type(FileType::Directory);
+      info.set_mtime(dir.mtime);
+      info.set_path(ConcatAbstractPath(base_path, dir.name));
+    } else {
+      DCHECK(is_file());
+      File& file = as_file();
+      info.set_type(FileType::File);
+      info.set_mtime(file.mtime);
+      info.set_size(file.size());
+      info.set_path(ConcatAbstractPath(base_path, file.name));
+    }
+    return info;
+  }
+
+  // Set the entry name
+  // (only the name stored inside the entry; the key under which a parent
+  // Directory holds this entry is not touched here).
+  void SetName(const std::string& name) {
+    if (is_dir()) {
+      as_dir().name = name;
+    } else {
+      DCHECK(is_file());
+      as_file().name = name;
+    }
+  }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Entry);
+};
+
+////////////////////////////////////////////////////////////////////////////
+// Streams
+
+// Output stream that accumulates written bytes in a BufferBuilder and
+// attaches them to the target File when the stream is closed.
+//
+// NOTE(review): `file_` is a raw, non-owning pointer.  Nothing in this class
+// keeps the File alive, so the stream presumably must not outlive the entry
+// it writes to (e.g. if the file is deleted or replaced) -- confirm.
+class MockFSOutputStream : public io::OutputStream {
+ public:
+  MockFSOutputStream(File* file, MemoryPool* pool)
+      : file_(file), builder_(pool), closed_(false) {}
+
+  ~MockFSOutputStream() override = default;
+
+  // Implement the OutputStream interface
+
+  // Publish the accumulated bytes as the file's contents.  Closing an
+  // already-closed stream is a no-op.
+  Status Close() override {
+    if (!closed_) {
+      RETURN_NOT_OK(builder_.Finish(&file_->data));
+      closed_ = true;
+    }
+    return Status::OK();
+  }
+
+  Status Abort() override {
+    if (!closed_) {
+      // MockFSOutputStream is mainly used for debugging and testing, so
+      // mark an aborted file's contents explicitly.
+      // NOTE(review): the byte count in the message is file_->size(), i.e.
+      // the size of the data currently attached to the File, not the number
+      // of bytes appended to builder_ -- confirm this is intended.
+      std::stringstream ss;
+      ss << "MockFSOutputStream aborted after " << file_->size() << " bytes written";
+      file_->data = Buffer::FromString(ss.str());
+      closed_ = true;
+    }
+    return Status::OK();
+  }
+
+  bool closed() const override { return closed_; }
+
+  // Number of bytes accumulated so far; an error once the stream is closed.
+  Result<int64_t> Tell() const override {
+    if (closed_) {
+      return Status::Invalid("Invalid operation on closed stream");
+    }
+    return builder_.length();
+  }
+
+  // Append `nbytes` bytes to the pending contents; an error once closed.
+  Status Write(const void* data, int64_t nbytes) override {
+    if (closed_) {
+      return Status::Invalid("Invalid operation on closed stream");
+    }
+    return builder_.Append(data, nbytes);
+  }
+
+ protected:
+  File* file_;
+  BufferBuilder builder_;
+  bool closed_;
+};
+
+// Reader over a File's buffer that also exposes the File's metadata.
+// The buffer and metadata pointers are captured at construction time.
+class MockFSInputStream : public io::BufferReader {
+ public:
+  explicit MockFSInputStream(const File& file)
+      : io::BufferReader(file.data), metadata_(file.metadata) {}
+
+  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override {
+    return metadata_;
+  }
+
+ protected:
+  std::shared_ptr<const KeyValueMetadata> metadata_;
+};
+
+}  // namespace
+
+// Debug printers for the dump records returned by AllDirs() / AllFiles().
+std::ostream& operator<<(std::ostream& os, const MockDirInfo& di) {
+  return os << "'" << di.full_path << "' [mtime=" << di.mtime.time_since_epoch().count()
+            << "]";
+}
+
+std::ostream& operator<<(std::ostream& os, const MockFileInfo& di) {
+  return os << "'" << di.full_path << "' [mtime=" << di.mtime.time_since_epoch().count()
+            << ", size=" << di.data.length() << "]";
+}
+
+////////////////////////////////////////////////////////////////////////////
+// MockFileSystem implementation
+
+// Private state of MockFileSystem: the directory tree, the timestamp given to
+// newly created or rewritten entries, and the mutex callers take (through
+// lock_guard()) before touching the tree.  The member functions below do not
+// lock by themselves.
+class MockFileSystem::Impl {
+ public:
+  TimePoint current_time;
+  MemoryPool* pool;
+
+  // The root directory
+  Entry root;
+  std::mutex mutex;
+
+  Impl(TimePoint current_time, MemoryPool* pool)
+      : current_time(current_time), pool(pool), root(Directory("", current_time)) {}
+
+  // Acquire the filesystem mutex; it is released when the returned lock dies.
+  std::unique_lock<std::mutex> lock_guard() {
+    return std::unique_lock<std::mutex>(mutex);
+  }
+
+  Directory& RootDir() { return root.as_dir(); }
+
+  // Walk down from the root following the path components in [it, end).
+  // Stops at the first missing component or as soon as a file is reached.
+  // Returns the deepest entry found (never null; the root if nothing
+  // matched) and stores the number of components matched in `*nconsumed`.
+  template <typename It>
+  Entry* FindEntry(It it, It end, size_t* nconsumed) {
+    size_t consumed = 0;
+    Entry* entry = &root;
+
+    for (; it != end; ++it) {
+      const std::string& part = *it;
+      DCHECK(entry->is_dir());
+      Entry* child = entry->as_dir().Find(part);
+      if (child == nullptr) {
+        // Partial find only
+        break;
+      }
+      ++consumed;
+      entry = child;
+      if (entry->is_file()) {
+        // Cannot go any further
+        break;
+      }
+      // Recurse
+    }
+    *nconsumed = consumed;
+    return entry;
+  }
+
+  // Find an entry, allowing partial matching
+  Entry* FindEntry(const std::vector<std::string>& parts, size_t* nconsumed) {
+    return FindEntry(parts.begin(), parts.end(), nconsumed);
+  }
+
+  // Find an entry, only full matching allowed
+  // (returns nullptr unless every component was matched).
+  Entry* FindEntry(const std::vector<std::string>& parts) {
+    size_t consumed;
+    auto entry = FindEntry(parts, &consumed);
+    return (consumed == parts.size()) ? entry : nullptr;
+  }
+
+  // Find the parent entry, only full matching allowed
+  // Returns nullptr for an empty path (the root has no parent) or when some
+  // component before the last one is missing.  The result may be a file;
+  // callers check is_dir() themselves.
+  Entry* FindParent(const std::vector<std::string>& parts) {
+    if (parts.size() == 0) {
+      return nullptr;
+    }
+    size_t consumed;
+    auto last = parts.end();
+    last--;
+    auto entry = FindEntry(parts.begin(), last, &consumed);
+    return (consumed == parts.size() - 1) ? entry : nullptr;
+  }
+
+  // Append to `infos` the info of every child of `base_dir`, with paths
+  // built on `base_path`.  Descends into sub-directories when
+  // `select.recursive` is set and `nesting_depth` is still below
+  // `select.max_recursion`.
+  void GatherInfos(const FileSelector& select, const std::string& base_path,
+                   const Directory& base_dir, int32_t nesting_depth,
+                   std::vector<FileInfo>* infos) {
+    for (const auto& pair : base_dir.entries) {
+      Entry* child = pair.second.get();
+      infos->push_back(child->GetInfo(base_path));
+      if (select.recursive && nesting_depth < select.max_recursion && child->is_dir()) {
+        Directory& child_dir = child->as_dir();
+        // Copy the path first: the recursive call grows `infos`, which may
+        // invalidate references into it.
+        std::string child_path = infos->back().path();
+        GatherInfos(select, std::move(child_path), child_dir, nesting_depth + 1, infos);
+      }
+    }
+  }
+
+  // Append to `out` one record per directory below (and including) `dir`,
+  // parents before children.  A directory whose full path is empty (the
+  // root) is not reported.
+  void DumpDirs(const std::string& prefix, const Directory& dir,
+                std::vector<MockDirInfo>* out) {
+    std::string path = prefix + dir.name;
+    if (!path.empty()) {
+      out->push_back({path, dir.mtime});
+      path += "/";
+    }
+    for (const auto& pair : dir.entries) {
+      Entry* child = pair.second.get();
+      if (child->is_dir()) {
+        DumpDirs(path, child->as_dir(), out);
+      }
+    }
+  }
+
+  // Append to `out` one record per file found below `dir`.  The `data` view
+  // of each record borrows from the file's buffer; it is not a copy.
+  void DumpFiles(const std::string& prefix, const Directory& dir,
+                 std::vector<MockFileInfo>* out) {
+    std::string path = prefix + dir.name;
+    if (!path.empty()) {
+      path += "/";
+    }
+    for (const auto& pair : dir.entries) {
+      Entry* child = pair.second.get();
+      if (child->is_file()) {
+        auto& file = child->as_file();
+        out->push_back({path + file.name, file.mtime, util::string_view(file)});
+      } else if (child->is_dir()) {
+        DumpFiles(path, child->as_dir(), out);
+      }
+    }
+  }
+
+  // Open a stream writing to the file at `path`, creating the file if it
+  // does not exist yet.  The parent directory must exist and `path` must not
+  // designate a directory.  An existing file gets its mtime refreshed; in
+  // both cases the file's metadata is replaced by `metadata`.  With
+  // `append`, the current contents are first copied into the new stream.
+  // The file's contents only change once the stream is closed or aborted.
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path, bool append,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) {
+    auto parts = SplitAbstractPath(path);
+    RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+    Entry* parent = FindParent(parts);
+    if (parent == nullptr || !parent->is_dir()) {
+      return PathNotFound(path);
+    }
+    // Find the file in the parent dir, or create it
+    const auto& name = parts.back();
+    Entry* child = parent->as_dir().Find(name);
+    File* file;
+    if (child == nullptr) {
+      // Ownership of the new entry passes to the parent directory on the
+      // next line; `child` stays valid as a non-owning pointer.
+      child = new Entry(File(current_time, name));
+      parent->as_dir().AssignEntry(name, std::unique_ptr<Entry>(child));
+      file = &child->as_file();
+    } else if (child->is_file()) {
+      file = &child->as_file();
+      file->mtime = current_time;
+    } else {
+      return NotAFile(path);
+    }
+    file->metadata = metadata;
+    auto ptr = std::make_shared<MockFSOutputStream>(file, pool);
+    if (append && file->data) {
+      RETURN_NOT_OK(ptr->Write(file->data->data(), file->data->size()));
+    }
+    return ptr;
+  }
+
+  // Open a reader over the file at `path`.  Fails if the path does not
+  // exist or is not a file.
+  Result<std::shared_ptr<io::BufferReader>> OpenInputReader(const std::string& path) {
+    auto parts = SplitAbstractPath(path);
+    RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+    Entry* entry = FindEntry(parts);
+    if (entry == nullptr) {
+      return PathNotFound(path);
+    }
+    if (!entry->is_file()) {
+      return NotAFile(path);
+    }
+    return std::make_shared<MockFSInputStream>(entry->as_file());
+  }
+};
+
+MockFileSystem::~MockFileSystem() = default;
+
+MockFileSystem::MockFileSystem(TimePoint current_time, const io::IOContext& io_context) {
+  impl_ = std::unique_ptr<Impl>(new Impl(current_time, io_context.pool()));
+}
+
+// Two mock filesystems are equal only if they are the same object.
+bool MockFileSystem::Equals(const FileSystem& other) const { return this == &other; }
+
+// Create the directory `path`.  Already existing leading components are
+// reused; with `recursive` every missing component is created, otherwise at
+// most the last one may be missing.  Fails if an existing component on the
+// way is a file.
+Status MockFileSystem::CreateDir(const std::string& path, bool recursive) {
+  auto parts = SplitAbstractPath(path);
+  RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+  auto guard = impl_->lock_guard();
+
+  size_t consumed;
+  Entry* entry = impl_->FindEntry(parts, &consumed);
+  if (!entry->is_dir()) {
+    auto file_path = JoinAbstractPath(parts.begin(), parts.begin() + consumed);
+    return Status::IOError("Cannot create directory '", path, "': ", "ancestor '",
+                           file_path, "' is not a directory");
+  }
+  if (!recursive && (parts.size() - consumed) > 1) {
+    return Status::IOError("Cannot create directory '", path,
+                           "': ", "parent does not exist");
+  }
+  for (size_t i = consumed; i < parts.size(); ++i) {
+    const auto& name = parts[i];
+    std::unique_ptr<Entry> child(new Entry(Directory(name, impl_->current_time)));
+    Entry* child_ptr = child.get();
+    bool inserted = entry->as_dir().CreateEntry(name, std::move(child));
+    // No race condition on insertion is possible, as all operations are locked
+    DCHECK(inserted);
+    entry = child_ptr;
+  }
+  return Status::OK();
+}
+
+// Delete the directory `path` together with everything below it.
+Status MockFileSystem::DeleteDir(const std::string& path) {
+  auto parts = SplitAbstractPath(path);
+  RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+  auto guard = impl_->lock_guard();
+
+  Entry* parent = impl_->FindParent(parts);
+  if (parent == nullptr || !parent->is_dir()) {
+    return PathNotFound(path);
+  }
+  Directory& parent_dir = parent->as_dir();
+  auto child = parent_dir.Find(parts.back());
+  if (child == nullptr) {
+    return PathNotFound(path);
+  }
+  if (!child->is_dir()) {
+    return NotADir(path);
+  }
+
+  bool deleted = parent_dir.DeleteEntry(parts.back());
+  DCHECK(deleted);
+  return Status::OK();
+}
+
+// Delete everything below the directory `path`, keeping the directory.
+Status MockFileSystem::DeleteDirContents(const std::string& path) {
+  auto parts = SplitAbstractPath(path);
+  RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+  auto guard = impl_->lock_guard();
+
+  if (parts.empty()) {
+    // An empty path designates the root.  The filesystem is NOT wiped here:
+    // the status built by InvalidDeleteDirContents() is returned instead
+    // (presumably an error -- defined in util_internal).  Wiping the root is
+    // what DeleteRootDirContents() does.
+    return internal::InvalidDeleteDirContents(path);
+  }
+
+  Entry* entry = impl_->FindEntry(parts);
+  if (entry == nullptr) {
+    return PathNotFound(path);
+  }
+  if (!entry->is_dir()) {
+    return NotADir(path);
+  }
+  entry->as_dir().entries.clear();
+  return Status::OK();
+}
+
+// Delete every entry of the filesystem, keeping only the root directory.
+Status MockFileSystem::DeleteRootDirContents() {
+  auto guard = impl_->lock_guard();
+
+  impl_->RootDir().entries.clear();
+  return Status::OK();
+}
+
+// Delete the file `path`.  Fails if it does not exist or is not a file.
+Status MockFileSystem::DeleteFile(const std::string& path) {
+  auto parts = SplitAbstractPath(path);
+  RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+  auto guard = impl_->lock_guard();
+
+  Entry* parent = impl_->FindParent(parts);
+  if (parent == nullptr || !parent->is_dir()) {
+    return PathNotFound(path);
+  }
+  Directory& parent_dir = parent->as_dir();
+  auto child = parent_dir.Find(parts.back());
+  if (child == nullptr) {
+    return PathNotFound(path);
+  }
+  if (!child->is_file()) {
+    return NotAFile(path);
+  }
+  bool deleted = parent_dir.DeleteEntry(parts.back());
+  DCHECK(deleted);
+  return Status::OK();
+}
+
+// Return the info for `path`.  A missing entry is not an error: it is
+// reported with type FileType::NotFound.  The returned path is `path`
+// exactly as given by the caller.
+Result<FileInfo> MockFileSystem::GetFileInfo(const std::string& path) {
+  auto parts = SplitAbstractPath(path);
+  RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+  auto guard = impl_->lock_guard();
+
+  FileInfo info;
+  Entry* entry = impl_->FindEntry(parts);
+  if (entry == nullptr) {
+    info.set_type(FileType::NotFound);
+  } else {
+    info = entry->GetInfo();
+  }
+  info.set_path(path);
+  return info;
+}
+
+// Return the infos of the entries below `selector.base_dir`.  A missing base
+// directory yields an empty result when `selector.allow_not_found` is set and
+// an error otherwise; a base path that is not a directory is an error.
+Result<FileInfoVector> MockFileSystem::GetFileInfo(const FileSelector& selector) {
+  auto parts = SplitAbstractPath(selector.base_dir);
+  RETURN_NOT_OK(ValidateAbstractPathParts(parts));
+
+  auto guard = impl_->lock_guard();
+
+  FileInfoVector results;
+
+  Entry* base_dir = impl_->FindEntry(parts);
+  if (base_dir == nullptr) {
+    // Base directory does not exist
+    if (selector.allow_not_found) {
+      return results;
+    } else {
+      return PathNotFound(selector.base_dir);
+    }
+  }
+  if (!base_dir->is_dir()) {
+    return NotADir(selector.base_dir);
+  }
+
+  impl_->GatherInfos(selector, selector.base_dir, base_dir->as_dir(), 0, &results);
+  return results;
+}
+
+namespace {
+
+// Helper for binary operations (move, copy)
+// Run() validates both paths, takes the filesystem lock, resolves both
+// parent directories and hands the resolved context to `op_func`, which
+// therefore executes with the lock held.  `src_entry` / `dest_entry` are
+// null when the corresponding entry does not exist.
+struct BinaryOp {
+  std::vector<std::string> src_parts;
+  std::vector<std::string> dest_parts;
+  Directory& src_dir;
+  Directory& dest_dir;
+  std::string src_name;
+  std::string dest_name;
+  Entry* src_entry;
+  Entry* dest_entry;
+
+  template <typename OpFunc>
+  static Status Run(MockFileSystem::Impl* impl, const std::string& src,
+                    const std::string& dest, OpFunc&& op_func) {
+    auto src_parts = SplitAbstractPath(src);
+    auto dest_parts = SplitAbstractPath(dest);
+    RETURN_NOT_OK(ValidateAbstractPathParts(src_parts));
+    RETURN_NOT_OK(ValidateAbstractPathParts(dest_parts));
+
+    auto guard = impl->lock_guard();
+
+    // Both source and destination must have valid parents
+    Entry* src_parent = impl->FindParent(src_parts);
+    if (src_parent == nullptr || !src_parent->is_dir()) {
+      return PathNotFound(src);
+    }
+    Entry* dest_parent = impl->FindParent(dest_parts);
+    if (dest_parent == nullptr || !dest_parent->is_dir()) {
+      return PathNotFound(dest);
+    }
+    Directory& src_dir = src_parent->as_dir();
+    Directory& dest_dir = dest_parent->as_dir();
+    DCHECK_GE(src_parts.size(), 1);
+    DCHECK_GE(dest_parts.size(), 1);
+    const auto& src_name = src_parts.back();
+    const auto& dest_name = dest_parts.back();
+
+    // NOTE(review): `src_name` / `dest_name` refer to elements of the vectors
+    // that are moved from in the first two initializers below.  They are
+    // read afterwards, which relies on vector move construction keeping
+    // element references valid -- worth keeping in mind when editing.
+    BinaryOp op{std::move(src_parts),
+                std::move(dest_parts),
+                src_dir,
+                dest_dir,
+                src_name,
+                dest_name,
+                src_dir.Find(src_name),
+                dest_dir.Find(dest_name)};
+
+    return op_func(std::move(op));
+  }
+};
+
+}  // namespace
+
+// Move (rename) `src` to `dest`.  An existing destination file is replaced,
+// but only by a file; an existing destination directory is never replaced.
+// Moving an entry into one of its own descendants is refused.
+Status MockFileSystem::Move(const std::string& src, const std::string& dest) {
+  return BinaryOp::Run(impl_.get(), src, dest, [&](const BinaryOp& op) -> Status {
+    if (op.src_entry == nullptr) {
+      return PathNotFound(src);
+    }
+    if (op.dest_entry != nullptr) {
+      if (op.dest_entry->is_dir()) {
+        return Status::IOError("Cannot replace destination '", dest,
+                               "', which is a directory");
+      }
+      if (op.dest_entry->is_file() && op.src_entry->is_dir()) {
+        return Status::IOError("Cannot replace destination '", dest,
+                               "', which is a file, with directory '", src, "'");
+      }
+    }
+    if (op.src_parts.size() < op.dest_parts.size()) {
+      // Check if dest is a child of src
+      auto p =
+          std::mismatch(op.src_parts.begin(), op.src_parts.end(), op.dest_parts.begin());
+      if (p.first == op.src_parts.end()) {
+        return Status::IOError("Cannot move '", src, "' into child path '", dest, "'");
+      }
+    }
+
+    // Move original entry, fix its name
+    // (the moved-from source entry is then erased from its directory before
+    // the new entry is stored under the destination name).
+    std::unique_ptr<Entry> new_entry(new Entry(std::move(*op.src_entry)));
+    new_entry->SetName(op.dest_name);
+    bool deleted = op.src_dir.DeleteEntry(op.src_name);
+    DCHECK(deleted);
+    op.dest_dir.AssignEntry(op.dest_name, std::move(new_entry));
+    return Status::OK();
+  });
+}
+
+// Copy the file `src` to `dest`, replacing an existing destination file.
+// The source must be a file and the destination must not be a directory.
+Status MockFileSystem::CopyFile(const std::string& src, const std::string& dest) {
+  return BinaryOp::Run(impl_.get(), src, dest, [&](const BinaryOp& op) -> Status {
+    if (op.src_entry == nullptr) {
+      return PathNotFound(src);
+    }
+    if (!op.src_entry->is_file()) {
+      return NotAFile(src);
+    }
+    if (op.dest_entry != nullptr && op.dest_entry->is_dir()) {
+      return Status::IOError("Cannot replace destination '", dest,
+                             "', which is a directory");
+    }
+
+    // Copy original entry, fix its name
+    // (File's copy shares the data buffer and metadata through shared_ptr).
+    std::unique_ptr<Entry> new_entry(new Entry(File(op.src_entry->as_file())));
+    new_entry->SetName(op.dest_name);
+    op.dest_dir.AssignEntry(op.dest_name, std::move(new_entry));
+    return Status::OK();
+  });
+}
+
+// The four Open* methods below take the filesystem lock and delegate to Impl.
+
+Result<std::shared_ptr<io::InputStream>> MockFileSystem::OpenInputStream(
+    const std::string& path) {
+  auto guard = impl_->lock_guard();
+
+  return impl_->OpenInputReader(path);
+}
+
+Result<std::shared_ptr<io::RandomAccessFile>> MockFileSystem::OpenInputFile(
+    const std::string& path) {
+  auto guard = impl_->lock_guard();
+
+  return impl_->OpenInputReader(path);
+}
+
+Result<std::shared_ptr<io::OutputStream>> MockFileSystem::OpenOutputStream(
+    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+  auto guard = impl_->lock_guard();
+
+  return impl_->OpenOutputStream(path, /*append=*/false, metadata);
+}
+
+Result<std::shared_ptr<io::OutputStream>> MockFileSystem::OpenAppendStream(
+    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+  auto guard = impl_->lock_guard();
+
+  return impl_->OpenOutputStream(path, /*append=*/true, metadata);
+}
+
+// Dump every directory of the filesystem (the root itself is not listed).
+std::vector<MockDirInfo> MockFileSystem::AllDirs() {
+  auto guard = impl_->lock_guard();
+
+  std::vector<MockDirInfo> result;
+  impl_->DumpDirs("", impl_->RootDir(), &result);
+  return result;
+}
+
+// Dump every file of the filesystem.  The `data` views in the result borrow
+// from the files' buffers.
+std::vector<MockFileInfo> MockFileSystem::AllFiles() {
+  auto guard = impl_->lock_guard();
+
+  std::vector<MockFileInfo> result;
+  impl_->DumpFiles("", impl_->RootDir(), &result);
+  return result;
+}
+
+// Create (or overwrite) the file `path` with `contents`.  When `path` has a
+// non-empty parent, that directory is created first through CreateDir(),
+// which receives `recursive`.
+Status MockFileSystem::CreateFile(const std::string& path, util::string_view contents,
+                                  bool recursive) {
+  auto parent = fs::internal::GetAbstractPathParent(path).first;
+
+  if (parent != "") {
+    RETURN_NOT_OK(CreateDir(parent, recursive));
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto file, OpenOutputStream(path));
+  RETURN_NOT_OK(file->Write(contents));
+  return file->Close();
+}
+
+// Build a filesystem containing an empty file or a directory for each entry
+// of `infos`, creating parents as needed.  Infos whose type is neither File
+// nor Directory are ignored.
+Result<std::shared_ptr<FileSystem>> MockFileSystem::Make(
+    TimePoint current_time, const std::vector<FileInfo>& infos) {
+  auto fs = std::make_shared<MockFileSystem>(current_time);
+  for (const auto& info : infos) {
+    switch (info.type()) {
+      case FileType::Directory:
+        RETURN_NOT_OK(fs->CreateDir(info.path(), /*recursive*/ true));
+        break;
+      case FileType::File:
+        RETURN_NOT_OK(fs->CreateFile(info.path(), "", /*recursive*/ true));
+        break;
+      default:
+        break;
+    }
+  }
+
+  return fs;
+}
+
+// Run the selector-based GetFileInfo() and wrap its outcome in a generator:
+// on success each FileInfo is delivered in a chunk of its own, on failure
+// the generator is built from the failed result.
+FileInfoGenerator MockAsyncFileSystem::GetFileInfoGenerator(const FileSelector& select) {
+  auto maybe_infos = GetFileInfo(select);
+  if (maybe_infos.ok()) {
+    // Return the FileInfo entries one by one
+    const auto& infos = *maybe_infos;
+    std::vector<FileInfoVector> chunks(infos.size());
+    std::transform(infos.begin(), infos.end(), chunks.begin(),
+                   [](const FileInfo& info) { return FileInfoVector{info}; });
+    return MakeVectorGenerator(std::move(chunks));
+  } else {
+    return MakeFailingGenerator(maybe_infos);
+  }
+}
+
+}  // namespace internal
+}  // namespace fs
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/mockfs.h b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/mockfs.h
new file mode 100644
index 0000000000..378f30d295
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/mockfs.h
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/windows_fixup.h"
+
+namespace arrow {
+namespace fs {
+namespace internal {
+
+// Record describing one directory of a mock filesystem dump: its full path
+// and modification time.
+struct MockDirInfo {
+  std::string full_path;
+  TimePoint mtime;
+
+  bool operator==(const MockDirInfo& other) const {
+    return mtime == other.mtime && full_path == other.full_path;
+  }
+
+  friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const MockDirInfo&);
+};
+
+// Record describing one file of a mock filesystem dump: its full path,
+// modification time and contents.  `data` is a non-owning view.
+struct MockFileInfo {
+  std::string full_path;
+  TimePoint mtime;
+  util::string_view data;
+
+  bool operator==(const MockFileInfo& other) const {
+    return mtime == other.mtime && full_path == other.full_path && data == other.data;
+  }
+
+  friend ARROW_EXPORT std::ostream& operator<<(std::ostream&, const MockFileInfo&);
+};
+
+/// A mock FileSystem implementation that holds its contents in memory.
+///
+/// Useful for validating the FileSystem API, writing conformance suite,
+/// and bootstrapping FileSystem-based APIs.
+class ARROW_EXPORT MockFileSystem : public FileSystem {
+ public:
+  /// \param current_time timestamp given to entries created or rewritten
+  ///        through this filesystem
+  explicit MockFileSystem(TimePoint current_time,
+                          const io::IOContext& = io::default_io_context());
+  ~MockFileSystem() override;
+
+  /// Always "mock".
+  std::string type_name() const override { return "mock"; }
+
+  /// Identity comparison: true only when `other` is this very object.
+  bool Equals(const FileSystem& other) const override;
+
+  // XXX It's not very practical to have to explicitly declare inheritance
+  // of default overrides.
+  using FileSystem::GetFileInfo;
+  /// A missing path is reported with type FileType::NotFound, not as an error.
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive = true) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  /// Written data replaces the file's contents when the stream is closed.
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
+  /// The stream starts out holding a copy of the file's current contents.
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
+
+  // Contents-dumping helpers to ease testing.
+  // Output is lexicographically-ordered by full path.
+  // The `data` views returned by AllFiles() borrow from the files' buffers.
+  std::vector<MockDirInfo> AllDirs();
+  std::vector<MockFileInfo> AllFiles();
+
+  // Create a File with a content from a string.
+  // A non-empty parent directory is created first; `recursive` is passed
+  // on to that CreateDir() call.
+  Status CreateFile(const std::string& path, util::string_view content,
+                    bool recursive = true);
+
+  // Create a MockFileSystem out of (empty) FileInfo. The content of every
+  // file is empty and of size 0. All directories will be created recursively.
+  // Infos that are neither files nor directories are ignored.
+  static Result<std::shared_ptr<FileSystem>> Make(TimePoint current_time,
+                                                  const std::vector<FileInfo>& infos);
+
+  class Impl;
+
+ protected:
+  std::unique_ptr<Impl> impl_;
+};
+
+/// A MockFileSystem that clears `default_async_is_sync_` and provides its own
+/// GetFileInfoGenerator(), which delivers each FileInfo in a separate chunk.
+class ARROW_EXPORT MockAsyncFileSystem : public MockFileSystem {
+ public:
+  explicit MockAsyncFileSystem(TimePoint current_time,
+                               const io::IOContext& io_context = io::default_io_context())
+      : MockFileSystem(current_time, io_context) {
+    default_async_is_sync_ = false;
+  }
+
+  FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
+};
+
+}  // namespace internal
+}  // namespace fs
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/path_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/path_util.cc
new file mode 100644
index 0000000000..f1bd5c087b
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/path_util.cc
@@ -0,0 +1,271 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include <algorithm> + +#include "arrow/filesystem/path_util.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/logging.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace fs { +namespace internal { + +// XXX How does this encode Windows UNC paths? + +std::vector<std::string> SplitAbstractPath(const std::string& path) { + std::vector<std::string> parts; + auto v = util::string_view(path); + // Strip trailing slash + if (v.length() > 0 && v.back() == kSep) { + v = v.substr(0, v.length() - 1); + } + // Strip leading slash + if (v.length() > 0 && v.front() == kSep) { + v = v.substr(1); + } + if (v.length() == 0) { + return parts; + } + + auto append_part = [&parts, &v](size_t start, size_t end) { + parts.push_back(std::string(v.substr(start, end - start))); + }; + + size_t start = 0; + while (true) { + size_t end = v.find_first_of(kSep, start); + append_part(start, end); + if (end == std::string::npos) { + break; + } + start = end + 1; + } + return parts; +} + +std::pair<std::string, std::string> GetAbstractPathParent(const std::string& s) { + // XXX should strip trailing slash? 
+ + auto pos = s.find_last_of(kSep); + if (pos == std::string::npos) { + // Empty parent + return {{}, s}; + } + return {s.substr(0, pos), s.substr(pos + 1)}; +} + +std::string GetAbstractPathExtension(const std::string& s) { + util::string_view basename(s); + auto offset = basename.find_last_of(kSep); + if (offset != std::string::npos) { + basename = basename.substr(offset); + } + auto dot = basename.find_last_of('.'); + if (dot == util::string_view::npos) { + // Empty extension + return ""; + } + return std::string(basename.substr(dot + 1)); +} + +Status ValidateAbstractPathParts(const std::vector<std::string>& parts) { + for (const auto& part : parts) { + if (part.length() == 0) { + return Status::Invalid("Empty path component"); + } + if (part.find_first_of(kSep) != std::string::npos) { + return Status::Invalid("Separator in component '", part, "'"); + } + } + return Status::OK(); +} + +std::string ConcatAbstractPath(const std::string& base, const std::string& stem) { + DCHECK(!stem.empty()); + if (base.empty()) { + return stem; + } + return EnsureTrailingSlash(base) + std::string(RemoveLeadingSlash(stem)); +} + +std::string EnsureTrailingSlash(util::string_view v) { + if (v.length() > 0 && v.back() != kSep) { + // XXX How about "C:" on Windows? We probably don't want to turn it into "C:/"... + // Unless the local filesystem always uses absolute paths + return std::string(v) + kSep; + } else { + return std::string(v); + } +} + +std::string EnsureLeadingSlash(util::string_view v) { + if (v.length() == 0 || v.front() != kSep) { + // XXX How about "C:" on Windows? We probably don't want to turn it into "/C:"... 
+ return kSep + std::string(v); + } else { + return std::string(v); + } +} +util::string_view RemoveTrailingSlash(util::string_view key) { + while (!key.empty() && key.back() == kSep) { + key.remove_suffix(1); + } + return key; +} + +util::string_view RemoveLeadingSlash(util::string_view key) { + while (!key.empty() && key.front() == kSep) { + key.remove_prefix(1); + } + return key; +} + +Result<std::string> MakeAbstractPathRelative(const std::string& base, + const std::string& path) { + if (base.empty() || base.front() != kSep) { + return Status::Invalid("MakeAbstractPathRelative called with non-absolute base '", + base, "'"); + } + auto b = EnsureLeadingSlash(RemoveTrailingSlash(base)); + auto p = util::string_view(path); + if (p.substr(0, b.size()) != util::string_view(b)) { + return Status::Invalid("Path '", path, "' is not relative to '", base, "'"); + } + p = p.substr(b.size()); + if (!p.empty() && p.front() != kSep && b.back() != kSep) { + return Status::Invalid("Path '", path, "' is not relative to '", base, "'"); + } + return std::string(RemoveLeadingSlash(p)); +} + +bool IsAncestorOf(util::string_view ancestor, util::string_view descendant) { + ancestor = RemoveTrailingSlash(ancestor); + if (ancestor == "") { + // everything is a descendant of the root directory + return true; + } + + descendant = RemoveTrailingSlash(descendant); + if (!descendant.starts_with(ancestor)) { + // an ancestor path is a prefix of descendant paths + return false; + } + + descendant.remove_prefix(ancestor.size()); + + if (descendant.empty()) { + // "/hello" is an ancestor of "/hello" + return true; + } + + // "/hello/w" is not an ancestor of "/hello/world" + return descendant.starts_with(std::string{kSep}); +} + +util::optional<util::string_view> RemoveAncestor(util::string_view ancestor, + util::string_view descendant) { + if (!IsAncestorOf(ancestor, descendant)) { + return util::nullopt; + } + + auto relative_to_ancestor = descendant.substr(ancestor.size()); + return 
RemoveLeadingSlash(relative_to_ancestor); +} + +std::vector<std::string> AncestorsFromBasePath(util::string_view base_path, + util::string_view descendant) { + std::vector<std::string> ancestry; + if (auto relative = RemoveAncestor(base_path, descendant)) { + auto relative_segments = fs::internal::SplitAbstractPath(std::string(*relative)); + + // the last segment indicates descendant + relative_segments.pop_back(); + + if (relative_segments.empty()) { + // no missing parent + return {}; + } + + for (auto&& relative_segment : relative_segments) { + ancestry.push_back(JoinAbstractPath( + std::vector<std::string>{std::string(base_path), std::move(relative_segment)})); + base_path = ancestry.back(); + } + } + return ancestry; +} + +std::vector<std::string> MinimalCreateDirSet(std::vector<std::string> dirs) { + std::sort(dirs.begin(), dirs.end()); + + for (auto ancestor = dirs.begin(); ancestor != dirs.end(); ++ancestor) { + auto descendant = ancestor; + auto descendants_end = descendant + 1; + + while (descendants_end != dirs.end() && IsAncestorOf(*descendant, *descendants_end)) { + ++descendant; + ++descendants_end; + } + + ancestor = dirs.erase(ancestor, descendants_end - 1); + } + + // the root directory need not be created + if (dirs.size() == 1 && IsAncestorOf(dirs[0], "")) { + return {}; + } + + return dirs; +} + +std::string ToBackslashes(util::string_view v) { + std::string s(v); + for (auto& c : s) { + if (c == '/') { + c = '\\'; + } + } + return s; +} + +std::string ToSlashes(util::string_view v) { + std::string s(v); +#ifdef _WIN32 + for (auto& c : s) { + if (c == '\\') { + c = '/'; + } + } +#endif + return s; +} + +bool IsEmptyPath(util::string_view v) { + for (const auto c : v) { + if (c != '/') { + return false; + } + } + return true; +} + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/path_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/path_util.h new file 
mode 100644 index 0000000000..5701c11b5d --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/path_util.h @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <string> +#include <utility> +#include <vector> + +#include "arrow/type_fwd.h" +#include "arrow/util/optional.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace fs { +namespace internal { + +constexpr char kSep = '/'; + +// Computations on abstract paths (not local paths with system-dependent behaviour). +// Abstract paths are typically used in URIs. + +// Split an abstract path into its individual components. +ARROW_EXPORT +std::vector<std::string> SplitAbstractPath(const std::string& s); + +// Return the extension of the file +ARROW_EXPORT +std::string GetAbstractPathExtension(const std::string& s); + +// Return the parent directory and basename of an abstract path. Both values may be +// empty. +ARROW_EXPORT +std::pair<std::string, std::string> GetAbstractPathParent(const std::string& s); + +// Validate the components of an abstract path. +ARROW_EXPORT +Status ValidateAbstractPathParts(const std::vector<std::string>& parts); + +// Append a non-empty stem to an abstract path. 
+ARROW_EXPORT +std::string ConcatAbstractPath(const std::string& base, const std::string& stem); + +// Make path relative to base, if it starts with base. Otherwise error out. +ARROW_EXPORT +Result<std::string> MakeAbstractPathRelative(const std::string& base, + const std::string& path); + +ARROW_EXPORT +std::string EnsureLeadingSlash(util::string_view s); + +ARROW_EXPORT +util::string_view RemoveLeadingSlash(util::string_view s); + +ARROW_EXPORT +std::string EnsureTrailingSlash(util::string_view s); + +ARROW_EXPORT +util::string_view RemoveTrailingSlash(util::string_view s); + +ARROW_EXPORT +bool IsAncestorOf(util::string_view ancestor, util::string_view descendant); + +ARROW_EXPORT +util::optional<util::string_view> RemoveAncestor(util::string_view ancestor, + util::string_view descendant); + +/// Return a vector of ancestors between a base path and a descendant. +/// For example, +/// +/// AncestorsFromBasePath("a/b", "a/b/c/d/e") -> ["a/b/c", "a/b/c/d"] +ARROW_EXPORT +std::vector<std::string> AncestorsFromBasePath(util::string_view base_path, + util::string_view descendant); + +/// Given a vector of paths of directories which must be created, produce a the minimal +/// subset for passing to CreateDir(recursive=true) by removing redundant parent +/// directories +ARROW_EXPORT +std::vector<std::string> MinimalCreateDirSet(std::vector<std::string> dirs); + +// Join the components of an abstract path. +template <class StringIt> +std::string JoinAbstractPath(StringIt it, StringIt end) { + std::string path; + for (; it != end; ++it) { + if (it->empty()) continue; + + if (!path.empty()) { + path += kSep; + } + path += *it; + } + return path; +} + +template <class StringRange> +std::string JoinAbstractPath(const StringRange& range) { + return JoinAbstractPath(range.begin(), range.end()); +} + +/// Convert slashes to backslashes, on all platforms. Mostly useful for testing. 
+ARROW_EXPORT +std::string ToBackslashes(util::string_view s); + +/// Ensure a local path is abstract, by converting backslashes to regular slashes +/// on Windows. Return the path unchanged on other systems. +ARROW_EXPORT +std::string ToSlashes(util::string_view s); + +ARROW_EXPORT +bool IsEmptyPath(util::string_view s); + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/type_fwd.h new file mode 100644 index 0000000000..112563577d --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/type_fwd.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +namespace arrow { +namespace fs { + +/// \brief FileSystem entry type +enum class FileType : int8_t { + /// Entry is not found + NotFound, + /// Entry exists but its type is unknown + /// + /// This can designate a special file such as a Unix socket or character + /// device, or Windows NUL / CON / ... 
+ Unknown, + /// Entry is a regular file + File, + /// Entry is a directory + Directory +}; + +struct FileInfo; + +struct FileSelector; + +class FileSystem; +class SubTreeFileSystem; +class SlowFileSystem; +class LocalFileSystem; +class S3FileSystem; + +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/util_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/util_internal.cc new file mode 100644 index 0000000000..8f86707375 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/util_internal.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/filesystem/util_internal.h" +#include "arrow/buffer.h" +#include "arrow/result.h" +#include "arrow/status.h" + +namespace arrow { +namespace fs { +namespace internal { + +TimePoint CurrentTimePoint() { + auto now = std::chrono::system_clock::now(); + return TimePoint( + std::chrono::duration_cast<TimePoint::duration>(now.time_since_epoch())); +} + +Status CopyStream(const std::shared_ptr<io::InputStream>& src, + const std::shared_ptr<io::OutputStream>& dest, int64_t chunk_size, + const io::IOContext& io_context) { + ARROW_ASSIGN_OR_RAISE(auto chunk, AllocateBuffer(chunk_size, io_context.pool())); + + while (true) { + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, + src->Read(chunk_size, chunk->mutable_data())); + if (bytes_read == 0) { + // EOF + break; + } + RETURN_NOT_OK(dest->Write(chunk->data(), bytes_read)); + } + + return Status::OK(); +} + +Status PathNotFound(const std::string& path) { + return Status::IOError("Path does not exist '", path, "'"); +} + +Status NotADir(const std::string& path) { + return Status::IOError("Not a directory: '", path, "'"); +} + +Status NotAFile(const std::string& path) { + return Status::IOError("Not a regular file: '", path, "'"); +} + +Status InvalidDeleteDirContents(const std::string& path) { + return Status::Invalid( + "DeleteDirContents called on invalid path '", path, "'. ", + "If you wish to delete the root directory's contents, call DeleteRootDirContents."); +} + +FileSystemGlobalOptions global_options; + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/util_internal.h new file mode 100644 index 0000000000..915c8d03d4 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/filesystem/util_internal.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <memory> + +#include "arrow/filesystem/filesystem.h" +#include "arrow/io/interfaces.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace fs { +namespace internal { + +ARROW_EXPORT +TimePoint CurrentTimePoint(); + +ARROW_EXPORT +Status CopyStream(const std::shared_ptr<io::InputStream>& src, + const std::shared_ptr<io::OutputStream>& dest, int64_t chunk_size, + const io::IOContext& io_context); + +ARROW_EXPORT +Status PathNotFound(const std::string& path); + +ARROW_EXPORT +Status NotADir(const std::string& path); + +ARROW_EXPORT +Status NotAFile(const std::string& path); + +ARROW_EXPORT +Status InvalidDeleteDirContents(const std::string& path); + +extern FileSystemGlobalOptions global_options; + +} // namespace internal +} // namespace fs +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.cc new file mode 100644 index 0000000000..117b82df30 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.cc @@ -0,0 +1,940 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <cstdint> +#include <sstream> +#include <type_traits> +#include <utility> +#include <vector> + +#include "arrow/array/array_dict.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_decimal.h" +#include "arrow/array/builder_dict.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/array/builder_time.h" +#include "arrow/array/builder_union.h" +#include "arrow/ipc/json_simple.h" +#include "arrow/scalar.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" +#include "arrow/util/string_view.h" +#include "arrow/util/value_parsing.h" + +#include "arrow/json/rapidjson_defs.h" + +#include <rapidjson/document.h> +#include <rapidjson/error/en.h> +#include <rapidjson/rapidjson.h> +#include <rapidjson/reader.h> +#include <rapidjson/writer.h> + +namespace rj = arrow::rapidjson; + +namespace arrow { + +using internal::ParseValue; + +namespace ipc { +namespace internal { +namespace json { + +using ::arrow::internal::checked_cast; +using ::arrow::internal::checked_pointer_cast; + +namespace { + +constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; + +Status JSONTypeError(const char* 
expected_type, rj::Type json_type) { + return Status::Invalid("Expected ", expected_type, " or null, got JSON type ", + json_type); +} + +class Converter { + public: + virtual ~Converter() = default; + + virtual Status Init() { return Status::OK(); } + + virtual Status AppendValue(const rj::Value& json_obj) = 0; + + Status AppendNull() { return this->builder()->AppendNull(); } + + virtual Status AppendValues(const rj::Value& json_array) = 0; + + virtual std::shared_ptr<ArrayBuilder> builder() = 0; + + virtual Status Finish(std::shared_ptr<Array>* out) { + auto builder = this->builder(); + if (builder->length() == 0) { + // Make sure the builder was initialized + RETURN_NOT_OK(builder->Resize(1)); + } + return builder->Finish(out); + } + + protected: + std::shared_ptr<DataType> type_; +}; + +Status GetConverter(const std::shared_ptr<DataType>&, std::shared_ptr<Converter>* out); + +// CRTP +template <class Derived> +class ConcreteConverter : public Converter { + public: + Status AppendValues(const rj::Value& json_array) override { + auto self = static_cast<Derived*>(this); + if (!json_array.IsArray()) { + return JSONTypeError("array", json_array.GetType()); + } + auto size = json_array.Size(); + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(self->AppendValue(json_array[i])); + } + return Status::OK(); + } + + const std::shared_ptr<DataType>& value_type() { + if (type_->id() != Type::DICTIONARY) { + return type_; + } + return checked_cast<const DictionaryType&>(*type_).value_type(); + } + + template <typename BuilderType> + Status MakeConcreteBuilder(std::shared_ptr<BuilderType>* out) { + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), this->type_, &builder)); + *out = checked_pointer_cast<BuilderType>(std::move(builder)); + DCHECK(*out); + return Status::OK(); + } +}; + +// ------------------------------------------------------------------------ +// Converter for null arrays + +class NullConverter final : public 
ConcreteConverter<NullConverter> { + public: + explicit NullConverter(const std::shared_ptr<DataType>& type) { + type_ = type; + builder_ = std::make_shared<NullBuilder>(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + return JSONTypeError("null", json_obj.GetType()); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<NullBuilder> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for boolean arrays + +class BooleanConverter final : public ConcreteConverter<BooleanConverter> { + public: + explicit BooleanConverter(const std::shared_ptr<DataType>& type) { + type_ = type; + builder_ = std::make_shared<BooleanBuilder>(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + if (json_obj.IsBool()) { + return builder_->Append(json_obj.GetBool()); + } + if (json_obj.IsInt()) { + return builder_->Append(json_obj.GetInt() != 0); + } + return JSONTypeError("boolean", json_obj.GetType()); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<BooleanBuilder> builder_; +}; + +// ------------------------------------------------------------------------ +// Helpers for numeric converters + +// Convert single signed integer value (also {Date,Time}{32,64} and Timestamp) +template <typename T> +enable_if_physical_signed_integer<T, Status> ConvertNumber(const rj::Value& json_obj, + const DataType& type, + typename T::c_type* out) { + if (json_obj.IsInt64()) { + int64_t v64 = json_obj.GetInt64(); + *out = static_cast<typename T::c_type>(v64); + if (*out == v64) { + return Status::OK(); + } else { + return Status::Invalid("Value ", v64, " out of bounds for ", type); + } + } else { + *out = static_cast<typename T::c_type>(0); + return JSONTypeError("signed int", 
json_obj.GetType()); + } +} + +// Convert single unsigned integer value +template <typename T> +enable_if_physical_unsigned_integer<T, Status> ConvertNumber(const rj::Value& json_obj, + const DataType& type, + typename T::c_type* out) { + if (json_obj.IsUint64()) { + uint64_t v64 = json_obj.GetUint64(); + *out = static_cast<typename T::c_type>(v64); + if (*out == v64) { + return Status::OK(); + } else { + return Status::Invalid("Value ", v64, " out of bounds for ", type); + } + } else { + *out = static_cast<typename T::c_type>(0); + return JSONTypeError("unsigned int", json_obj.GetType()); + } +} + +// Convert single floating point value +template <typename T> +enable_if_physical_floating_point<T, Status> ConvertNumber(const rj::Value& json_obj, + const DataType& type, + typename T::c_type* out) { + if (json_obj.IsNumber()) { + *out = static_cast<typename T::c_type>(json_obj.GetDouble()); + return Status::OK(); + } else { + *out = static_cast<typename T::c_type>(0); + return JSONTypeError("number", json_obj.GetType()); + } +} + +// ------------------------------------------------------------------------ +// Converter for int arrays + +template <typename Type, typename BuilderType = typename TypeTraits<Type>::BuilderType> +class IntegerConverter final + : public ConcreteConverter<IntegerConverter<Type, BuilderType>> { + using c_type = typename Type::c_type; + + static constexpr auto is_signed = std::is_signed<c_type>::value; + + public: + explicit IntegerConverter(const std::shared_ptr<DataType>& type) { this->type_ = type; } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + c_type value; + RETURN_NOT_OK(ConvertNumber<Type>(json_obj, *this->type_, &value)); + return builder_->Append(value); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<BuilderType> builder_; 
+}; + +// ------------------------------------------------------------------------ +// Converter for float arrays + +template <typename Type, typename BuilderType = typename TypeTraits<Type>::BuilderType> +class FloatConverter final : public ConcreteConverter<FloatConverter<Type, BuilderType>> { + using c_type = typename Type::c_type; + + public: + explicit FloatConverter(const std::shared_ptr<DataType>& type) { this->type_ = type; } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + c_type value; + RETURN_NOT_OK(ConvertNumber<Type>(json_obj, *this->type_, &value)); + return builder_->Append(value); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<BuilderType> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for decimal arrays + +template <typename DecimalSubtype, typename DecimalValue, typename BuilderType> +class DecimalConverter final + : public ConcreteConverter< + DecimalConverter<DecimalSubtype, DecimalValue, BuilderType>> { + public: + explicit DecimalConverter(const std::shared_ptr<DataType>& type) { + this->type_ = type; + decimal_type_ = &checked_cast<const DecimalSubtype&>(*this->value_type()); + } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsString()) { + int32_t precision, scale; + DecimalValue d; + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + RETURN_NOT_OK(DecimalValue::FromString(view, &d, &precision, &scale)); + if (scale != decimal_type_->scale()) { + return Status::Invalid("Invalid scale for decimal: expected ", + decimal_type_->scale(), ", got ", scale); + } + return 
builder_->Append(d); + } + return JSONTypeError("decimal string", json_obj.GetType()); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<BuilderType> builder_; + const DecimalSubtype* decimal_type_; +}; + +template <typename BuilderType = typename TypeTraits<Decimal128Type>::BuilderType> +using Decimal128Converter = DecimalConverter<Decimal128Type, Decimal128, BuilderType>; +template <typename BuilderType = typename TypeTraits<Decimal256Type>::BuilderType> +using Decimal256Converter = DecimalConverter<Decimal256Type, Decimal256, BuilderType>; + +// ------------------------------------------------------------------------ +// Converter for timestamp arrays + +class TimestampConverter final : public ConcreteConverter<TimestampConverter> { + public: + explicit TimestampConverter(const std::shared_ptr<DataType>& type) + : timestamp_type_{checked_cast<const TimestampType*>(type.get())} { + this->type_ = type; + builder_ = std::make_shared<TimestampBuilder>(type, default_memory_pool()); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + int64_t value; + if (json_obj.IsNumber()) { + RETURN_NOT_OK(ConvertNumber<Int64Type>(json_obj, *this->type_, &value)); + } else if (json_obj.IsString()) { + util::string_view view(json_obj.GetString(), json_obj.GetStringLength()); + if (!ParseValue(*timestamp_type_, view.data(), view.size(), &value)) { + return Status::Invalid("couldn't parse timestamp from ", view); + } + } else { + return JSONTypeError("timestamp", json_obj.GetType()); + } + return builder_->Append(value); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + const TimestampType* timestamp_type_; + std::shared_ptr<TimestampBuilder> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for day-time interval arrays + +class DayTimeIntervalConverter 
final + : public ConcreteConverter<DayTimeIntervalConverter> { + public: + explicit DayTimeIntervalConverter(const std::shared_ptr<DataType>& type) { + this->type_ = type; + builder_ = std::make_shared<DayTimeIntervalBuilder>(default_memory_pool()); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + DayTimeIntervalType::DayMilliseconds value; + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 2) { + return Status::Invalid( + "day time interval pair must have exactly two elements, had ", json_obj.Size()); + } + RETURN_NOT_OK(ConvertNumber<Int32Type>(json_obj[0], *this->type_, &value.days)); + RETURN_NOT_OK( + ConvertNumber<Int32Type>(json_obj[1], *this->type_, &value.milliseconds)); + return builder_->Append(value); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<DayTimeIntervalBuilder> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for binary and string arrays + +template <typename Type, typename BuilderType = typename TypeTraits<Type>::BuilderType> +class StringConverter final + : public ConcreteConverter<StringConverter<Type, BuilderType>> { + public: + explicit StringConverter(const std::shared_ptr<DataType>& type) { this->type_ = type; } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsString()) { + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<BuilderType> builder_; +}; + +// 
------------------------------------------------------------------------ +// Converter for fixed-size binary arrays + +template <typename BuilderType = typename TypeTraits<FixedSizeBinaryType>::BuilderType> +class FixedSizeBinaryConverter final + : public ConcreteConverter<FixedSizeBinaryConverter<BuilderType>> { + public: + explicit FixedSizeBinaryConverter(const std::shared_ptr<DataType>& type) { + this->type_ = type; + } + + Status Init() override { return this->MakeConcreteBuilder(&builder_); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsString()) { + auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); + if (view.length() != static_cast<size_t>(builder_->byte_width())) { + std::stringstream ss; + ss << "Invalid string length " << view.length() << " in JSON input for " + << this->type_->ToString(); + return Status::Invalid(ss.str()); + } + return builder_->Append(view); + } else { + return JSONTypeError("string", json_obj.GetType()); + } + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<BuilderType> builder_; +}; + +// ------------------------------------------------------------------------ +// Converter for list arrays + +template <typename TYPE> +class ListConverter final : public ConcreteConverter<ListConverter<TYPE>> { + public: + using BuilderType = typename TypeTraits<TYPE>::BuilderType; + + explicit ListConverter(const std::shared_ptr<DataType>& type) { this->type_ = type; } + + Status Init() override { + const auto& list_type = checked_cast<const TYPE&>(*this->type_); + RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + auto child_builder = child_converter_->builder(); + builder_ = + std::make_shared<BuilderType>(default_memory_pool(), child_builder, this->type_); + return Status::OK(); + } + + Status AppendValue(const rj::Value& json_obj) override { + 
if (json_obj.IsNull()) { + return this->AppendNull(); + } + RETURN_NOT_OK(builder_->Append()); + // Extend the child converter with this JSON array + return child_converter_->AppendValues(json_obj); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<BuilderType> builder_; + std::shared_ptr<Converter> child_converter_; +}; + +// ------------------------------------------------------------------------ +// Converter for map arrays + +class MapConverter final : public ConcreteConverter<MapConverter> { + public: + explicit MapConverter(const std::shared_ptr<DataType>& type) { type_ = type; } + + Status Init() override { + const auto& map_type = checked_cast<const MapType&>(*type_); + RETURN_NOT_OK(GetConverter(map_type.key_type(), &key_converter_)); + RETURN_NOT_OK(GetConverter(map_type.item_type(), &item_converter_)); + auto key_builder = key_converter_->builder(); + auto item_builder = item_converter_->builder(); + builder_ = std::make_shared<MapBuilder>(default_memory_pool(), key_builder, + item_builder, type_); + return Status::OK(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + RETURN_NOT_OK(builder_->Append()); + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + auto size = json_obj.Size(); + for (uint32_t i = 0; i < size; ++i) { + const auto& json_pair = json_obj[i]; + if (!json_pair.IsArray()) { + return JSONTypeError("array", json_pair.GetType()); + } + if (json_pair.Size() != 2) { + return Status::Invalid("key item pair must have exactly two elements, had ", + json_pair.Size()); + } + if (json_pair[0].IsNull()) { + return Status::Invalid("null key is invalid"); + } + RETURN_NOT_OK(key_converter_->AppendValue(json_pair[0])); + RETURN_NOT_OK(item_converter_->AppendValue(json_pair[1])); + } + return Status::OK(); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + 
+ private: + std::shared_ptr<MapBuilder> builder_; + std::shared_ptr<Converter> key_converter_, item_converter_; +}; + +// ------------------------------------------------------------------------ +// Converter for fixed size list arrays + +class FixedSizeListConverter final : public ConcreteConverter<FixedSizeListConverter> { + public: + explicit FixedSizeListConverter(const std::shared_ptr<DataType>& type) { type_ = type; } + + Status Init() override { + const auto& list_type = checked_cast<const FixedSizeListType&>(*type_); + list_size_ = list_type.list_size(); + RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + auto child_builder = child_converter_->builder(); + builder_ = std::make_shared<FixedSizeListBuilder>(default_memory_pool(), + child_builder, type_); + return Status::OK(); + } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + RETURN_NOT_OK(builder_->Append()); + // Extend the child converter with this JSON array + RETURN_NOT_OK(child_converter_->AppendValues(json_obj)); + if (json_obj.GetArray().Size() != static_cast<rj::SizeType>(list_size_)) { + return Status::Invalid("incorrect list size ", json_obj.GetArray().Size()); + } + return Status::OK(); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + int32_t list_size_; + std::shared_ptr<FixedSizeListBuilder> builder_; + std::shared_ptr<Converter> child_converter_; +}; + +// ------------------------------------------------------------------------ +// Converter for struct arrays + +class StructConverter final : public ConcreteConverter<StructConverter> { + public: + explicit StructConverter(const std::shared_ptr<DataType>& type) { type_ = type; } + + Status Init() override { + std::vector<std::shared_ptr<ArrayBuilder>> child_builders; + for (const auto& field : type_->fields()) { + std::shared_ptr<Converter> child_converter; + RETURN_NOT_OK(GetConverter(field->type(), 
&child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + builder_ = std::make_shared<StructBuilder>(type_, default_memory_pool(), + std::move(child_builders)); + return Status::OK(); + } + + // Append a JSON value that is either an array of N elements in order + // or an object mapping struct names to values (omitted struct members + // are mapped to null). + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (json_obj.IsArray()) { + auto size = json_obj.Size(); + auto expected_size = static_cast<uint32_t>(type_->num_fields()); + if (size != expected_size) { + return Status::Invalid("Expected array of size ", expected_size, + ", got array of size ", size); + } + for (uint32_t i = 0; i < size; ++i) { + RETURN_NOT_OK(child_converters_[i]->AppendValue(json_obj[i])); + } + return builder_->Append(); + } + if (json_obj.IsObject()) { + auto remaining = json_obj.MemberCount(); + auto num_children = type_->num_fields(); + for (int32_t i = 0; i < num_children; ++i) { + const auto& field = type_->field(i); + auto it = json_obj.FindMember(field->name()); + if (it != json_obj.MemberEnd()) { + --remaining; + RETURN_NOT_OK(child_converters_[i]->AppendValue(it->value)); + } else { + RETURN_NOT_OK(child_converters_[i]->AppendNull()); + } + } + if (remaining > 0) { + rj::StringBuffer sb; + rj::Writer<rj::StringBuffer> writer(sb); + json_obj.Accept(writer); + return Status::Invalid("Unexpected members in JSON object for type ", + type_->ToString(), " Object: ", sb.GetString()); + } + return builder_->Append(); + } + return JSONTypeError("array or object", json_obj.GetType()); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + std::shared_ptr<StructBuilder> builder_; + std::vector<std::shared_ptr<Converter>> child_converters_; +}; + +// 
------------------------------------------------------------------------ +// Converter for union arrays + +class UnionConverter final : public ConcreteConverter<UnionConverter> { + public: + explicit UnionConverter(const std::shared_ptr<DataType>& type) { type_ = type; } + + Status Init() override { + auto union_type = checked_cast<const UnionType*>(type_.get()); + mode_ = union_type->mode(); + type_id_to_child_num_.clear(); + type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1); + int child_i = 0; + for (auto type_id : union_type->type_codes()) { + type_id_to_child_num_[type_id] = child_i++; + } + std::vector<std::shared_ptr<ArrayBuilder>> child_builders; + for (const auto& field : type_->fields()) { + std::shared_ptr<Converter> child_converter; + RETURN_NOT_OK(GetConverter(field->type(), &child_converter)); + child_converters_.push_back(child_converter); + child_builders.push_back(child_converter->builder()); + } + if (mode_ == UnionMode::DENSE) { + builder_ = std::make_shared<DenseUnionBuilder>(default_memory_pool(), + std::move(child_builders), type_); + } else { + builder_ = std::make_shared<SparseUnionBuilder>(default_memory_pool(), + std::move(child_builders), type_); + } + return Status::OK(); + } + + // Append a JSON value that must be a 2-long array, containing the type_id + // and value of the UnionArray's slot. 
+ Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return this->AppendNull(); + } + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 2) { + return Status::Invalid("Expected [type_id, value] pair, got array of size ", + json_obj.Size()); + } + const auto& id_obj = json_obj[0]; + if (!id_obj.IsInt()) { + return JSONTypeError("int", id_obj.GetType()); + } + + auto id = static_cast<int8_t>(id_obj.GetInt()); + auto child_num = type_id_to_child_num_[id]; + if (child_num == -1) { + return Status::Invalid("type_id ", id, " not found in ", *type_); + } + + auto child_converter = child_converters_[child_num]; + if (mode_ == UnionMode::SPARSE) { + RETURN_NOT_OK(checked_cast<SparseUnionBuilder&>(*builder_).Append(id)); + for (auto&& other_converter : child_converters_) { + if (other_converter != child_converter) { + RETURN_NOT_OK(other_converter->AppendNull()); + } + } + } else { + RETURN_NOT_OK(checked_cast<DenseUnionBuilder&>(*builder_).Append(id)); + } + return child_converter->AppendValue(json_obj[1]); + } + + std::shared_ptr<ArrayBuilder> builder() override { return builder_; } + + private: + UnionMode::type mode_; + std::shared_ptr<ArrayBuilder> builder_; + std::vector<std::shared_ptr<Converter>> child_converters_; + std::vector<int8_t> type_id_to_child_num_; +}; + +// ------------------------------------------------------------------------ +// General conversion functions + +Status ConversionNotImplemented(const std::shared_ptr<DataType>& type) { + return Status::NotImplemented("JSON conversion to ", type->ToString(), + " not implemented"); +} + +Status GetDictConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<Converter>* out) { + std::shared_ptr<Converter> res; + + const auto value_type = checked_cast<const DictionaryType&>(*type).value_type(); + +#define SIMPLE_CONVERTER_CASE(ID, CLASS, TYPE) \ + case ID: \ + res = 
std::make_shared<CLASS<DictionaryBuilder<TYPE>>>(type); \ + break; + +#define PARAM_CONVERTER_CASE(ID, CLASS, TYPE) \ + case ID: \ + res = std::make_shared<CLASS<TYPE, DictionaryBuilder<TYPE>>>(type); \ + break; + + switch (value_type->id()) { + PARAM_CONVERTER_CASE(Type::INT8, IntegerConverter, Int8Type) + PARAM_CONVERTER_CASE(Type::INT16, IntegerConverter, Int16Type) + PARAM_CONVERTER_CASE(Type::INT32, IntegerConverter, Int32Type) + PARAM_CONVERTER_CASE(Type::INT64, IntegerConverter, Int64Type) + PARAM_CONVERTER_CASE(Type::UINT8, IntegerConverter, UInt8Type) + PARAM_CONVERTER_CASE(Type::UINT16, IntegerConverter, UInt16Type) + PARAM_CONVERTER_CASE(Type::UINT32, IntegerConverter, UInt32Type) + PARAM_CONVERTER_CASE(Type::UINT64, IntegerConverter, UInt64Type) + PARAM_CONVERTER_CASE(Type::FLOAT, FloatConverter, FloatType) + PARAM_CONVERTER_CASE(Type::DOUBLE, FloatConverter, DoubleType) + PARAM_CONVERTER_CASE(Type::STRING, StringConverter, StringType) + PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) + PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) + PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, + FixedSizeBinaryType) + SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) + SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter, Decimal256Type) + default: + return ConversionNotImplemented(type); + } + +#undef SIMPLE_CONVERTER_CASE +#undef PARAM_CONVERTER_CASE + + RETURN_NOT_OK(res->Init()); + *out = res; + return Status::OK(); +} + +Status GetConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<Converter>* out) { + if (type->id() == Type::DICTIONARY) { + return GetDictConverter(type, out); + } + + std::shared_ptr<Converter> res; + +#define SIMPLE_CONVERTER_CASE(ID, CLASS) \ + case ID: \ + res = std::make_shared<CLASS>(type); \ + break; + + switch (type->id()) { + 
SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter<Int8Type>) + SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter<Int16Type>) + SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter<Int32Type>) + SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter<Int64Type>) + SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter<UInt8Type>) + SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter<UInt16Type>) + SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter<UInt32Type>) + SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter<UInt64Type>) + SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) + SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter<Date32Type>) + SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter<Date64Type>) + SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter<Time32Type>) + SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter<Time64Type>) + SIMPLE_CONVERTER_CASE(Type::DURATION, IntegerConverter<DurationType>) + SIMPLE_CONVERTER_CASE(Type::NA, NullConverter) + SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) + SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter<HalfFloatType>) + SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter<FloatType>) + SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter<DoubleType>) + SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter<ListType>) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, ListConverter<LargeListType>) + SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) + SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) + SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter<StringType>) + SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter<BinaryType>) + SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter<LargeStringType>) + SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter<LargeBinaryType>) + SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) + SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) + 
SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) + SIMPLE_CONVERTER_CASE(Type::SPARSE_UNION, UnionConverter) + SIMPLE_CONVERTER_CASE(Type::DENSE_UNION, UnionConverter) + SIMPLE_CONVERTER_CASE(Type::INTERVAL_MONTHS, IntegerConverter<MonthIntervalType>) + SIMPLE_CONVERTER_CASE(Type::INTERVAL_DAY_TIME, DayTimeIntervalConverter) + default: + return ConversionNotImplemented(type); + } + +#undef SIMPLE_CONVERTER_CASE + + RETURN_NOT_OK(res->Init()); + *out = res; + return Status::OK(); +} + +} // namespace + +Status ArrayFromJSON(const std::shared_ptr<DataType>& type, util::string_view json_string, + std::shared_ptr<Array>* out) { + std::shared_ptr<Converter> converter; + RETURN_NOT_OK(GetConverter(type, &converter)); + + rj::Document json_doc; + json_doc.Parse<kParseFlags>(json_string.data(), json_string.length()); + if (json_doc.HasParseError()) { + return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", + GetParseError_En(json_doc.GetParseError())); + } + + // The JSON document should be an array, append it + RETURN_NOT_OK(converter->AppendValues(json_doc)); + return converter->Finish(out); +} + +Status ArrayFromJSON(const std::shared_ptr<DataType>& type, + const std::string& json_string, std::shared_ptr<Array>* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +Status ArrayFromJSON(const std::shared_ptr<DataType>& type, const char* json_string, + std::shared_ptr<Array>* out) { + return ArrayFromJSON(type, util::string_view(json_string), out); +} + +Status DictArrayFromJSON(const std::shared_ptr<DataType>& type, + util::string_view indices_json, + util::string_view dictionary_json, std::shared_ptr<Array>* out) { + if (type->id() != Type::DICTIONARY) { + return Status::TypeError("DictArrayFromJSON requires dictionary type, got ", *type); + } + + const auto& dictionary_type = checked_cast<const DictionaryType&>(*type); + + std::shared_ptr<Array> indices, dictionary; + 
RETURN_NOT_OK(ArrayFromJSON(dictionary_type.index_type(), indices_json, &indices)); + RETURN_NOT_OK( + ArrayFromJSON(dictionary_type.value_type(), dictionary_json, &dictionary)); + + return DictionaryArray::FromArrays(type, std::move(indices), std::move(dictionary)) + .Value(out); +} + +Status ScalarFromJSON(const std::shared_ptr<DataType>& type, + util::string_view json_string, std::shared_ptr<Scalar>* out) { + std::shared_ptr<Converter> converter; + RETURN_NOT_OK(GetConverter(type, &converter)); + + rj::Document json_doc; + json_doc.Parse<kParseFlags>(json_string.data(), json_string.length()); + if (json_doc.HasParseError()) { + return Status::Invalid("JSON parse error at offset ", json_doc.GetErrorOffset(), ": ", + GetParseError_En(json_doc.GetParseError())); + } + + std::shared_ptr<Array> array; + RETURN_NOT_OK(converter->AppendValue(json_doc)); + RETURN_NOT_OK(converter->Finish(&array)); + DCHECK_EQ(array->length(), 1); + ARROW_ASSIGN_OR_RAISE(*out, array->GetScalar(0)); + return Status::OK(); +} + +} // namespace json +} // namespace internal +} // namespace ipc +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/chunked_builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunked_builder.cc new file mode 100644 index 0000000000..040009c764 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunked_builder.cc @@ -0,0 +1,469 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/chunked_builder.h" + +#include <mutex> +#include <string> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/json/converter.h" +#include "arrow/table.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/util/task_group.h" + +namespace arrow { + +using internal::checked_cast; +using internal::TaskGroup; + +namespace json { + +class NonNestedChunkedArrayBuilder : public ChunkedArrayBuilder { + public: + NonNestedChunkedArrayBuilder(const std::shared_ptr<TaskGroup>& task_group, + std::shared_ptr<Converter> converter) + : ChunkedArrayBuilder(task_group), converter_(std::move(converter)) {} + + Status Finish(std::shared_ptr<ChunkedArray>* out) override { + RETURN_NOT_OK(task_group_->Finish()); + *out = std::make_shared<ChunkedArray>(std::move(chunks_), converter_->out_type()); + chunks_.clear(); + return Status::OK(); + } + + Status ReplaceTaskGroup(const std::shared_ptr<TaskGroup>& task_group) override { + RETURN_NOT_OK(task_group_->Finish()); + task_group_ = task_group; + return Status::OK(); + } + + protected: + ArrayVector chunks_; + std::mutex mutex_; + std::shared_ptr<Converter> converter_; +}; + +class TypedChunkedArrayBuilder + : public NonNestedChunkedArrayBuilder, + public std::enable_shared_from_this<TypedChunkedArrayBuilder> { + public: + using NonNestedChunkedArrayBuilder::NonNestedChunkedArrayBuilder; + + void Insert(int64_t block_index, const std::shared_ptr<Field>&, + const 
std::shared_ptr<Array>& unconverted) override { + std::unique_lock<std::mutex> lock(mutex_); + if (chunks_.size() <= static_cast<size_t>(block_index)) { + chunks_.resize(static_cast<size_t>(block_index) + 1, nullptr); + } + lock.unlock(); + + auto self = shared_from_this(); + + task_group_->Append([self, block_index, unconverted] { + std::shared_ptr<Array> converted; + RETURN_NOT_OK(self->converter_->Convert(unconverted, &converted)); + std::unique_lock<std::mutex> lock(self->mutex_); + self->chunks_[block_index] = std::move(converted); + return Status::OK(); + }); + } +}; + +class InferringChunkedArrayBuilder + : public NonNestedChunkedArrayBuilder, + public std::enable_shared_from_this<InferringChunkedArrayBuilder> { + public: + InferringChunkedArrayBuilder(const std::shared_ptr<TaskGroup>& task_group, + const PromotionGraph* promotion_graph, + std::shared_ptr<Converter> converter) + : NonNestedChunkedArrayBuilder(task_group, std::move(converter)), + promotion_graph_(promotion_graph) {} + + void Insert(int64_t block_index, const std::shared_ptr<Field>& unconverted_field, + const std::shared_ptr<Array>& unconverted) override { + std::unique_lock<std::mutex> lock(mutex_); + if (chunks_.size() <= static_cast<size_t>(block_index)) { + chunks_.resize(static_cast<size_t>(block_index) + 1, nullptr); + unconverted_.resize(chunks_.size(), nullptr); + unconverted_fields_.resize(chunks_.size(), nullptr); + } + unconverted_[block_index] = unconverted; + unconverted_fields_[block_index] = unconverted_field; + lock.unlock(); + ScheduleConvertChunk(block_index); + } + + void ScheduleConvertChunk(int64_t block_index) { + auto self = shared_from_this(); + task_group_->Append([self, block_index] { + return self->TryConvertChunk(static_cast<size_t>(block_index)); + }); + } + + Status TryConvertChunk(size_t block_index) { + std::unique_lock<std::mutex> lock(mutex_); + auto converter = converter_; + auto unconverted = unconverted_[block_index]; + auto unconverted_field = 
unconverted_fields_[block_index]; + std::shared_ptr<Array> converted; + + lock.unlock(); + Status st = converter->Convert(unconverted, &converted); + lock.lock(); + + if (converter != converter_) { + // another task promoted converter; reconvert + lock.unlock(); + ScheduleConvertChunk(block_index); + return Status::OK(); + } + + if (st.ok()) { + // conversion succeeded + chunks_[block_index] = std::move(converted); + return Status::OK(); + } + + auto promoted_type = + promotion_graph_->Promote(converter_->out_type(), unconverted_field); + if (promoted_type == nullptr) { + // converter failed, no promotion available + return st; + } + RETURN_NOT_OK(MakeConverter(promoted_type, converter_->pool(), &converter_)); + + size_t nchunks = chunks_.size(); + for (size_t i = 0; i < nchunks; ++i) { + if (i != block_index && chunks_[i]) { + // We're assuming the chunk was converted using the wrong type + // (which should be true unless the executor reorders tasks) + chunks_[i].reset(); + lock.unlock(); + ScheduleConvertChunk(i); + lock.lock(); + } + } + lock.unlock(); + ScheduleConvertChunk(block_index); + return Status::OK(); + } + + Status Finish(std::shared_ptr<ChunkedArray>* out) override { + RETURN_NOT_OK(NonNestedChunkedArrayBuilder::Finish(out)); + unconverted_.clear(); + return Status::OK(); + } + + private: + ArrayVector unconverted_; + std::vector<std::shared_ptr<Field>> unconverted_fields_; + const PromotionGraph* promotion_graph_; +}; + +class ChunkedListArrayBuilder : public ChunkedArrayBuilder { + public: + ChunkedListArrayBuilder(const std::shared_ptr<TaskGroup>& task_group, MemoryPool* pool, + std::shared_ptr<ChunkedArrayBuilder> value_builder, + const std::shared_ptr<Field>& value_field) + : ChunkedArrayBuilder(task_group), + pool_(pool), + value_builder_(std::move(value_builder)), + value_field_(value_field) {} + + Status ReplaceTaskGroup(const std::shared_ptr<TaskGroup>& task_group) override { + RETURN_NOT_OK(task_group_->Finish()); + 
RETURN_NOT_OK(value_builder_->ReplaceTaskGroup(task_group)); + task_group_ = task_group; + return Status::OK(); + } + + void Insert(int64_t block_index, const std::shared_ptr<Field>&, + const std::shared_ptr<Array>& unconverted) override { + std::unique_lock<std::mutex> lock(mutex_); + + if (unconverted->type_id() == Type::NA) { + auto st = InsertNull(block_index, unconverted->length()); + if (!st.ok()) { + task_group_->Append([st] { return st; }); + } + return; + } + + DCHECK_EQ(unconverted->type_id(), Type::LIST); + const auto& list_array = checked_cast<const ListArray&>(*unconverted); + + if (null_bitmap_chunks_.size() <= static_cast<size_t>(block_index)) { + null_bitmap_chunks_.resize(static_cast<size_t>(block_index) + 1, nullptr); + offset_chunks_.resize(null_bitmap_chunks_.size(), nullptr); + } + null_bitmap_chunks_[block_index] = unconverted->null_bitmap(); + offset_chunks_[block_index] = list_array.value_offsets(); + + value_builder_->Insert(block_index, list_array.list_type()->value_field(), + list_array.values()); + } + + Status Finish(std::shared_ptr<ChunkedArray>* out) override { + RETURN_NOT_OK(task_group_->Finish()); + + std::shared_ptr<ChunkedArray> value_array; + RETURN_NOT_OK(value_builder_->Finish(&value_array)); + + auto type = list(value_field_->WithType(value_array->type())->WithMetadata(nullptr)); + ArrayVector chunks(null_bitmap_chunks_.size()); + for (size_t i = 0; i < null_bitmap_chunks_.size(); ++i) { + auto value_chunk = value_array->chunk(static_cast<int>(i)); + auto length = offset_chunks_[i]->size() / sizeof(int32_t) - 1; + chunks[i] = std::make_shared<ListArray>(type, length, offset_chunks_[i], + value_chunk, null_bitmap_chunks_[i]); + } + + *out = std::make_shared<ChunkedArray>(std::move(chunks), type); + return Status::OK(); + } + + private: + // call from Insert() only, with mutex_ locked + Status InsertNull(int64_t block_index, int64_t length) { + value_builder_->Insert(block_index, value_field_, std::make_shared<NullArray>(0)); + 
+ ARROW_ASSIGN_OR_RAISE(null_bitmap_chunks_[block_index], + AllocateEmptyBitmap(length, pool_)); + + int64_t offsets_length = (length + 1) * sizeof(int32_t); + ARROW_ASSIGN_OR_RAISE(offset_chunks_[block_index], + AllocateBuffer(offsets_length, pool_)); + std::memset(offset_chunks_[block_index]->mutable_data(), 0, offsets_length); + + return Status::OK(); + } + + std::mutex mutex_; + MemoryPool* pool_; + std::shared_ptr<ChunkedArrayBuilder> value_builder_; + BufferVector offset_chunks_, null_bitmap_chunks_; + std::shared_ptr<Field> value_field_; +}; + +class ChunkedStructArrayBuilder : public ChunkedArrayBuilder { + public: + ChunkedStructArrayBuilder( + const std::shared_ptr<TaskGroup>& task_group, MemoryPool* pool, + const PromotionGraph* promotion_graph, + std::vector<std::pair<std::string, std::shared_ptr<ChunkedArrayBuilder>>> + name_builders) + : ChunkedArrayBuilder(task_group), pool_(pool), promotion_graph_(promotion_graph) { + for (auto&& name_builder : name_builders) { + auto index = static_cast<int>(name_to_index_.size()); + name_to_index_.emplace(std::move(name_builder.first), index); + child_builders_.emplace_back(std::move(name_builder.second)); + } + } + + void Insert(int64_t block_index, const std::shared_ptr<Field>&, + const std::shared_ptr<Array>& unconverted) override { + std::unique_lock<std::mutex> lock(mutex_); + + if (null_bitmap_chunks_.size() <= static_cast<size_t>(block_index)) { + null_bitmap_chunks_.resize(static_cast<size_t>(block_index) + 1, nullptr); + chunk_lengths_.resize(null_bitmap_chunks_.size(), -1); + child_absent_.resize(null_bitmap_chunks_.size(), std::vector<bool>(0)); + } + null_bitmap_chunks_[block_index] = unconverted->null_bitmap(); + chunk_lengths_[block_index] = unconverted->length(); + + if (unconverted->type_id() == Type::NA) { + auto maybe_buffer = AllocateBitmap(unconverted->length(), pool_); + if (maybe_buffer.ok()) { + null_bitmap_chunks_[block_index] = *std::move(maybe_buffer); + 
std::memset(null_bitmap_chunks_[block_index]->mutable_data(), 0, + null_bitmap_chunks_[block_index]->size()); + } else { + Status st = maybe_buffer.status(); + task_group_->Append([st] { return st; }); + } + + // absent fields will be inserted at Finish + return; + } + + const auto& struct_array = checked_cast<const StructArray&>(*unconverted); + if (promotion_graph_ == nullptr) { + // If unexpected fields are ignored or result in an error then all parsers will emit + // columns exclusively in the ordering specified in ParseOptions::explicit_schema, + // so child_builders_ is immutable and no associative lookup is necessary. + for (int i = 0; i < unconverted->num_fields(); ++i) { + child_builders_[i]->Insert(block_index, unconverted->type()->field(i), + struct_array.field(i)); + } + } else { + auto st = InsertChildren(block_index, struct_array); + if (!st.ok()) { + return task_group_->Append([st] { return st; }); + } + } + } + + Status Finish(std::shared_ptr<ChunkedArray>* out) override { + RETURN_NOT_OK(task_group_->Finish()); + + if (promotion_graph_ != nullptr) { + // insert absent child chunks + for (auto&& name_index : name_to_index_) { + auto child_builder = child_builders_[name_index.second].get(); + + RETURN_NOT_OK(child_builder->ReplaceTaskGroup(TaskGroup::MakeSerial())); + + for (size_t i = 0; i < chunk_lengths_.size(); ++i) { + if (child_absent_[i].size() > static_cast<size_t>(name_index.second) && + !child_absent_[i][name_index.second]) { + continue; + } + auto empty = std::make_shared<NullArray>(chunk_lengths_[i]); + child_builder->Insert(i, promotion_graph_->Null(name_index.first), empty); + } + } + } + + std::vector<std::shared_ptr<Field>> fields(name_to_index_.size()); + std::vector<std::shared_ptr<ChunkedArray>> child_arrays(name_to_index_.size()); + for (auto&& name_index : name_to_index_) { + auto child_builder = child_builders_[name_index.second].get(); + + std::shared_ptr<ChunkedArray> child_array; + 
RETURN_NOT_OK(child_builder->Finish(&child_array)); + + child_arrays[name_index.second] = child_array; + fields[name_index.second] = field(name_index.first, child_array->type()); + } + + auto type = struct_(std::move(fields)); + ArrayVector chunks(null_bitmap_chunks_.size()); + for (size_t i = 0; i < null_bitmap_chunks_.size(); ++i) { + ArrayVector child_chunks; + for (const auto& child_array : child_arrays) { + child_chunks.push_back(child_array->chunk(static_cast<int>(i))); + } + chunks[i] = std::make_shared<StructArray>(type, chunk_lengths_[i], child_chunks, + null_bitmap_chunks_[i]); + } + + *out = std::make_shared<ChunkedArray>(std::move(chunks), type); + return Status::OK(); + } + + Status ReplaceTaskGroup(const std::shared_ptr<TaskGroup>& task_group) override { + RETURN_NOT_OK(task_group_->Finish()); + for (auto&& child_builder : child_builders_) { + RETURN_NOT_OK(child_builder->ReplaceTaskGroup(task_group)); + } + task_group_ = task_group; + return Status::OK(); + } + + private: + // Insert children associatively by name; the unconverted block may have unexpected or + // differently ordered fields + // call from Insert() only, with mutex_ locked + Status InsertChildren(int64_t block_index, const StructArray& unconverted) { + const auto& fields = unconverted.type()->fields(); + + for (int i = 0; i < unconverted.num_fields(); ++i) { + auto it = name_to_index_.find(fields[i]->name()); + + if (it == name_to_index_.end()) { + // add a new field to this builder + auto type = promotion_graph_->Infer(fields[i]); + DCHECK_NE(type, nullptr) + << "invalid unconverted_field encountered in conversion: " + << fields[i]->name() << ":" << *fields[i]->type(); + + auto new_index = static_cast<int>(name_to_index_.size()); + it = name_to_index_.emplace(fields[i]->name(), new_index).first; + + std::shared_ptr<ChunkedArrayBuilder> child_builder; + RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group_, pool_, promotion_graph_, type, + &child_builder)); + 
child_builders_.emplace_back(std::move(child_builder)); + } + + auto unconverted_field = unconverted.type()->field(i); + child_builders_[it->second]->Insert(block_index, unconverted_field, + unconverted.field(i)); + + child_absent_[block_index].resize(child_builders_.size(), true); + child_absent_[block_index][it->second] = false; + } + + return Status::OK(); + } + + std::mutex mutex_; + MemoryPool* pool_; + const PromotionGraph* promotion_graph_; + std::unordered_map<std::string, int> name_to_index_; + std::vector<std::shared_ptr<ChunkedArrayBuilder>> child_builders_; + std::vector<std::vector<bool>> child_absent_; + BufferVector null_bitmap_chunks_; + std::vector<int64_t> chunk_lengths_; +}; + +Status MakeChunkedArrayBuilder(const std::shared_ptr<TaskGroup>& task_group, + MemoryPool* pool, const PromotionGraph* promotion_graph, + const std::shared_ptr<DataType>& type, + std::shared_ptr<ChunkedArrayBuilder>* out) { + if (type->id() == Type::STRUCT) { + std::vector<std::pair<std::string, std::shared_ptr<ChunkedArrayBuilder>>> + child_builders; + for (const auto& f : type->fields()) { + std::shared_ptr<ChunkedArrayBuilder> child_builder; + RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group, pool, promotion_graph, f->type(), + &child_builder)); + child_builders.emplace_back(f->name(), std::move(child_builder)); + } + *out = std::make_shared<ChunkedStructArrayBuilder>(task_group, pool, promotion_graph, + std::move(child_builders)); + return Status::OK(); + } + if (type->id() == Type::LIST) { + const auto& list_type = checked_cast<const ListType&>(*type); + std::shared_ptr<ChunkedArrayBuilder> value_builder; + RETURN_NOT_OK(MakeChunkedArrayBuilder(task_group, pool, promotion_graph, + list_type.value_type(), &value_builder)); + *out = std::make_shared<ChunkedListArrayBuilder>( + task_group, pool, std::move(value_builder), list_type.value_field()); + return Status::OK(); + } + std::shared_ptr<Converter> converter; + RETURN_NOT_OK(MakeConverter(type, pool, &converter)); + 
if (promotion_graph) { + *out = std::make_shared<InferringChunkedArrayBuilder>(task_group, promotion_graph, + std::move(converter)); + } else { + *out = std::make_shared<TypedChunkedArrayBuilder>(task_group, std::move(converter)); + } + return Status::OK(); +} + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/chunked_builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunked_builder.h new file mode 100644 index 0000000000..93b327bf3a --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunked_builder.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+
+class PromotionGraph;
+
+/// Interface for builders which receive unconverted JSON blocks, each tagged
+/// with its block index, and produce a single ChunkedArray from them.
+class ARROW_EXPORT ChunkedArrayBuilder {
+ public:
+  virtual ~ChunkedArrayBuilder() = default;
+
+  /// Spawn a task that will try to convert and insert the given JSON block
+  virtual void Insert(int64_t block_index,
+                      const std::shared_ptr<Field>& unconverted_field,
+                      const std::shared_ptr<Array>& unconverted) = 0;
+
+  /// Return the final chunked array.
+  /// Every chunk must be inserted before this is called!
+  virtual Status Finish(std::shared_ptr<ChunkedArray>* out) = 0;
+
+  /// Finish current task group and substitute a new one
+  virtual Status ReplaceTaskGroup(
+      const std::shared_ptr<arrow::internal::TaskGroup>& task_group) = 0;
+
+ protected:
+  /// Store the task group; subclasses reach it through task_group_.
+  explicit ChunkedArrayBuilder(
+      const std::shared_ptr<arrow::internal::TaskGroup>& task_group)
+      : task_group_(task_group) {}
+
+  std::shared_ptr<arrow::internal::TaskGroup> task_group_;
+};
+
+/// create a chunked builder
+///
+/// if unexpected fields and promotion need to be handled, promotion_graph must be
+/// non-null
+ARROW_EXPORT Status MakeChunkedArrayBuilder(
+    const std::shared_ptr<arrow::internal::TaskGroup>& task_group, MemoryPool* pool,
+    const PromotionGraph* promotion_graph, const std::shared_ptr<DataType>& type,
+    std::shared_ptr<ChunkedArrayBuilder>* out);
+
+}  // namespace json
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/chunker.cc b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunker.cc
new file mode 100644
index 0000000000..b4b4d31eb9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunker.cc
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/chunker.h" + +#include <algorithm> +#include <utility> +#include <vector> + +#include "arrow/json/rapidjson_defs.h" +#include "rapidjson/reader.h" + +#include "arrow/buffer.h" +#include "arrow/json/options.h" +#include "arrow/util/logging.h" +#include "arrow/util/make_unique.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +using internal::make_unique; +using util::string_view; + +namespace json { + +namespace rj = arrow::rapidjson; + +static size_t ConsumeWhitespace(string_view view) { +#ifdef RAPIDJSON_SIMD + auto data = view.data(); + auto nonws_begin = rj::SkipWhitespace_SIMD(data, data + view.size()); + return nonws_begin - data; +#else + auto ws_count = view.find_first_not_of(" \t\r\n"); + if (ws_count == string_view::npos) { + return view.size(); + } else { + return ws_count; + } +#endif +} + +/// RapidJson custom stream for reading JSON stored in multiple buffers +/// http://rapidjson.org/md_doc_stream.html#CustomStream +class MultiStringStream { + public: + using Ch = char; + explicit MultiStringStream(std::vector<string_view> strings) + : strings_(std::move(strings)) { + std::reverse(strings_.begin(), strings_.end()); + } + explicit MultiStringStream(const BufferVector& buffers) : strings_(buffers.size()) { + for (size_t i = 0; 
i < buffers.size(); ++i) { + strings_[i] = string_view(*buffers[i]); + } + std::reverse(strings_.begin(), strings_.end()); + } + char Peek() const { + if (strings_.size() == 0) return '\0'; + return strings_.back()[0]; + } + char Take() { + if (strings_.size() == 0) return '\0'; + char taken = strings_.back()[0]; + if (strings_.back().size() == 1) { + strings_.pop_back(); + } else { + strings_.back() = strings_.back().substr(1); + } + ++index_; + return taken; + } + size_t Tell() { return index_; } + void Put(char) { ARROW_LOG(FATAL) << "not implemented"; } + void Flush() { ARROW_LOG(FATAL) << "not implemented"; } + char* PutBegin() { + ARROW_LOG(FATAL) << "not implemented"; + return nullptr; + } + size_t PutEnd(char*) { + ARROW_LOG(FATAL) << "not implemented"; + return 0; + } + + private: + size_t index_ = 0; + std::vector<string_view> strings_; +}; + +template <typename Stream> +static size_t ConsumeWholeObject(Stream&& stream) { + static constexpr unsigned parse_flags = rj::kParseIterativeFlag | + rj::kParseStopWhenDoneFlag | + rj::kParseNumbersAsStringsFlag; + rj::BaseReaderHandler<rj::UTF8<>> handler; + rj::Reader reader; + // parse a single JSON object + switch (reader.Parse<parse_flags>(stream, handler).Code()) { + case rj::kParseErrorNone: + return stream.Tell(); + case rj::kParseErrorDocumentEmpty: + return 0; + default: + // rapidjson emitted an error, the most recent object was partial + return string_view::npos; + } +} + +namespace { + +// A BoundaryFinder implementation that assumes JSON objects can contain raw newlines, +// and uses actual JSON parsing to delimit them. +class ParsingBoundaryFinder : public BoundaryFinder { + public: + Status FindFirst(string_view partial, string_view block, int64_t* out_pos) override { + // NOTE: We could bubble up JSON parse errors here, but the actual parsing + // step will detect them later anyway. 
+ auto length = ConsumeWholeObject(MultiStringStream({partial, block})); + if (length == string_view::npos) { + *out_pos = -1; + } else { + DCHECK_GE(length, partial.size()); + DCHECK_LE(length, partial.size() + block.size()); + *out_pos = static_cast<int64_t>(length - partial.size()); + } + return Status::OK(); + } + + Status FindLast(util::string_view block, int64_t* out_pos) override { + const size_t block_length = block.size(); + size_t consumed_length = 0; + while (consumed_length < block_length) { + rj::MemoryStream ms(reinterpret_cast<const char*>(block.data()), block.size()); + using InputStream = rj::EncodedInputStream<rj::UTF8<>, rj::MemoryStream>; + auto length = ConsumeWholeObject(InputStream(ms)); + if (length == string_view::npos || length == 0) { + // found incomplete object or block is empty + break; + } + consumed_length += length; + block = block.substr(length); + } + if (consumed_length == 0) { + *out_pos = -1; + } else { + consumed_length += ConsumeWhitespace(block); + DCHECK_LE(consumed_length, block_length); + *out_pos = static_cast<int64_t>(consumed_length); + } + return Status::OK(); + } + + Status FindNth(util::string_view partial, util::string_view block, int64_t count, + int64_t* out_pos, int64_t* num_found) override { + return Status::NotImplemented("ParsingBoundaryFinder::FindNth"); + } +}; + +} // namespace + +std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options) { + std::shared_ptr<BoundaryFinder> delimiter; + if (options.newlines_in_values) { + delimiter = std::make_shared<ParsingBoundaryFinder>(); + } else { + delimiter = MakeNewlineBoundaryFinder(); + } + return std::unique_ptr<Chunker>(new Chunker(std::move(delimiter))); +} + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/chunker.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunker.h new file mode 100644 index 0000000000..9ed85126da --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/chunker.h @@ 
-0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/delimiting.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+
+struct ParseOptions;
+
+/// \brief Create a Chunker for JSON input
+///
+/// When options.newlines_in_values is set, block boundaries are found by
+/// parsing JSON objects; otherwise a newline boundary finder is used.
+ARROW_EXPORT
+std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options);
+
+}  // namespace json
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/json/converter.cc
new file mode 100644
index 0000000000..fe9500d40c
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/converter.cc
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/json/converter.h"
+
+#include <memory>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_time.h"
+#include "arrow/json/parser.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using util::string_view;
+
+namespace json {
+
+// Build a Status::Invalid naming the target type, followed by any extra
+// message fragments supplied by the caller.
+template <typename... Args>
+Status GenericConversionError(const DataType& type, Args&&... args) {
+  return Status::Invalid("Failed of conversion of JSON to ", type,
+                         std::forward<Args>(args)...);
+}
+
+namespace {
+
+// View `in` as a DictionaryArray. The expected layout (int32 indices, string
+// values) is verified with DCHECKs only.
+const DictionaryArray& GetDictionaryArray(const std::shared_ptr<Array>& in) {
+  DCHECK_EQ(in->type_id(), Type::DICTIONARY);
+  auto dict_type = checked_cast<const DictionaryType*>(in->type().get());
+  DCHECK_EQ(dict_type->index_type()->id(), Type::INT32);
+  DCHECK_EQ(dict_type->value_type()->id(), Type::STRING);
+  return checked_cast<const DictionaryArray&>(*in);
+}
+
+// Walk the indices of `dict_array` in order, calling visit_valid with the
+// referenced dictionary string for each valid slot and visit_null for each
+// null slot. Stops at, and returns, the first non-OK Status.
+template <typename ValidVisitor, typename NullVisitor>
+Status VisitDictionaryEntries(const DictionaryArray& dict_array,
+                              ValidVisitor&& visit_valid, NullVisitor&& visit_null) {
+  const StringArray& dict = checked_cast<const StringArray&>(*dict_array.dictionary());
+  const Int32Array& indices = checked_cast<const Int32Array&>(*dict_array.indices());
+  for (int64_t i = 0; i < indices.length(); ++i) {
+    if (indices.IsValid(i)) {
+      RETURN_NOT_OK(visit_valid(dict.GetView(indices.GetView(i))));
+    } else {
+      RETURN_NOT_OK(visit_null());
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+// base class for types which accept and output non-nested types
+class PrimitiveConverter : public Converter {
+ public:
+  PrimitiveConverter(MemoryPool* pool, std::shared_ptr<DataType> out_type)
+      : Converter(pool, out_type) {}
+};
+
+// Accepts only null-typed input, which is passed through unchanged.
+class NullConverter : public PrimitiveConverter {
+ public:
+  using PrimitiveConverter::PrimitiveConverter;
+
+  Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
+    if (in->type_id() != Type::NA) {
+      return GenericConversionError(*out_type_, " from ", *in->type());
+    }
+    *out = in;
+    return Status::OK();
+  }
+};
+
+// Null-typed input becomes an all-null boolean array; boolean input is passed
+// through unchanged; anything else is a conversion error.
+class BooleanConverter : public PrimitiveConverter {
+ public:
+  using PrimitiveConverter::PrimitiveConverter;
+
+  Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
+    if (in->type_id() == Type::NA) {
+      return MakeArrayOfNull(boolean(), in->length(), pool_).Value(out);
+    }
+    if (in->type_id() != Type::BOOL) {
+      return GenericConversionError(*out_type_, " from ", *in->type());
+    }
+    *out = in;
+    return Status::OK();
+  }
+};
+
+// Converts dictionary-encoded string representations into values of type T by
+// parsing each string with arrow::internal::ParseValue.
+template <typename T>
+class NumericConverter : public PrimitiveConverter {
+ public:
+  using value_type = typename T::c_type;
+
+  NumericConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type)
+      : PrimitiveConverter(pool, type), numeric_type_(checked_cast<const T&>(*type)) {}
+
+  Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
+    if (in->type_id() == Type::NA) {
+      return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out);
+    }
+    const auto& dict_array = GetDictionaryArray(in);
+
+    using Builder = typename TypeTraits<T>::BuilderType;
+    Builder builder(out_type_, pool_);
+    // capacity is reserved up front so the Unsafe* appends below are valid
+    RETURN_NOT_OK(builder.Resize(dict_array.indices()->length()));
+
+    auto visit_valid = [&](string_view repr) {
+      value_type value;
+      if (!arrow::internal::ParseValue(numeric_type_, repr.data(), repr.size(), &value)) {
+        return GenericConversionError(*out_type_, ", couldn't parse:", repr);
+      }
+
+      builder.UnsafeAppend(value);
+      return Status::OK();
+    };
+
+    auto visit_null = [&]() {
+      builder.UnsafeAppendNull();
+      return Status::OK();
+    };
+
+    RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null));
+    return builder.Finish(out);
+  }
+
+  // refers to the type object passed to the constructor
+  const T& numeric_type_;
+};
+
+// Converts by first producing an array of the numeric type matching
+// DateTimeType::c_type, then relabelling a copy of its data with out_type_.
+template <typename DateTimeType>
+class DateTimeConverter : public PrimitiveConverter {
+ public:
+  DateTimeConverter(MemoryPool* pool, const std::shared_ptr<DataType>& type)
+      : PrimitiveConverter(pool, type), converter_(pool, repr_type()) {}
+
+  Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
+    if (in->type_id() == Type::NA) {
+      return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out);
+    }
+
+    std::shared_ptr<Array> repr;
+    RETURN_NOT_OK(converter_.Convert(in, &repr));
+
+    auto out_data = repr->data()->Copy();
+    out_data->type = out_type_;
+    *out = MakeArray(out_data);
+
+    return Status::OK();
+  }
+
+ private:
+  using ReprType = typename CTypeTraits<typename DateTimeType::c_type>::ArrowType;
+  static std::shared_ptr<DataType> repr_type() {
+    return TypeTraits<ReprType>::type_singleton();
+  }
+  NumericConverter<ReprType> converter_;
+};
+
+// Copies dictionary-encoded strings into a binary/string array of type T.
+template <typename T>
+class BinaryConverter : public PrimitiveConverter {
+ public:
+  using PrimitiveConverter::PrimitiveConverter;
+
+  Status Convert(const std::shared_ptr<Array>& in, std::shared_ptr<Array>* out) override {
+    if (in->type_id() == Type::NA) {
+      return MakeArrayOfNull(out_type_, in->length(), pool_).Value(out);
+    }
+    const auto& dict_array = GetDictionaryArray(in);
+
+    using Builder = typename TypeTraits<T>::BuilderType;
+    Builder builder(out_type_, pool_);
+    RETURN_NOT_OK(builder.Resize(dict_array.indices()->length()));
+
+    // First pass: total the value bytes so the data buffer can be reserved once.
+    // TODO(bkietz) this can be computed during parsing at low cost
+    int64_t data_length = 0;
+    auto visit_lengths_valid = [&](string_view value) {
+      data_length += value.size();
+      return Status::OK();
+    };
+
+    auto visit_lengths_null = [&]() {
+      // no-op
+      return Status::OK();
+    };
+
+    RETURN_NOT_OK(
+        VisitDictionaryEntries(dict_array, visit_lengths_valid, visit_lengths_null));
+    RETURN_NOT_OK(builder.ReserveData(data_length));
+
+    // Second pass: append the values into the reserved space.
+    auto visit_valid = [&](string_view value) {
+      builder.UnsafeAppend(value);
+      return Status::OK();
+    };
+
+    auto visit_null = [&]() {
+      builder.UnsafeAppendNull();
+      return Status::OK();
+    };
+
+    RETURN_NOT_OK(VisitDictionaryEntries(dict_array, visit_valid, visit_null));
+    return builder.Finish(out);
+  }
+};
+
+// Instantiate the converter for out_type's type id; returns NotImplemented for
+// type ids without a case below.
+Status MakeConverter(const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
+                     std::shared_ptr<Converter>* out) {
+  switch (out_type->id()) {
+#define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE)              \
+  case TYPE_ID:                                              \
+    *out = std::make_shared<CONVERTER_TYPE>(pool, out_type); \
+    break
+    CONVERTER_CASE(Type::NA, NullConverter);
+    CONVERTER_CASE(Type::BOOL, BooleanConverter);
+    CONVERTER_CASE(Type::INT8, NumericConverter<Int8Type>);
+    CONVERTER_CASE(Type::INT16, NumericConverter<Int16Type>);
+    CONVERTER_CASE(Type::INT32, NumericConverter<Int32Type>);
+    CONVERTER_CASE(Type::INT64, NumericConverter<Int64Type>);
+    CONVERTER_CASE(Type::UINT8, NumericConverter<UInt8Type>);
+    CONVERTER_CASE(Type::UINT16, NumericConverter<UInt16Type>);
+    CONVERTER_CASE(Type::UINT32, NumericConverter<UInt32Type>);
+    CONVERTER_CASE(Type::UINT64, NumericConverter<UInt64Type>);
+    CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>);
+    CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>);
+    CONVERTER_CASE(Type::TIMESTAMP, NumericConverter<TimestampType>);
+    CONVERTER_CASE(Type::TIME32, DateTimeConverter<Time32Type>);
+    CONVERTER_CASE(Type::TIME64, DateTimeConverter<Time64Type>);
+    CONVERTER_CASE(Type::DATE32, DateTimeConverter<Date32Type>);
+    CONVERTER_CASE(Type::DATE64, DateTimeConverter<Date64Type>);
+    CONVERTER_CASE(Type::BINARY, BinaryConverter<BinaryType>);
+    CONVERTER_CASE(Type::STRING, BinaryConverter<StringType>);
+    CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter<LargeBinaryType>);
+    CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter<LargeStringType>);
+    default:
+      return Status::NotImplemented("JSON conversion to ", *out_type,
+                                    " is not supported");
+#undef CONVERTER_CASE
+  }
+  return Status::OK();
+}
+
+// Return the address of a function-local static PromotionGraph implementation.
+const PromotionGraph* GetPromotionGraph() {
+  static struct : PromotionGraph {
+    std::shared_ptr<Field> Null(const std::string& name) const override {
+      return field(name, null(), true, Kind::Tag(Kind::kNull));
+    }
+
+    // Initial guess per JSON kind: numbers start as int64 and strings as
+    // timestamp[s]; arrays and objects are inferred recursively.
+    std::shared_ptr<DataType> Infer(
+        const std::shared_ptr<Field>& unexpected_field) const override {
+      auto kind = Kind::FromTag(unexpected_field->metadata());
+      switch (kind) {
+        case Kind::kNull:
+          return null();
+
+        case Kind::kBoolean:
+          return boolean();
+
+        case Kind::kNumber:
+          return int64();
+
+        case Kind::kString:
+          return timestamp(TimeUnit::SECOND);
+
+        case Kind::kArray: {
+          const auto& type = checked_cast<const ListType&>(*unexpected_field->type());
+          auto value_field = type.value_field();
+          return list(value_field->WithType(Infer(value_field)));
+        }
+        case Kind::kObject: {
+          auto fields = unexpected_field->type()->fields();
+          for (auto& field : fields) {
+            field = field->WithType(Infer(field));
+          }
+          return struct_(std::move(fields));
+        }
+        default:
+          return nullptr;
+      }
+    }
+
+    // Fallback after a failed conversion: null -> inferred type,
+    // timestamp -> utf8, int64 -> float64; nullptr when no promotion exists.
+    std::shared_ptr<DataType> Promote(
+        const std::shared_ptr<DataType>& failed,
+        const std::shared_ptr<Field>& unexpected_field) const override {
+      switch (failed->id()) {
+        case Type::NA:
+          return Infer(unexpected_field);
+
+        case Type::TIMESTAMP:
+          return utf8();
+
+        case Type::INT64:
+          return float64();
+
+        default:
+          return nullptr;
+      }
+    }
+  } impl;
+
+  return &impl;
+}
+
+}  // namespace json
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/converter.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/converter.h
new file mode 100644
index 0000000000..9a812dd3c3
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/converter.h
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+class Field;
+class MemoryPool;
+
+namespace json {
+
+/// \brief interface for conversion of Arrays
+///
+/// Converters are not required to be correct for arbitrary input- only
+/// for unconverted arrays emitted by a corresponding parser.
+class ARROW_EXPORT Converter {
+ public:
+  virtual ~Converter() = default;
+
+  /// convert an array
+  /// on failure, this converter may be promoted to another converter which
+  /// *can* convert the given input.
+  virtual Status Convert(const std::shared_ptr<Array>& in,
+                         std::shared_ptr<Array>* out) = 0;
+
+  /// the type this converter was constructed with
+  std::shared_ptr<DataType> out_type() const { return out_type_; }
+
+  /// the memory pool this converter was constructed with
+  MemoryPool* pool() { return pool_; }
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);
+
+  Converter(MemoryPool* pool, const std::shared_ptr<DataType>& out_type)
+      : pool_(pool), out_type_(out_type) {}
+
+  MemoryPool* pool_;
+  std::shared_ptr<DataType> out_type_;
+};
+
+/// \brief produce a single converter to the specified out_type
+ARROW_EXPORT Status MakeConverter(const std::shared_ptr<DataType>& out_type,
+                                  MemoryPool* pool, std::shared_ptr<Converter>* out);
+
+/// \brief interface describing which type to try first for an unexpected
+/// field (Infer) and which type to try next after a conversion fails (Promote)
+class ARROW_EXPORT PromotionGraph {
+ public:
+  virtual ~PromotionGraph() = default;
+
+  /// \brief produce a valid field which will be inferred as null
+  virtual std::shared_ptr<Field> Null(const std::string& name) const = 0;
+
+  /// \brief given an unexpected field encountered during parsing, return a type to which
+  /// it may be convertible (may return null if none is available)
+  virtual std::shared_ptr<DataType> Infer(
+      const std::shared_ptr<Field>& unexpected_field) const = 0;
+
+  /// \brief given a type to which conversion failed, return a promoted type to which
+  /// conversion may succeed (may return null if none is available)
+  virtual std::shared_ptr<DataType> Promote(
+      const std::shared_ptr<DataType>& failed,
+      const std::shared_ptr<Field>& unexpected_field) const = 0;
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(PromotionGraph);
+  PromotionGraph() = default;
+};
+
+/// \brief return a pointer to the built-in promotion graph
+ARROW_EXPORT const PromotionGraph* GetPromotionGraph();
+
+}  // namespace json
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/object_parser.cc b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_parser.cc
new file mode 100644
index 0000000000..c857cd537e
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_parser.cc
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include "arrow/json/object_parser.h" +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep + +#include <rapidjson/document.h> + +namespace arrow { +namespace json { +namespace internal { + +namespace rj = arrow::rapidjson; + +class ObjectParser::Impl { + public: + Status Parse(arrow::util::string_view json) { + document_.Parse(reinterpret_cast<const rj::Document::Ch*>(json.data()), + static_cast<size_t>(json.size())); + + if (document_.HasParseError()) { + return Status::Invalid("Json parse error (offset ", document_.GetErrorOffset(), + "): ", document_.GetParseError()); + } + if (!document_.IsObject()) { + return Status::TypeError("Not a json object"); + } + return Status::OK(); + } + + Result<std::string> GetString(const char* key) const { + if (!document_.HasMember(key)) { + return Status::KeyError("Key '", key, "' does not exist"); + } + if (!document_[key].IsString()) { + return Status::TypeError("Key '", key, "' is not a string"); + } + return document_[key].GetString(); + } + + Result<bool> GetBool(const char* key) const { + if (!document_.HasMember(key)) { + return Status::KeyError("Key '", key, "' does not exist"); + } + if (!document_[key].IsBool()) { + return Status::TypeError("Key '", key, "' is not a boolean"); + } + return document_[key].GetBool(); + } + + private: + rj::Document document_; +}; + +ObjectParser::ObjectParser() : impl_(new ObjectParser::Impl()) {} + +ObjectParser::~ObjectParser() = default; + +Status ObjectParser::Parse(arrow::util::string_view json) { return impl_->Parse(json); } + +Result<std::string> ObjectParser::GetString(const char* key) const { + return impl_->GetString(key); +} + +Result<bool> ObjectParser::GetBool(const char* key) const { return impl_->GetBool(key); } + +} // namespace internal +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/object_parser.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_parser.h new file mode 100644 index 
0000000000..ef93201651 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_parser.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <memory> + +#include "arrow/result.h" +#include "arrow/util/string_view.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace json { +namespace internal { + +/// This class is a helper to parse a json object from a string. +/// It uses rapidjson::Document in implementation. 
+class ARROW_EXPORT ObjectParser {
+ public:
+  ObjectParser();
+  ~ObjectParser();
+
+  /// Parse the given JSON text; fails if it is not valid JSON or if its
+  /// top-level value is not an object.
+  Status Parse(arrow::util::string_view json);
+
+  /// Return the string value stored under `key` (KeyError if the key is
+  /// missing, TypeError if the value is not a string).
+  Result<std::string> GetString(const char* key) const;
+  /// Return the boolean value stored under `key` (KeyError if the key is
+  /// missing, TypeError if the value is not a boolean).
+  Result<bool> GetBool(const char* key) const;
+
+ private:
+  // implementation is hidden in object_parser.cc
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace json
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/object_writer.cc b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_writer.cc
new file mode 100644
index 0000000000..06d09f81e9
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_writer.cc
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include "arrow/json/object_writer.h" +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep + +#include <rapidjson/document.h> +#include <rapidjson/stringbuffer.h> +#include <rapidjson/writer.h> + +namespace rj = arrow::rapidjson; + +namespace arrow { +namespace json { +namespace internal { + +class ObjectWriter::Impl { + public: + Impl() : root_(rj::kObjectType) {} + + void SetString(arrow::util::string_view key, arrow::util::string_view value) { + rj::Document::AllocatorType& allocator = document_.GetAllocator(); + + rj::Value str_key(key.data(), allocator); + rj::Value str_value(value.data(), allocator); + + root_.AddMember(str_key, str_value, allocator); + } + + void SetBool(arrow::util::string_view key, bool value) { + rj::Document::AllocatorType& allocator = document_.GetAllocator(); + + rj::Value str_key(key.data(), allocator); + + root_.AddMember(str_key, value, allocator); + } + + std::string Serialize() { + rj::StringBuffer buffer; + rj::Writer<rj::StringBuffer> writer(buffer); + root_.Accept(writer); + + return buffer.GetString(); + } + + private: + rj::Document document_; + rj::Value root_; +}; + +ObjectWriter::ObjectWriter() : impl_(new ObjectWriter::Impl()) {} + +ObjectWriter::~ObjectWriter() = default; + +void ObjectWriter::SetString(arrow::util::string_view key, + arrow::util::string_view value) { + impl_->SetString(key, value); +} + +void ObjectWriter::SetBool(arrow::util::string_view key, bool value) { + impl_->SetBool(key, value); +} + +std::string ObjectWriter::Serialize() { return impl_->Serialize(); } + +} // namespace internal +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/object_writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_writer.h new file mode 100644 index 0000000000..55ff0ce52b --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/object_writer.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+namespace internal {
+
+/// This class is a helper to serialize a json object to a string.
+/// It uses rapidjson in implementation.
+class ARROW_EXPORT ObjectWriter {
+ public:
+  ObjectWriter();
+  ~ObjectWriter();
+
+  /// Add a member with the given key and string value to the object.
+  void SetString(arrow::util::string_view key, arrow::util::string_view value);
+  /// Add a member with the given key and boolean value to the object.
+  void SetBool(arrow::util::string_view key, bool value);
+
+  /// Return the object serialized as JSON text.
+  std::string Serialize();
+
+ private:
+  // implementation is hidden in object_writer.cc
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace json
+}  // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/options.cc b/contrib/libs/apache/arrow/cpp/src/arrow/json/options.cc
new file mode 100644
index 0000000000..dc5e628b1f
--- /dev/null
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/options.cc
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/options.h" + +namespace arrow { +namespace json { + +ParseOptions ParseOptions::Defaults() { return ParseOptions(); } + +ReadOptions ReadOptions::Defaults() { return ReadOptions(); } + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/options.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/options.h new file mode 100644 index 0000000000..d7edab9ced --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/options.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
#pragma once

#include <cstdint>
#include <memory>

#include "arrow/json/type_fwd.h"
#include "arrow/util/visibility.h"

namespace arrow {

class DataType;
class Schema;

namespace json {

/// Policy for JSON fields that are not covered by ParseOptions::explicit_schema.
enum class UnexpectedFieldBehavior : char {
  /// Unexpected JSON fields are ignored
  Ignore,
  /// Unexpected JSON fields error out
  Error,
  /// Unexpected JSON fields are type-inferred and included in the output
  InferType
};

/// Options controlling how JSON text is parsed.
struct ARROW_EXPORT ParseOptions {
  // Parsing options

  /// Optional explicit schema (disables type inference on those fields)
  std::shared_ptr<Schema> explicit_schema;

  /// Whether objects may be printed across multiple lines (for example pretty-printed)
  ///
  /// If true, parsing may be slower.
  bool newlines_in_values = false;

  /// How JSON fields outside of explicit_schema (if given) are treated
  UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;

  /// Create parsing options with default values
  static ParseOptions Defaults();
};

/// Options controlling how JSON input is read.
struct ARROW_EXPORT ReadOptions {
  // Reader options

  /// Whether to use the global CPU thread pool
  bool use_threads = true;
  /// Block size we request from the IO layer; also determines the size of
  /// chunks when use_threads is true
  int32_t block_size = 1 << 20;  // 1 MB

  /// Create read options with default values
  static ReadOptions Defaults();
};

}  // namespace json
}  // namespace arrow
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/parser.h" + +#include <functional> +#include <limits> +#include <tuple> +#include <unordered_map> +#include <utility> +#include <vector> + +#include "arrow/json/rapidjson_defs.h" +#include "rapidjson/error/en.h" +#include "rapidjson/reader.h" + +#include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/buffer_builder.h" +#include "arrow/type.h" +#include "arrow/util/bitset_stack.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/util/make_unique.h" +#include "arrow/util/string_view.h" +#include "arrow/util/trie.h" +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::BitsetStack; +using internal::checked_cast; +using internal::make_unique; +using util::string_view; + +namespace json { + +namespace rj = arrow::rapidjson; + +template <typename... T> +static Status ParseError(T&&... 
t) { + return Status::Invalid("JSON parse error: ", std::forward<T>(t)...); +} + +const std::string& Kind::Name(Kind::type kind) { + static const std::string names[] = {"null", "boolean", "number", + "string", "array", "object"}; + + return names[kind]; +} + +const std::shared_ptr<const KeyValueMetadata>& Kind::Tag(Kind::type kind) { + static const std::shared_ptr<const KeyValueMetadata> tags[] = { + key_value_metadata({{"json_kind", Kind::Name(Kind::kNull)}}), + key_value_metadata({{"json_kind", Kind::Name(Kind::kBoolean)}}), + key_value_metadata({{"json_kind", Kind::Name(Kind::kNumber)}}), + key_value_metadata({{"json_kind", Kind::Name(Kind::kString)}}), + key_value_metadata({{"json_kind", Kind::Name(Kind::kArray)}}), + key_value_metadata({{"json_kind", Kind::Name(Kind::kObject)}}), + }; + return tags[kind]; +} + +static arrow::internal::Trie MakeFromTagTrie() { + arrow::internal::TrieBuilder builder; + for (auto kind : {Kind::kNull, Kind::kBoolean, Kind::kNumber, Kind::kString, + Kind::kArray, Kind::kObject}) { + DCHECK_OK(builder.Append(Kind::Name(kind))); + } + auto name_to_kind = builder.Finish(); + DCHECK_OK(name_to_kind.Validate()); + return name_to_kind; +} + +Kind::type Kind::FromTag(const std::shared_ptr<const KeyValueMetadata>& tag) { + static arrow::internal::Trie name_to_kind = MakeFromTagTrie(); + DCHECK_NE(tag->FindKey("json_kind"), -1); + util::string_view name = tag->value(tag->FindKey("json_kind")); + DCHECK_NE(name_to_kind.Find(name), -1); + return static_cast<Kind::type>(name_to_kind.Find(name)); +} + +Status Kind::ForType(const DataType& type, Kind::type* kind) { + struct { + Status Visit(const NullType&) { return SetKind(Kind::kNull); } + Status Visit(const BooleanType&) { return SetKind(Kind::kBoolean); } + Status Visit(const NumberType&) { return SetKind(Kind::kNumber); } + Status Visit(const TimeType&) { return SetKind(Kind::kNumber); } + Status Visit(const DateType&) { return SetKind(Kind::kNumber); } + Status Visit(const BinaryType&) { 
return SetKind(Kind::kString); } + Status Visit(const FixedSizeBinaryType&) { return SetKind(Kind::kString); } + Status Visit(const DictionaryType& dict_type) { + return Kind::ForType(*dict_type.value_type(), kind_); + } + Status Visit(const ListType&) { return SetKind(Kind::kArray); } + Status Visit(const StructType&) { return SetKind(Kind::kObject); } + Status Visit(const DataType& not_impl) { + return Status::NotImplemented("JSON parsing of ", not_impl); + } + Status SetKind(Kind::type kind) { + *kind_ = kind; + return Status::OK(); + } + Kind::type* kind_; + } visitor = {kind}; + return VisitTypeInline(type, &visitor); +} + +/// \brief ArrayBuilder for parsed but unconverted arrays +template <Kind::type> +class RawArrayBuilder; + +/// \brief packed pointer to a RawArrayBuilder +/// +/// RawArrayBuilders are stored in HandlerBase, +/// which allows storage of their indices (uint32_t) instead of a full pointer. +/// BuilderPtr is also tagged with the json kind and nullable properties +/// so those can be accessed before dereferencing the builder. 
struct BuilderPtr {
  // A default-constructed BuilderPtr is a copy of BuilderPtr::null.
  BuilderPtr() : BuilderPtr(BuilderPtr::null) {}
  BuilderPtr(Kind::type k, uint32_t i, bool n) : index(i), kind(k), nullable(n) {}

  BuilderPtr(const BuilderPtr&) = default;
  BuilderPtr& operator=(const BuilderPtr&) = default;
  BuilderPtr(BuilderPtr&&) = default;
  BuilderPtr& operator=(BuilderPtr&&) = default;

  // index of builder in its arena
  // OR the length of that builder if kind == Kind::kNull
  // (we don't allocate an arena for nulls since they're trivial)
  uint32_t index;
  Kind::type kind;
  bool nullable;

  // Equality compares kind and index only; `nullable` is not considered.
  bool operator==(BuilderPtr other) const {
    return kind == other.kind && index == other.index;
  }

  bool operator!=(BuilderPtr other) const { return !(other == *this); }

  // True unless this equals BuilderPtr::null (kind kNull with index 0).
  operator bool() const { return *this != null; }

  bool operator!() const { return *this == null; }

  // The static BuilderPtr for null type data
  static const BuilderPtr null;
};

const BuilderPtr BuilderPtr::null(Kind::kNull, 0, true);

/// \brief builder for boolean values, with a separate validity bitmap
template <>
class RawArrayBuilder<Kind::kBoolean> {
 public:
  explicit RawArrayBuilder(MemoryPool* pool)
      : data_builder_(pool), null_bitmap_builder_(pool) {}

  /// Append a valid (non-null) boolean
  Status Append(bool value) {
    RETURN_NOT_OK(data_builder_.Append(value));
    return null_bitmap_builder_.Append(true);
  }

  /// Append one null; a `false` placeholder is stored in the data buffer
  Status AppendNull() {
    RETURN_NOT_OK(data_builder_.Append(false));
    return null_bitmap_builder_.Append(false);
  }

  /// Append `count` nulls
  Status AppendNull(int64_t count) {
    RETURN_NOT_OK(data_builder_.Append(count, false));
    return null_bitmap_builder_.Append(count, false);
  }

  /// Assemble the accumulated values into a boolean array
  Status Finish(std::shared_ptr<Array>* out) {
    // length and null count are read before the builders are finished
    auto size = length();
    auto null_count = null_bitmap_builder_.false_count();
    std::shared_ptr<Buffer> data, null_bitmap;
    RETURN_NOT_OK(data_builder_.Finish(&data));
    RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
    *out = MakeArray(ArrayData::Make(boolean(), size, {null_bitmap, data}, null_count));
    return Status::OK();
  }

  int64_t length() { return null_bitmap_builder_.length(); }

 private:
  TypedBufferBuilder<bool> data_builder_;
  TypedBufferBuilder<bool> null_bitmap_builder_;
};

/// \brief builder for strings or unconverted numbers
///
/// Both of these are represented in the builder as an index only;
/// the actual characters are stored in a single StringArray (into which
/// an index refers). This means building is faster since we don't do
/// allocation for string/number characters but accessing is strided.
///
/// On completion the indices and the character storage are combined
/// into a dictionary-encoded array, which is a convenient container
/// for indices referring into another array.
class ScalarBuilder {
 public:
  explicit ScalarBuilder(MemoryPool* pool)
      : values_length_(0), data_builder_(pool), null_bitmap_builder_(pool) {}

  /// Append a valid entry
  ///
  /// \param index position of the value in the shared scalar storage
  /// \param value_length number of characters of that value, accumulated
  ///        into values_length()
  Status Append(int32_t index, int32_t value_length) {
    RETURN_NOT_OK(data_builder_.Append(index));
    values_length_ += value_length;
    return null_bitmap_builder_.Append(true);
  }

  /// Append one null; index 0 is stored as a placeholder
  Status AppendNull() {
    RETURN_NOT_OK(data_builder_.Append(0));
    return null_bitmap_builder_.Append(false);
  }

  /// Append `count` nulls
  Status AppendNull(int64_t count) {
    RETURN_NOT_OK(data_builder_.Append(count, 0));
    return null_bitmap_builder_.Append(count, false);
  }

  /// Assemble the accumulated indices into an int32 array
  Status Finish(std::shared_ptr<Array>* out) {
    auto size = length();
    auto null_count = null_bitmap_builder_.false_count();
    std::shared_ptr<Buffer> data, null_bitmap;
    RETURN_NOT_OK(data_builder_.Finish(&data));
    RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
    *out = MakeArray(ArrayData::Make(int32(), size, {null_bitmap, data}, null_count));
    return Status::OK();
  }

  int64_t length() { return null_bitmap_builder_.length(); }

  /// Sum of the value lengths passed to Append()
  int32_t values_length() { return values_length_; }

 private:
  int32_t values_length_;
  TypedBufferBuilder<int32_t> data_builder_;
  TypedBufferBuilder<bool> null_bitmap_builder_;
};

/// Numbers are stored unconverted, as indices into the scalar storage.
template <>
class RawArrayBuilder<Kind::kNumber> : public ScalarBuilder {
 public:
  using ScalarBuilder::ScalarBuilder;
};

/// Strings are stored as indices into the scalar storage.
template <>
class RawArrayBuilder<Kind::kString> : public ScalarBuilder {
 public:
  using ScalarBuilder::ScalarBuilder;
};

/// \brief builder for JSON arrays: offsets plus validity, with the element
/// values held by a separate child builder referenced through a BuilderPtr
template <>
class RawArrayBuilder<Kind::kArray> {
 public:
  explicit RawArrayBuilder(MemoryPool* pool)
      : offset_builder_(pool), null_bitmap_builder_(pool) {}

  /// Append a valid list holding `child_length` elements
  Status Append(int32_t child_length) {
    // record where this list starts, then advance past its elements
    RETURN_NOT_OK(offset_builder_.Append(offset_));
    offset_ += child_length;
    return null_bitmap_builder_.Append(true);
  }

  /// Append a null list (the running offset is not advanced)
  Status AppendNull() {
    RETURN_NOT_OK(offset_builder_.Append(offset_));
    return null_bitmap_builder_.Append(false);
  }

  /// Append `count` null lists
  Status AppendNull(int64_t count) {
    RETURN_NOT_OK(offset_builder_.Append(count, offset_));
    return null_bitmap_builder_.Append(count, false);
  }

  /// Assemble a list array
  ///
  /// \param finish_child callback used to finish the child value builder
  /// \param out the resulting array
  Status Finish(std::function<Status(BuilderPtr, std::shared_ptr<Array>*)> finish_child,
                std::shared_ptr<Array>* out) {
    // append the final offset, which closes the last list
    RETURN_NOT_OK(offset_builder_.Append(offset_));
    auto size = length();
    auto null_count = null_bitmap_builder_.false_count();
    std::shared_ptr<Buffer> offsets, null_bitmap;
    RETURN_NOT_OK(offset_builder_.Finish(&offsets));
    RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
    std::shared_ptr<Array> values;
    RETURN_NOT_OK(finish_child(value_builder_, &values));
    // the item field is tagged with the json kind of the child builder
    auto type = list(field("item", values->type(), value_builder_.nullable,
                           Kind::Tag(value_builder_.kind)));
    *out = MakeArray(ArrayData::Make(type, size, {null_bitmap, offsets}, {values->data()},
                                     null_count));
    return Status::OK();
  }

  BuilderPtr value_builder() const { return value_builder_; }

  void value_builder(BuilderPtr builder) { value_builder_ = builder; }

  int64_t length() { return null_bitmap_builder_.length(); }

 private:
  BuilderPtr value_builder_ = BuilderPtr::null;
  int32_t offset_ = 0;
  TypedBufferBuilder<int32_t> offset_builder_;
  TypedBufferBuilder<bool> null_bitmap_builder_;
};

/// \brief builder for JSON objects: a validity bitmap plus one child builder
/// (referenced through a BuilderPtr) per named field
template <>
class RawArrayBuilder<Kind::kObject> {
 public:
  explicit RawArrayBuilder(MemoryPool* pool) : null_bitmap_builder_(pool) {}

  Status Append() { return null_bitmap_builder_.Append(true); }

  Status AppendNull() { return null_bitmap_builder_.Append(false); }

  Status AppendNull(int64_t count) { return null_bitmap_builder_.Append(count, false); }

  /// Name of the field at index `i`, or an empty string if there is none.
  /// This is a linear scan over the name map.
  std::string FieldName(int i) const {
    for (const auto& name_index : name_to_index_) {
      if (name_index.second == i) {
        return name_index.first;
      }
    }
    return "";
  }

  /// Index of the field called `name`, or -1 if there is no such field
  int GetFieldIndex(const std::string& name) const {
    auto it = name_to_index_.find(name);
    if (it == name_to_index_.end()) {
      return -1;
    }
    return it->second;
  }

  /// Register a new field and return its index
  int AddField(std::string name, BuilderPtr builder) {
    auto index = num_fields();
    field_builders_.push_back(builder);
    name_to_index_.emplace(std::move(name), index);
    return index;
  }

  int num_fields() const { return static_cast<int>(field_builders_.size()); }

  BuilderPtr field_builder(int index) const { return field_builders_[index]; }

  void field_builder(int index, BuilderPtr builder) { field_builders_[index] = builder; }

  /// Assemble a struct array
  ///
  /// \param finish_child callback used to finish each field's builder
  /// \param out the resulting array
  Status Finish(std::function<Status(BuilderPtr, std::shared_ptr<Array>*)> finish_child,
                std::shared_ptr<Array>* out) {
    auto size = length();
    auto null_count = null_bitmap_builder_.false_count();
    std::shared_ptr<Buffer> null_bitmap;
    RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

    // invert name -> index into an index -> name table
    std::vector<string_view> field_names(num_fields());
    for (const auto& name_index : name_to_index_) {
      field_names[name_index.second] = name_index.first;
    }

    std::vector<std::shared_ptr<Field>> fields(num_fields());
    std::vector<std::shared_ptr<ArrayData>> child_data(num_fields());
    for (int i = 0; i < num_fields(); ++i) {
      std::shared_ptr<Array> field_values;
      RETURN_NOT_OK(finish_child(field_builders_[i], &field_values));
      child_data[i] = field_values->data();
      // each field is tagged with the json kind of its builder
      fields[i] = field(std::string(field_names[i]), field_values->type(),
                        field_builders_[i].nullable, Kind::Tag(field_builders_[i].kind));
    }

    *out = MakeArray(ArrayData::Make(struct_(std::move(fields)), size, {null_bitmap},
                                     std::move(child_data), null_count));
    return Status::OK();
  }

  int64_t length() { return null_bitmap_builder_.length(); }

 private:
  std::vector<BuilderPtr> field_builders_;
  std::unordered_map<std::string, int> name_to_index_;
  TypedBufferBuilder<bool> null_bitmap_builder_;
};

/// \brief owner of all RawArrayBuilders, kept in one vector ("arena") per kind
///
/// Builders are addressed through BuilderPtr (kind + index into the arena).
/// Pointers returned by Cast() point into a std::vector and so must be
/// re-fetched after any call that may add a builder of the same kind.
class RawBuilderSet {
 public:
  explicit RawBuilderSet(MemoryPool* pool) : pool_(pool) {}

  /// Retrieve a pointer to a builder from a BuilderPtr
  template <Kind::type kind>
  enable_if_t<kind != Kind::kNull, RawArrayBuilder<kind>*> Cast(BuilderPtr builder) {
    DCHECK_EQ(builder.kind, kind);
    return arena<kind>().data() + builder.index;
  }

  /// construct a builder of statically defined kind
  template <Kind::type kind>
  Status MakeBuilder(int64_t leading_nulls, BuilderPtr* builder) {
    builder->index = static_cast<uint32_t>(arena<kind>().size());
    builder->kind = kind;
    builder->nullable = true;
    arena<kind>().emplace_back(RawArrayBuilder<kind>(pool_));
    return Cast<kind>(*builder)->AppendNull(leading_nulls);
  }

  /// construct a builder of whatever kind corresponds to a DataType
  Status MakeBuilder(const DataType& t, int64_t leading_nulls, BuilderPtr* builder) {
    Kind::type kind;
    RETURN_NOT_OK(Kind::ForType(t, &kind));
    switch (kind) {
      case Kind::kNull:
        // null builders have no arena entry; their length is stored in `index`
        *builder = BuilderPtr(Kind::kNull, static_cast<uint32_t>(leading_nulls), true);
        return Status::OK();

      case Kind::kBoolean:
        return MakeBuilder<Kind::kBoolean>(leading_nulls, builder);

      case Kind::kNumber:
        return MakeBuilder<Kind::kNumber>(leading_nulls, builder);

      case Kind::kString:
        return MakeBuilder<Kind::kString>(leading_nulls, builder);

      case Kind::kArray: {
        RETURN_NOT_OK(MakeBuilder<Kind::kArray>(leading_nulls, builder));
        const auto& list_type = checked_cast<const ListType&>(t);

        BuilderPtr value_builder;
        RETURN_NOT_OK(MakeBuilder(*list_type.value_type(), 0, &value_builder));
        value_builder.nullable = list_type.value_field()->nullable();

        // Cast() is called only after the child was made: creating the child
        // may have grown the array arena
        Cast<Kind::kArray>(*builder)->value_builder(value_builder);
        return Status::OK();
      }
      case Kind::kObject: {
        RETURN_NOT_OK(MakeBuilder<Kind::kObject>(leading_nulls, builder));
        const auto& struct_type = checked_cast<const StructType&>(t);

        for (const auto& f : struct_type.fields()) {
          BuilderPtr field_builder;
          RETURN_NOT_OK(MakeBuilder(*f->type(), leading_nulls, &field_builder));
          field_builder.nullable = f->nullable();

          // the parent is looked up again on every iteration for the same reason
          Cast<Kind::kObject>(*builder)->AddField(f->name(), field_builder);
        }
        return Status::OK();
      }
      default:
        return Status::NotImplemented("invalid builder type");
    }
  }

  /// Appending null is slightly tricky since null count is stored inline
  /// for builders of Kind::kNull. Append nulls using this helper
  ///
  /// `parent` and `field_index` are only consulted when `builder` is of
  /// Kind::kNull, to write the updated BuilderPtr back into its parent.
  Status AppendNull(BuilderPtr parent, int field_index, BuilderPtr builder) {
    if (ARROW_PREDICT_FALSE(!builder.nullable)) {
      return ParseError("a required field was null");
    }
    switch (builder.kind) {
      case Kind::kNull: {
        DCHECK_EQ(builder, parent.kind == Kind::kArray
                               ? Cast<Kind::kArray>(parent)->value_builder()
                               : Cast<Kind::kObject>(parent)->field_builder(field_index));

        // increment null count stored inline
        builder.index += 1;

        // update the parent, since changing builder doesn't affect parent
        if (parent.kind == Kind::kArray) {
          Cast<Kind::kArray>(parent)->value_builder(builder);
        } else {
          Cast<Kind::kObject>(parent)->field_builder(field_index, builder);
        }
        return Status::OK();
      }
      case Kind::kBoolean:
        return Cast<Kind::kBoolean>(builder)->AppendNull();

      case Kind::kNumber:
        return Cast<Kind::kNumber>(builder)->AppendNull();

      case Kind::kString:
        return Cast<Kind::kString>(builder)->AppendNull();

      case Kind::kArray:
        return Cast<Kind::kArray>(builder)->AppendNull();

      case Kind::kObject: {
        auto struct_builder = Cast<Kind::kObject>(builder);
        RETURN_NOT_OK(struct_builder->AppendNull());

        // a null object also appends a null to each of its fields
        for (int i = 0; i < struct_builder->num_fields(); ++i) {
          auto field_builder = struct_builder->field_builder(i);
          RETURN_NOT_OK(AppendNull(builder, i, field_builder));
        }
        return Status::OK();
      }
      default:
        return Status::NotImplemented("invalid builder Kind");
    }
  }

  /// Finish the builder referenced by `builder` (recursing into children)
  ///
  /// \param scalar_values shared storage that number/string indices refer into
  /// \param builder the builder to finish
  /// \param out the resulting array
  Status Finish(const std::shared_ptr<Array>& scalar_values, BuilderPtr builder,
                std::shared_ptr<Array>* out) {
    auto finish_children = [this, &scalar_values](BuilderPtr child,
                                                  std::shared_ptr<Array>* out) {
      return Finish(scalar_values, child, out);
    };
    switch (builder.kind) {
      case Kind::kNull: {
        // for null builders `index` holds the length
        auto length = static_cast<int64_t>(builder.index);
        *out = std::make_shared<NullArray>(length);
        return Status::OK();
      }
      case Kind::kBoolean:
        return Cast<Kind::kBoolean>(builder)->Finish(out);

      case Kind::kNumber:
        return FinishScalar(scalar_values, Cast<Kind::kNumber>(builder), out);

      case Kind::kString:
        return FinishScalar(scalar_values, Cast<Kind::kString>(builder), out);

      case Kind::kArray:
        return Cast<Kind::kArray>(builder)->Finish(std::move(finish_children), out);

      case Kind::kObject:
        return Cast<Kind::kObject>(builder)->Finish(std::move(finish_children), out);

      default:
        return Status::NotImplemented("invalid builder kind");
    }
  }

 private:
  /// finish a column of scalar values (string or number)
  ///
  /// The result is a dictionary array: int32 indices into `scalar_values`.
  Status FinishScalar(const std::shared_ptr<Array>& scalar_values, ScalarBuilder* builder,
                      std::shared_ptr<Array>* out) {
    std::shared_ptr<Array> indices;
    // TODO(bkietz) embed builder->values_length() in this output somehow
    RETURN_NOT_OK(builder->Finish(&indices));
    auto ty = dictionary(int32(), scalar_values->type());
    *out = std::make_shared<DictionaryArray>(ty, indices, scalar_values);
    return Status::OK();
  }

  /// The arena holding all builders of the given kind
  template <Kind::type kind>
  std::vector<RawArrayBuilder<kind>>& arena() {
    return std::get<static_cast<std::size_t>(kind)>(arenas_);
  }

  MemoryPool* pool_;
  // One slot per Kind::type value; the first slot (selected by Kind::kNull) is
  // an empty tuple because null builders are not stored in an arena.
  std::tuple<std::tuple<>, std::vector<RawArrayBuilder<Kind::kBoolean>>,
             std::vector<RawArrayBuilder<Kind::kNumber>>,
             std::vector<RawArrayBuilder<Kind::kString>>,
             std::vector<RawArrayBuilder<Kind::kArray>>,
             std::vector<RawArrayBuilder<Kind::kObject>>>
      arenas_;
};

/// Three implementations are provided for BlockParser, one for each
/// UnexpectedFieldBehavior.
However most of the logic is identical in each +/// case, so the majority of the implementation is in this base class +class HandlerBase : public BlockParser, + public rj::BaseReaderHandler<rj::UTF8<>, HandlerBase> { + public: + explicit HandlerBase(MemoryPool* pool) + : BlockParser(pool), + builder_set_(pool), + field_index_(-1), + scalar_values_builder_(pool) {} + + /// Retrieve a pointer to a builder from a BuilderPtr + template <Kind::type kind> + enable_if_t<kind != Kind::kNull, RawArrayBuilder<kind>*> Cast(BuilderPtr builder) { + return builder_set_.Cast<kind>(builder); + } + + /// Accessor for a stored error Status + Status Error() { return status_; } + + /// \defgroup rapidjson-handler-interface functions expected by rj::Reader + /// + /// bool Key(const char* data, rj::SizeType size, ...) is omitted since + /// the behavior varies greatly between UnexpectedFieldBehaviors + /// + /// @{ + bool Null() { + status_ = builder_set_.AppendNull(builder_stack_.back(), field_index_, builder_); + return status_.ok(); + } + + bool Bool(bool value) { + constexpr auto kind = Kind::kBoolean; + if (ARROW_PREDICT_FALSE(builder_.kind != kind)) { + status_ = IllegallyChangedTo(kind); + return status_.ok(); + } + status_ = Cast<kind>(builder_)->Append(value); + return status_.ok(); + } + + bool RawNumber(const char* data, rj::SizeType size, ...) { + status_ = AppendScalar<Kind::kNumber>(builder_, string_view(data, size)); + return status_.ok(); + } + + bool String(const char* data, rj::SizeType size, ...) { + status_ = AppendScalar<Kind::kString>(builder_, string_view(data, size)); + return status_.ok(); + } + + bool StartObject() { + status_ = StartObjectImpl(); + return status_.ok(); + } + + bool EndObject(...) 
{ + status_ = EndObjectImpl(); + return status_.ok(); + } + + bool StartArray() { + status_ = StartArrayImpl(); + return status_.ok(); + } + + bool EndArray(rj::SizeType size) { + status_ = EndArrayImpl(size); + return status_.ok(); + } + /// @} + + /// \brief Set up builders using an expected Schema + Status Initialize(const std::shared_ptr<Schema>& s) { + auto type = struct_({}); + if (s) { + type = struct_(s->fields()); + } + return builder_set_.MakeBuilder(*type, 0, &builder_); + } + + Status Finish(std::shared_ptr<Array>* parsed) override { + std::shared_ptr<Array> scalar_values; + RETURN_NOT_OK(scalar_values_builder_.Finish(&scalar_values)); + return builder_set_.Finish(scalar_values, builder_, parsed); + } + + /// \brief Emit path of current field for debugging purposes + std::string Path() { + std::string path; + for (size_t i = 0; i < builder_stack_.size(); ++i) { + auto builder = builder_stack_[i]; + if (builder.kind == Kind::kArray) { + path += "/[]"; + } else { + auto struct_builder = Cast<Kind::kObject>(builder); + auto field_index = field_index_; + if (i + 1 < field_index_stack_.size()) { + field_index = field_index_stack_[i + 1]; + } + path += "/" + struct_builder->FieldName(field_index); + } + } + return path; + } + + protected: + template <typename Handler, typename Stream> + Status DoParse(Handler& handler, Stream&& json) { + constexpr auto parse_flags = rj::kParseIterativeFlag | rj::kParseNanAndInfFlag | + rj::kParseStopWhenDoneFlag | + rj::kParseNumbersAsStringsFlag; + + rj::Reader reader; + + for (; num_rows_ < kMaxParserNumRows; ++num_rows_) { + auto ok = reader.Parse<parse_flags>(json, handler); + switch (ok.Code()) { + case rj::kParseErrorNone: + // parse the next object + continue; + case rj::kParseErrorDocumentEmpty: + // parsed all objects, finish + return Status::OK(); + case rj::kParseErrorTermination: + // handler emitted an error + return handler.Error(); + default: + // rj emitted an error + return 
ParseError(rj::GetParseError_En(ok.Code()), " in row ", num_rows_); + } + } + return Status::Invalid("Exceeded maximum rows"); + } + + template <typename Handler> + Status DoParse(Handler& handler, const std::shared_ptr<Buffer>& json) { + RETURN_NOT_OK(ReserveScalarStorage(json->size())); + rj::MemoryStream ms(reinterpret_cast<const char*>(json->data()), json->size()); + using InputStream = rj::EncodedInputStream<rj::UTF8<>, rj::MemoryStream>; + return DoParse(handler, InputStream(ms)); + } + + /// \defgroup handlerbase-append-methods append non-nested values + /// + /// @{ + + template <Kind::type kind> + Status AppendScalar(BuilderPtr builder, string_view scalar) { + if (ARROW_PREDICT_FALSE(builder.kind != kind)) { + return IllegallyChangedTo(kind); + } + auto index = static_cast<int32_t>(scalar_values_builder_.length()); + auto value_length = static_cast<int32_t>(scalar.size()); + RETURN_NOT_OK(Cast<kind>(builder)->Append(index, value_length)); + RETURN_NOT_OK(scalar_values_builder_.Reserve(1)); + scalar_values_builder_.UnsafeAppend(scalar); + return Status::OK(); + } + + /// @} + + Status StartObjectImpl() { + constexpr auto kind = Kind::kObject; + if (ARROW_PREDICT_FALSE(builder_.kind != kind)) { + return IllegallyChangedTo(kind); + } + auto struct_builder = Cast<kind>(builder_); + absent_fields_stack_.Push(struct_builder->num_fields(), true); + StartNested(); + return struct_builder->Append(); + } + + /// \brief helper for Key() functions + /// + /// sets the field builder with name key, or returns false if + /// there is no field with that name + bool SetFieldBuilder(string_view key, bool* duplicate_keys) { + auto parent = Cast<Kind::kObject>(builder_stack_.back()); + field_index_ = parent->GetFieldIndex(std::string(key)); + if (ARROW_PREDICT_FALSE(field_index_ == -1)) { + return false; + } + *duplicate_keys = !absent_fields_stack_[field_index_]; + if (*duplicate_keys) { + status_ = ParseError("Column(", Path(), ") was specified twice in row ", num_rows_); + 
return false; + } + builder_ = parent->field_builder(field_index_); + absent_fields_stack_[field_index_] = false; + return true; + } + + Status EndObjectImpl() { + auto parent = builder_stack_.back(); + + auto expected_count = absent_fields_stack_.TopSize(); + for (int i = 0; i < expected_count; ++i) { + if (!absent_fields_stack_[i]) { + continue; + } + auto field_builder = Cast<Kind::kObject>(parent)->field_builder(i); + if (ARROW_PREDICT_FALSE(!field_builder.nullable)) { + return ParseError("a required field was absent"); + } + RETURN_NOT_OK(builder_set_.AppendNull(parent, i, field_builder)); + } + absent_fields_stack_.Pop(); + EndNested(); + return Status::OK(); + } + + Status StartArrayImpl() { + constexpr auto kind = Kind::kArray; + if (ARROW_PREDICT_FALSE(builder_.kind != kind)) { + return IllegallyChangedTo(kind); + } + StartNested(); + // append to the list builder in EndArrayImpl + builder_ = Cast<kind>(builder_)->value_builder(); + return Status::OK(); + } + + Status EndArrayImpl(rj::SizeType size) { + EndNested(); + // append to list_builder here + auto list_builder = Cast<Kind::kArray>(builder_); + return list_builder->Append(size); + } + + /// helper method for StartArray and StartObject + /// adds the current builder to a stack so its + /// children can be visited and parsed. 
+ void StartNested() { + field_index_stack_.push_back(field_index_); + field_index_ = -1; + builder_stack_.push_back(builder_); + } + + /// Helper for EndArray and EndObject: restores field_index_ and builder_ from + /// the tops of their stacks (popping both), i.e. replaces the current builder + /// with its parent so parsing of the parent can continue + void EndNested() { + field_index_ = field_index_stack_.back(); + field_index_stack_.pop_back(); + builder_ = builder_stack_.back(); + builder_stack_.pop_back(); + } + + /// Build a ParseError reporting that the column at Path() changed from the kind + /// of builder_ to illegally_changed_to in row num_rows_ + Status IllegallyChangedTo(Kind::type illegally_changed_to) { + return ParseError("Column(", Path(), ") changed from ", Kind::Name(builder_.kind), + " to ", Kind::Name(illegally_changed_to), " in row ", num_rows_); + } + + /// Reserve storage for scalars; these can occupy almost all of the JSON buffer. + /// Only the shortfall between size and the unused value-data capacity of + /// scalar_values_builder_ is reserved; nothing is done if size already fits. + Status ReserveScalarStorage(int64_t size) override { + auto available_storage = scalar_values_builder_.value_data_capacity() - + scalar_values_builder_.value_data_length(); + if (size <= available_storage) { + return Status::OK(); + } + return scalar_values_builder_.ReserveData(size - available_storage); + } + + Status status_; + RawBuilderSet builder_set_; + BuilderPtr builder_; + // top of this stack is the parent of builder_ + std::vector<BuilderPtr> builder_stack_; + // top of this stack refers to the fields of the highest *StructBuilder* + // in builder_stack_ (list builders don't have absent fields) + BitsetStack absent_fields_stack_; + // index of builder_ within its parent + int field_index_; + // field indices saved by StartNested() (which then resets field_index_ to -1) + // and restored by EndNested() + std::vector<int> field_index_stack_; + StringBuilder scalar_values_builder_; +}; + +template <UnexpectedFieldBehavior> +class Handler; + +template <> +class Handler<UnexpectedFieldBehavior::Error> : public HandlerBase { + public: + using HandlerBase::HandlerBase; + + Status Parse(const std::shared_ptr<Buffer>& json) override { + return DoParse(*this, json); + } + + /// \ingroup rapidjson-handler-interface + /// + /// if an unexpected field is encountered, emit a parse error and bail + /// + /// NOTE(review): the success test below is wrapped in ARROW_PREDICT_FALSE while + /// the Ignore and InferType handlers wrap the same call in ARROW_PREDICT_TRUE; + /// presumably this is only a branch hint -- confirm it is intentional + bool
Key(const char* key, rj::SizeType len, ...) { + bool duplicate_keys = false; + if (ARROW_PREDICT_FALSE(SetFieldBuilder(string_view(key, len), &duplicate_keys))) { + return true; + } + if (!duplicate_keys) { + status_ = ParseError("unexpected field"); + } + return false; + } +}; + +template <> +class Handler<UnexpectedFieldBehavior::Ignore> : public HandlerBase { + public: + using HandlerBase::HandlerBase; + + Status Parse(const std::shared_ptr<Buffer>& json) override { + return DoParse(*this, json); + } + + bool Null() { + if (Skipping()) { + return true; + } + return HandlerBase::Null(); + } + + bool Bool(bool value) { + if (Skipping()) { + return true; + } + return HandlerBase::Bool(value); + } + + bool RawNumber(const char* data, rj::SizeType size, ...) { + if (Skipping()) { + return true; + } + return HandlerBase::RawNumber(data, size); + } + + bool String(const char* data, rj::SizeType size, ...) { + if (Skipping()) { + return true; + } + return HandlerBase::String(data, size); + } + + bool StartObject() { + ++depth_; + if (Skipping()) { + return true; + } + return HandlerBase::StartObject(); + } + + /// \ingroup rapidjson-handler-interface + /// + /// if an unexpected field is encountered, skip until its value has been consumed + /// (skipping is started by recording the current depth_ in skip_depth_) + bool Key(const char* key, rj::SizeType len, ...) { + MaybeStopSkipping(); + if (Skipping()) { + return true; + } + bool duplicate_keys = false; + if (ARROW_PREDICT_TRUE(SetFieldBuilder(string_view(key, len), &duplicate_keys))) { + return true; + } + if (ARROW_PREDICT_FALSE(duplicate_keys)) { + return false; + } + skip_depth_ = depth_; + return true; + } + + bool EndObject(...)
{ + MaybeStopSkipping(); + --depth_; + if (Skipping()) { + return true; + } + return HandlerBase::EndObject(); + } + + bool StartArray() { + if (Skipping()) { + return true; + } + return HandlerBase::StartArray(); + } + + bool EndArray(rj::SizeType size) { + if (Skipping()) { + return true; + } + return HandlerBase::EndArray(size); + } + + private: + // true while the object depth has reached the depth recorded in skip_depth_, + // i.e. while events belong to the value of an unexpected field + bool Skipping() { return depth_ >= skip_depth_; } + + // leave skipping mode once the parser is back at the depth where skipping began + void MaybeStopSkipping() { + if (skip_depth_ == depth_) { + skip_depth_ = std::numeric_limits<int>::max(); + } + } + + // number of currently open JSON objects (see StartObject/EndObject) + int depth_ = 0; + // depth at which skipping began; max() means "not skipping" + int skip_depth_ = std::numeric_limits<int>::max(); +}; + +template <> +class Handler<UnexpectedFieldBehavior::InferType> : public HandlerBase { + public: + using HandlerBase::HandlerBase; + + Status Parse(const std::shared_ptr<Buffer>& json) override { + return DoParse(*this, json); + } + + bool Bool(bool value) { + if (ARROW_PREDICT_FALSE(MaybePromoteFromNull<Kind::kBoolean>())) { + return false; + } + return HandlerBase::Bool(value); + } + + bool RawNumber(const char* data, rj::SizeType size, ...) { + if (ARROW_PREDICT_FALSE(MaybePromoteFromNull<Kind::kNumber>())) { + return false; + } + return HandlerBase::RawNumber(data, size); + } + + bool String(const char* data, rj::SizeType size, ...) { + if (ARROW_PREDICT_FALSE(MaybePromoteFromNull<Kind::kString>())) { + return false; + } + return HandlerBase::String(data, size); + } + + bool StartObject() { + if (ARROW_PREDICT_FALSE(MaybePromoteFromNull<Kind::kObject>())) { + return false; + } + return HandlerBase::StartObject(); + } + + /// \ingroup rapidjson-handler-interface + /// + /// If an unexpected field is encountered, add a new builder to + /// the current parent builder. It is added as a NullBuilder with + /// (parent.length - 1) leading nulls. The next value parsed + /// will probably trigger promotion of this field from null + bool Key(const char* key, rj::SizeType len, ...)
{ + bool duplicate_keys = false; + if (ARROW_PREDICT_TRUE(SetFieldBuilder(string_view(key, len), &duplicate_keys))) { + return true; + } + if (ARROW_PREDICT_FALSE(duplicate_keys)) { + return false; + } + auto struct_builder = Cast<Kind::kObject>(builder_stack_.back()); + auto leading_nulls = static_cast<uint32_t>(struct_builder->length() - 1); + builder_ = BuilderPtr(Kind::kNull, leading_nulls, true); + field_index_ = struct_builder->AddField(std::string(key, len), builder_); + return true; + } + + bool StartArray() { + if (ARROW_PREDICT_FALSE(MaybePromoteFromNull<Kind::kArray>())) { + return false; + } + return HandlerBase::StartArray(); + } + + private: + // If builder_ is still of kind kNull, replace it with a builder of `kind` made by + // builder_set_.MakeBuilder and re-register it with its parent list or struct. + // The parent pointer is re-obtained with Cast after MakeBuilder (presumably + // because MakeBuilder can invalidate it -- confirm). + // return true if a terminal error was encountered (status_ holds the error) + template <Kind::type kind> + bool MaybePromoteFromNull() { + if (ARROW_PREDICT_TRUE(builder_.kind != Kind::kNull)) { + return false; + } + auto parent = builder_stack_.back(); + if (parent.kind == Kind::kArray) { + auto list_builder = Cast<Kind::kArray>(parent); + DCHECK_EQ(list_builder->value_builder(), builder_); + status_ = builder_set_.MakeBuilder<kind>(builder_.index, &builder_); + if (ARROW_PREDICT_FALSE(!status_.ok())) { + return true; + } + list_builder = Cast<Kind::kArray>(parent); + list_builder->value_builder(builder_); + } else { + auto struct_builder = Cast<Kind::kObject>(parent); + DCHECK_EQ(struct_builder->field_builder(field_index_), builder_); + status_ = builder_set_.MakeBuilder<kind>(builder_.index, &builder_); + if (ARROW_PREDICT_FALSE(!status_.ok())) { + return true; + } + struct_builder = Cast<Kind::kObject>(parent); + struct_builder->field_builder(field_index_, builder_); + } + return false; + } +}; + +// Construct the Handler specialization matching options.unexpected_field_behavior +// and initialize it with options.explicit_schema. A DCHECK requires an explicit +// schema unless the behavior is InferType. +Status BlockParser::Make(MemoryPool* pool, const ParseOptions& options, + std::unique_ptr<BlockParser>* out) { + DCHECK(options.unexpected_field_behavior == UnexpectedFieldBehavior::InferType || + options.explicit_schema != nullptr); + + switch (options.unexpected_field_behavior) { + case UnexpectedFieldBehavior::Ignore: { + *out =
make_unique<Handler<UnexpectedFieldBehavior::Ignore>>(pool); + break; + } + case UnexpectedFieldBehavior::Error: { + *out = make_unique<Handler<UnexpectedFieldBehavior::Error>>(pool); + break; + } + case UnexpectedFieldBehavior::InferType: + *out = make_unique<Handler<UnexpectedFieldBehavior::InferType>>(pool); + break; + } + return static_cast<HandlerBase&>(**out).Initialize(options.explicit_schema); +} + +// Convenience overload that uses default_memory_pool() +Status BlockParser::Make(const ParseOptions& options, std::unique_ptr<BlockParser>* out) { + return BlockParser::Make(default_memory_pool(), options, out); +} + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/parser.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/parser.h new file mode 100644 index 0000000000..4dd14e4b80 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/parser.h @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#pragma once + +#include <memory> +#include <string> + +#include "arrow/json/options.h" +#include "arrow/status.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class Buffer; +class MemoryPool; +class KeyValueMetadata; +class ResizableBuffer; + +namespace json { + +/// \brief The kinds of JSON value distinguished by the parser, with helpers to name a +/// kind, convert it to and from a KeyValueMetadata tag, and derive it from a DataType +struct Kind { + enum type : uint8_t { kNull, kBoolean, kNumber, kString, kArray, kObject }; + + static const std::string& Name(Kind::type); + + static const std::shared_ptr<const KeyValueMetadata>& Tag(Kind::type); + + static Kind::type FromTag(const std::shared_ptr<const KeyValueMetadata>& tag); + + static Status ForType(const DataType& type, Kind::type* kind); +}; + +/// NOTE(review): presumably the maximum number of rows a single BlockParser accepts; +/// the code enforcing it is not in this header -- confirm +constexpr int32_t kMaxParserNumRows = 100000; + +/// \class BlockParser +/// \brief A reusable block-based parser for JSON data +/// +/// The parser takes a block of newline-delimited JSON data and extracts Arrays +/// of unconverted strings which can be fed to a Converter to obtain a usable Array. +/// +/// Note that in addition to parse errors (such as malformed JSON) some conversion +/// errors are caught at parse time: +/// - A null value in a non-nullable column +/// - Change in the JSON kind of a column. For example, if an explicit schema is provided +/// which stipulates that field "a" is integral, a row of {"a": "not a number"} will +/// result in an error. This also applies to fields outside an explicit schema.
+class ARROW_EXPORT BlockParser { + public: + virtual ~BlockParser() = default; + + /// \brief Reserve storage for scalars parsed from a block of JSON + virtual Status ReserveScalarStorage(int64_t nbytes) = 0; + + /// \brief Parse a block of data + virtual Status Parse(const std::shared_ptr<Buffer>& json) = 0; + + /// \brief Extract parsed data + virtual Status Finish(std::shared_ptr<Array>* parsed) = 0; + + /// \brief Return the number of parsed rows + int32_t num_rows() const { return num_rows_; } + + /// \brief Construct a BlockParser + /// + /// \param[in] pool MemoryPool to use when constructing parsed array + /// \param[in] options ParseOptions to use when parsing JSON + /// \param[out] out constructed BlockParser + static Status Make(MemoryPool* pool, const ParseOptions& options, + std::unique_ptr<BlockParser>* out); + + /// \brief Construct a BlockParser without specifying a MemoryPool + /// + /// \param[in] options ParseOptions to use when parsing JSON + /// \param[out] out constructed BlockParser + static Status Make(const ParseOptions& options, std::unique_ptr<BlockParser>* out); + + protected: + ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); + + explicit BlockParser(MemoryPool* pool) : pool_(pool) {} + + MemoryPool* pool_; + int32_t num_rows_ = 0; +}; + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/rapidjson_defs.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/rapidjson_defs.h new file mode 100644 index 0000000000..9ed81d000c --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/rapidjson_defs.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Include this file before including any RapidJSON headers. + +#pragma once + +#define RAPIDJSON_HAS_STDSTRING 1 +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 + +// rapidjson will be defined in namespace arrow::rapidjson +#define RAPIDJSON_NAMESPACE arrow::rapidjson +#define RAPIDJSON_NAMESPACE_BEGIN \ + namespace arrow { \ + namespace rapidjson { +#define RAPIDJSON_NAMESPACE_END \ + } \ + } + +// enable SIMD whitespace skipping, if available +#if defined(ARROW_HAVE_SSE4_2) +#define RAPIDJSON_SSE2 1 +#define RAPIDJSON_SSE42 1 +#endif + +#if defined(ARROW_HAVE_NEON) +#define RAPIDJSON_NEON 1 +#endif diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/json/reader.cc new file mode 100644 index 0000000000..51c77fa4df --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/reader.cc @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/json/reader.h" + +#include <utility> +#include <vector> + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/io/interfaces.h" +#include "arrow/json/chunked_builder.h" +#include "arrow/json/chunker.h" +#include "arrow/json/converter.h" +#include "arrow/json/parser.h" +#include "arrow/record_batch.h" +#include "arrow/table.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/iterator.h" +#include "arrow/util/logging.h" +#include "arrow/util/string_view.h" +#include "arrow/util/task_group.h" +#include "arrow/util/thread_pool.h" + +namespace arrow { + +using util::string_view; + +using internal::checked_cast; +using internal::GetCpuThreadPool; +using internal::TaskGroup; +using internal::ThreadPool; + +namespace json { + +/// TableReader implementation: reads blocks from an input stream, splits them into +/// whole JSON objects with a Chunker, parses each block in a task appended to +/// task_group_ and assembles the parsed arrays with a ChunkedArrayBuilder. +class TableReaderImpl : public TableReader, + public std::enable_shared_from_this<TableReaderImpl> { + public: + TableReaderImpl(MemoryPool* pool, const ReadOptions& read_options, + const ParseOptions& parse_options, + std::shared_ptr<TaskGroup> task_group) + : pool_(pool), + read_options_(read_options), + parse_options_(parse_options), + chunker_(MakeChunker(parse_options_)), + task_group_(std::move(task_group)) {} + + /// Wrap input in a block iterator (blocks of read_options_.block_size) with + /// readahead sized by task_group_->parallelism(), stored in block_iterator_ + Status Init(std::shared_ptr<io::InputStream> input) { + ARROW_ASSIGN_OR_RAISE(auto it, + io::MakeInputStreamIterator(input, read_options_.block_size)); + return MakeReadaheadIterator(std::move(it), task_group_->parallelism()) + .Value(&block_iterator_); + } + + /// Read every block from block_iterator_, append one parse task per block to + /// task_group_, then finish builder_ and convert the result to a Table. + /// Returns Status::Invalid("Empty JSON file") if the first block is null. + Result<std::shared_ptr<Table>> Read() override { +
RETURN_NOT_OK(MakeBuilder()); + + ARROW_ASSIGN_OR_RAISE(auto block, block_iterator_.Next()); + if (block == nullptr) { + return Status::Invalid("Empty JSON file"); + } + + auto self = shared_from_this(); + auto empty = std::make_shared<Buffer>(""); + + int64_t block_index = 0; + std::shared_ptr<Buffer> partial = empty; + + while (block != nullptr) { + std::shared_ptr<Buffer> next_block, whole, completion, next_partial; + + ARROW_ASSIGN_OR_RAISE(next_block, block_iterator_.Next()); + + if (next_block == nullptr) { + // End of file reached => compute completion from penultimate block + RETURN_NOT_OK(chunker_->ProcessFinal(partial, block, &completion, &whole)); + } else { + std::shared_ptr<Buffer> starts_with_whole; + // Get completion of partial from previous block. + RETURN_NOT_OK(chunker_->ProcessWithPartial(partial, block, &completion, + &starts_with_whole)); + + // Get all whole objects entirely inside the current buffer + RETURN_NOT_OK(chunker_->Process(starts_with_whole, &whole, &next_partial)); + } + + // Launch parse task (the captured `self` shared_ptr keeps this reader alive for it) + task_group_->Append([self, partial, completion, whole, block_index] { + return self->ParseAndInsert(partial, completion, whole, block_index); + }); + block_index++; + + partial = next_partial; + block = next_block; + } + + std::shared_ptr<ChunkedArray> array; + RETURN_NOT_OK(builder_->Finish(&array)); + return Table::FromChunkedStructArray(array); + } + + private: + /// Create builder_ for a struct of the explicit schema's fields (or an empty struct + /// when no schema is given); a promotion graph is passed only for InferType + Status MakeBuilder() { + auto type = parse_options_.explicit_schema + ? struct_(parse_options_.explicit_schema->fields()) + : struct_({}); + + auto promotion_graph = + parse_options_.unexpected_field_behavior == UnexpectedFieldBehavior::InferType + ?
GetPromotionGraph() + : nullptr; + + return MakeChunkedArrayBuilder(task_group_, pool_, promotion_graph, type, &builder_); + } + + /// Parse one block with a fresh BlockParser and insert the result into builder_ at + /// block_index. The object straddling the block boundary (partial + completion) is + /// parsed first, then the whole objects contained in the block. + Status ParseAndInsert(const std::shared_ptr<Buffer>& partial, + const std::shared_ptr<Buffer>& completion, + const std::shared_ptr<Buffer>& whole, int64_t block_index) { + std::unique_ptr<BlockParser> parser; + RETURN_NOT_OK(BlockParser::Make(pool_, parse_options_, &parser)); + RETURN_NOT_OK(parser->ReserveScalarStorage(partial->size() + completion->size() + + whole->size())); + + if (partial->size() != 0 || completion->size() != 0) { + // buffers are concatenated only when both pieces are non-empty + std::shared_ptr<Buffer> straddling; + if (partial->size() == 0) { + straddling = completion; + } else if (completion->size() == 0) { + straddling = partial; + } else { + ARROW_ASSIGN_OR_RAISE(straddling, + ConcatenateBuffers({partial, completion}, pool_)); + } + RETURN_NOT_OK(parser->Parse(straddling)); + } + + if (whole->size() != 0) { + RETURN_NOT_OK(parser->Parse(whole)); + } + + std::shared_ptr<Array> parsed; + RETURN_NOT_OK(parser->Finish(&parsed)); + builder_->Insert(block_index, field("", parsed->type()), parsed); + return Status::OK(); + } + + MemoryPool* pool_; + ReadOptions read_options_; + ParseOptions parse_options_; + std::unique_ptr<Chunker> chunker_; + std::shared_ptr<TaskGroup> task_group_; + Iterator<std::shared_ptr<Buffer>> block_iterator_; + std::shared_ptr<ChunkedArrayBuilder> builder_; +}; + +// Deprecated out-parameter form; forwards to the Result-returning Read() +Status TableReader::Read(std::shared_ptr<Table>* out) { return Read().Value(out); } + +// Create a TableReaderImpl with a threaded task group on the CPU thread pool when +// read_options.use_threads is set, otherwise a serial one, then Init() it with input +Result<std::shared_ptr<TableReader>> TableReader::Make( + MemoryPool* pool, std::shared_ptr<io::InputStream> input, + const ReadOptions& read_options, const ParseOptions& parse_options) { + std::shared_ptr<TableReaderImpl> ptr; + if (read_options.use_threads) { + ptr = std::make_shared<TableReaderImpl>(pool, read_options, parse_options, + TaskGroup::MakeThreaded(GetCpuThreadPool())); + } else { + ptr = std::make_shared<TableReaderImpl>(pool, read_options, parse_options, + TaskGroup::MakeSerial()); + } +
RETURN_NOT_OK(ptr->Init(input)); + return ptr; +} + +// Deprecated out-parameter form; forwards to the Result-returning Make() +Status TableReader::Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input, + const ReadOptions& read_options, + const ParseOptions& parse_options, + std::shared_ptr<TableReader>* out) { + return TableReader::Make(pool, input, read_options, parse_options).Value(out); +} + +// Parse a single buffer of JSON with a BlockParser and convert it into a RecordBatch, +// using a serial task group and the default memory pool. +// NOTE(review): the parsed array is inserted with field("", type), the schema-derived +// type, whereas TableReaderImpl::ParseAndInsert uses parsed->type() -- confirm intended +Result<std::shared_ptr<RecordBatch>> ParseOne(ParseOptions options, + std::shared_ptr<Buffer> json) { + std::unique_ptr<BlockParser> parser; + RETURN_NOT_OK(BlockParser::Make(options, &parser)); + RETURN_NOT_OK(parser->Parse(json)); + std::shared_ptr<Array> parsed; + RETURN_NOT_OK(parser->Finish(&parsed)); + + auto type = + options.explicit_schema ? struct_(options.explicit_schema->fields()) : struct_({}); + auto promotion_graph = + options.unexpected_field_behavior == UnexpectedFieldBehavior::InferType + ? GetPromotionGraph() + : nullptr; + std::shared_ptr<ChunkedArrayBuilder> builder; + RETURN_NOT_OK(MakeChunkedArrayBuilder(TaskGroup::MakeSerial(), default_memory_pool(), + promotion_graph, type, &builder)); + + builder->Insert(0, field("", type), parsed); + std::shared_ptr<ChunkedArray> converted_chunked; + RETURN_NOT_OK(builder->Finish(&converted_chunked)); + const auto& converted = checked_cast<const StructArray&>(*converted_chunked->chunk(0)); + + std::vector<std::shared_ptr<Array>> columns(converted.num_fields()); + for (int i = 0; i < converted.num_fields(); ++i) { + columns[i] = converted.field(i); + } + return RecordBatch::Make(schema(converted.type()->fields()), converted.length(), + std::move(columns)); +} + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/reader.h new file mode 100644 index 0000000000..c40338c1e1 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/reader.h @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <memory> + +#include "arrow/json/options.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Table; +class RecordBatch; +class Array; +class DataType; + +namespace io { +class InputStream; +} // namespace io + +namespace json { + +/// A class that reads an entire JSON file into an Arrow Table +/// +/// The file is expected to consist of individual line-separated JSON objects +class ARROW_EXPORT TableReader { + public: + virtual ~TableReader() = default; + + /// Read the entire JSON file and convert it to an Arrow Table + virtual Result<std::shared_ptr<Table>> Read() = 0; + + ARROW_DEPRECATED("Use Result-returning version") + Status Read(std::shared_ptr<Table>* out); + + /// Create a TableReader instance + static Result<std::shared_ptr<TableReader>> Make(MemoryPool* pool, + std::shared_ptr<io::InputStream> input, + const ReadOptions&, + const ParseOptions&); + + ARROW_DEPRECATED("Use Result-returning version") + static Status Make(MemoryPool* pool, std::shared_ptr<io::InputStream> input, + const ReadOptions&, const ParseOptions&, + std::shared_ptr<TableReader>* out); +}; + +/// Parse a single buffer of JSON into a RecordBatch +ARROW_EXPORT
Result<std::shared_ptr<RecordBatch>> ParseOne(ParseOptions options, + std::shared_ptr<Buffer> json); + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/json/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/json/type_fwd.h new file mode 100644 index 0000000000..67e2e1bb40 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/json/type_fwd.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Forward declarations of the arrow::json reader types. +// NOTE(review): this header has no #pragma once or include guard; it contains only +// forward declarations. + +namespace arrow { +namespace json { + +class TableReader; +struct ReadOptions; +struct ParseOptions; + +} // namespace json +} // namespace arrow diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitset_stack.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitset_stack.h new file mode 100644 index 0000000000..addded9494 --- /dev/null +++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitset_stack.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <algorithm> +#include <array> +#include <bitset> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <memory> +#include <string> +#include <type_traits> +#include <utility> +#include <vector> + +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/type_fwd.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/compare.h" +#include "arrow/util/functional.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_builder.h" +#include "arrow/util/string_view.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +/// \brief Store a stack of bitsets efficiently. The top bitset may be +/// accessed and its bits may be modified, but it may not be resized.
+class BitsetStack { + public: + using reference = typename std::vector<bool>::reference; + + /// \brief push a bitset onto the stack + /// \param size number of bits in the next bitset + /// \param value initial value for bits in the pushed bitset + void Push(int size, bool value) { + offsets_.push_back(bit_count()); + bits_.resize(bit_count() + size, value); + } + + /// \brief number of bits in the bitset at the top of the stack + /// (0 if the stack is empty) + int TopSize() const { + if (offsets_.size() == 0) return 0; + return bit_count() - offsets_.back(); + } + + /// \brief pop a bitset off the stack + /// \pre the stack is non-empty (offsets_.back() is read unconditionally) + void Pop() { + bits_.resize(offsets_.back()); + offsets_.pop_back(); + } + + /// \brief get the value of a bit in the top bitset + /// \param i index of the bit to access (not bounds-checked; stack must be non-empty) + bool operator[](int i) const { return bits_[offsets_.back() + i]; } + + /// \brief get a mutable reference to a bit in the top bitset + /// \param i index of the bit to access (not bounds-checked; stack must be non-empty) + reference operator[](int i) { return bits_[offsets_.back() + i]; } + + private: + int bit_count() const { return static_cast<int>(bits_.size()); } + // bits of all stacked bitsets, stored back to back + std::vector<bool> bits_; + // index in bits_ at which each stacked bitset begins + std::vector<int> offsets_; +}; + +} // namespace internal +} // namespace arrow diff --git a/contrib/libs/apache/arrow/src/arrow/util/config.h b/contrib/libs/apache/arrow/src/arrow/util/config.h index 2d46017e47..4e002e3d29 100644 --- a/contrib/libs/apache/arrow/src/arrow/util/config.h +++ b/contrib/libs/apache/arrow/src/arrow/util/config.h @@ -36,11 +36,11 @@ #define ARROW_COMPUTE #define ARROW_CSV -/* #undef ARROW_DATASET */ -/* #undef ARROW_FILESYSTEM */ +#define ARROW_DATASET +#define ARROW_FILESYSTEM /* #undef ARROW_FLIGHT */ #define ARROW_IPC -/* #undef ARROW_JSON */ +#define ARROW_JSON /* #undef ARROW_S3 */ #ifdef __GNUC__ diff --git a/contrib/libs/rapidjson/include/rapidjson/document.h b/contrib/libs/rapidjson/include/rapidjson/document.h new file mode 100644 index 0000000000..a2b044c8da --- /dev/null +++ b/contrib/libs/rapidjson/include/rapidjson/document.h @@ -0,0
+1,2602 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_DOCUMENT_H_ +#define RAPIDJSON_DOCUMENT_H_ + +/*! \file document.h */ + +#include "reader.h" +#include "internal/meta.h" +#include "internal/strfunc.h" +#include "memorystream.h" +#include "encodedstream.h" +#include <new> // placement new +#include <limits> + +RAPIDJSON_DIAG_PUSH +#ifdef _MSC_VER +RAPIDJSON_DIAG_OFF(4127) // conditional expression is constant +RAPIDJSON_DIAG_OFF(4244) // conversion from kXxxFlags to 'uint16_t', possible loss of data +#ifdef _MINWINDEF_ // see: http://stackoverflow.com/questions/22744262/cant-call-stdmax-because-minwindef-h-defines-max +#ifndef NOMINMAX +#pragma push_macro("min") +#pragma push_macro("max") +#undef min +#undef max +#endif +#endif +#endif + +#ifdef __clang__ +RAPIDJSON_DIAG_OFF(padded) +RAPIDJSON_DIAG_OFF(switch-enum) +RAPIDJSON_DIAG_OFF(c++98-compat) +#endif + +#ifdef __GNUC__ +RAPIDJSON_DIAG_OFF(effc++) +#if __GNUC__ >= 6 +RAPIDJSON_DIAG_OFF(terminate) // ignore throwing RAPIDJSON_ASSERT in RAPIDJSON_NOEXCEPT functions +#endif +#endif // __GNUC__ + +#ifndef RAPIDJSON_NOMEMBERITERATORCLASS +#include <iterator> // std::iterator, std::random_access_iterator_tag +#endif + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS +#include <utility> // std::move +#endif + +RAPIDJSON_NAMESPACE_BEGIN + 
+// Forward declaration. +template <typename Encoding, typename Allocator> +class GenericValue; + +template <typename Encoding, typename Allocator, typename StackAllocator> +class GenericDocument; + +//! Name-value pair in a JSON object value. +/*! + This class was internal to GenericValue. It used to be a inner struct. + But a compiler (IBM XL C/C++ for AIX) have reported to have problem with that so it moved as a namespace scope struct. + https://code.google.com/p/rapidjson/issues/detail?id=64 +*/ +template <typename Encoding, typename Allocator> +struct GenericMember { + GenericValue<Encoding, Allocator> name; //!< name of member (must be a string) + GenericValue<Encoding, Allocator> value; //!< value of member. +}; + +/////////////////////////////////////////////////////////////////////////////// +// GenericMemberIterator + +#ifndef RAPIDJSON_NOMEMBERITERATORCLASS + +//! (Constant) member iterator for a JSON object value +/*! + \tparam Const Is this a constant iterator? + \tparam Encoding Encoding of the value. (Even non-string values need to have the same encoding in a document) + \tparam Allocator Allocator type for allocating memory of object, array and string. + + This class implements a Random Access Iterator for GenericMember elements + of a GenericValue, see ISO/IEC 14882:2003(E) C++ standard, 24.1 [lib.iterator.requirements]. + + \note This iterator implementation is mainly intended to avoid implicit + conversions from iterator values to \c NULL, + e.g. from GenericValue::FindMember. + + \note Define \c RAPIDJSON_NOMEMBERITERATORCLASS to fall back to a + pointer-based implementation, if your platform doesn't provide + the C++ <iterator> header. 
+ + \see GenericMember, GenericValue::MemberIterator, GenericValue::ConstMemberIterator + */ +template <bool Const, typename Encoding, typename Allocator> +class GenericMemberIterator + : public std::iterator<std::random_access_iterator_tag + , typename internal::MaybeAddConst<Const,GenericMember<Encoding,Allocator> >::Type> { + + friend class GenericValue<Encoding,Allocator>; + template <bool, typename, typename> friend class GenericMemberIterator; + + typedef GenericMember<Encoding,Allocator> PlainType; + typedef typename internal::MaybeAddConst<Const,PlainType>::Type ValueType; + typedef std::iterator<std::random_access_iterator_tag,ValueType> BaseType; + +public: + //! Iterator type itself + typedef GenericMemberIterator Iterator; + //! Constant iterator type + typedef GenericMemberIterator<true,Encoding,Allocator> ConstIterator; + //! Non-constant iterator type + typedef GenericMemberIterator<false,Encoding,Allocator> NonConstIterator; + + //! Pointer to (const) GenericMember + typedef typename BaseType::pointer Pointer; + //! Reference to (const) GenericMember + typedef typename BaseType::reference Reference; + //! Signed integer type (e.g. \c ptrdiff_t) + typedef typename BaseType::difference_type DifferenceType; + + //! Default constructor (singular value) + /*! Creates an iterator pointing to no element. + \note All operations, except for comparisons, are undefined on such values. + */ + GenericMemberIterator() : ptr_() {} + + //! Iterator conversions to more const + /*! + \param it (Non-const) iterator to copy from + + Allows the creation of an iterator from another GenericMemberIterator + that is "less const". Especially, creating a non-constant iterator + from a constant iterator are disabled: + \li const -> non-const (not ok) + \li const -> const (ok) + \li non-const -> const (ok) + \li non-const -> non-const (ok) + + \note If the \c Const template parameter is already \c false, this + constructor effectively defines a regular copy-constructor. 
+ Otherwise, the copy constructor is implicitly defined. + */ + GenericMemberIterator(const NonConstIterator & it) : ptr_(it.ptr_) {} + Iterator& operator=(const NonConstIterator & it) { ptr_ = it.ptr_; return *this; } + + //! @name stepping + //@{ + Iterator& operator++(){ ++ptr_; return *this; } + Iterator& operator--(){ --ptr_; return *this; } + Iterator operator++(int){ Iterator old(*this); ++ptr_; return old; } + Iterator operator--(int){ Iterator old(*this); --ptr_; return old; } + //@} + + //! @name increment/decrement + //@{ + Iterator operator+(DifferenceType n) const { return Iterator(ptr_+n); } + Iterator operator-(DifferenceType n) const { return Iterator(ptr_-n); } + + Iterator& operator+=(DifferenceType n) { ptr_+=n; return *this; } + Iterator& operator-=(DifferenceType n) { ptr_-=n; return *this; } + //@} + + //! @name relations + //@{ + bool operator==(ConstIterator that) const { return ptr_ == that.ptr_; } + bool operator!=(ConstIterator that) const { return ptr_ != that.ptr_; } + bool operator<=(ConstIterator that) const { return ptr_ <= that.ptr_; } + bool operator>=(ConstIterator that) const { return ptr_ >= that.ptr_; } + bool operator< (ConstIterator that) const { return ptr_ < that.ptr_; } + bool operator> (ConstIterator that) const { return ptr_ > that.ptr_; } + //@} + + //! @name dereference + //@{ + Reference operator*() const { return *ptr_; } + Pointer operator->() const { return ptr_; } + Reference operator[](DifferenceType n) const { return ptr_[n]; } + //@} + + //! Distance + DifferenceType operator-(ConstIterator that) const { return ptr_-that.ptr_; } + +private: + //! Internal constructor from plain pointer + explicit GenericMemberIterator(Pointer p) : ptr_(p) {} + + Pointer ptr_; //!< raw pointer +}; + +#else // RAPIDJSON_NOMEMBERITERATORCLASS + +// class-based member iterator implementation disabled, use plain pointers + +template <bool Const, typename Encoding, typename Allocator> +struct GenericMemberIterator; + +//! 
non-const GenericMemberIterator +template <typename Encoding, typename Allocator> +struct GenericMemberIterator<false,Encoding,Allocator> { + //! use plain pointer as iterator type + typedef GenericMember<Encoding,Allocator>* Iterator; +}; +//! const GenericMemberIterator +template <typename Encoding, typename Allocator> +struct GenericMemberIterator<true,Encoding,Allocator> { + //! use plain const pointer as iterator type + typedef const GenericMember<Encoding,Allocator>* Iterator; +}; + +#endif // RAPIDJSON_NOMEMBERITERATORCLASS + +/////////////////////////////////////////////////////////////////////////////// +// GenericStringRef + +//! Reference to a constant string (not taking a copy) +/*! + \tparam CharType character type of the string + + This helper class is used to automatically infer constant string + references for string literals, especially from \c const \b (!) + character arrays. + + The main use is for creating JSON string values without copying the + source string via an \ref Allocator. This requires that the referenced + string pointers have a sufficient lifetime, which exceeds the lifetime + of the associated GenericValue. + + \b Example + \code + Value v("foo"); // ok, no need to copy & calculate length + const char foo[] = "foo"; + v.SetString(foo); // ok + + const char* bar = foo; + // Value x(bar); // not ok, can't rely on bar's lifetime + Value x(StringRef(bar)); // lifetime explicitly guaranteed by user + Value y(StringRef(bar, 3)); // ok, explicitly pass length + \endcode + + \see StringRef, GenericValue::SetString +*/ +template<typename CharType> +struct GenericStringRef { + typedef CharType Ch; //!< character type of the string + + //! Create string reference from \c const character array +#ifndef __clang__ // -Wdocumentation + /*! + This constructor implicitly creates a constant string reference from + a \c const character array. 
It has better performance than + \ref StringRef(const CharType*) by inferring the string \ref length + from the array length, and also supports strings containing null + characters. + + \tparam N length of the string, automatically inferred + + \param str Constant character array, lifetime assumed to be longer + than the use of the string in e.g. a GenericValue + + \post \ref s == str + + \note Constant complexity. + \note There is a hidden, private overload to disallow references to + non-const character arrays to be created via this constructor. + By this, e.g. function-scope arrays used to be filled via + \c snprintf are excluded from consideration. + In such cases, the referenced string should be \b copied to the + GenericValue instead. + */ +#endif + template<SizeType N> + GenericStringRef(const CharType (&str)[N]) RAPIDJSON_NOEXCEPT + : s(str), length(N-1) {} + + //! Explicitly create string reference from \c const character pointer +#ifndef __clang__ // -Wdocumentation + /*! + This constructor can be used to \b explicitly create a reference to + a constant string pointer. + + \see StringRef(const CharType*) + + \param str Constant character pointer, lifetime assumed to be longer + than the use of the string in e.g. a GenericValue + + \post \ref s == str + + \note There is a hidden, private overload to disallow references to + non-const character arrays to be created via this constructor. + By this, e.g. function-scope arrays used to be filled via + \c snprintf are excluded from consideration. + In such cases, the referenced string should be \b copied to the + GenericValue instead. + */ +#endif + explicit GenericStringRef(const CharType* str) + : s(str), length(internal::StrLen(str)){ RAPIDJSON_ASSERT(s != 0); } + + //! Create constant string reference from pointer and length +#ifndef __clang__ // -Wdocumentation + /*! \param str constant string, lifetime assumed to be longer than the use of the string in e.g. 
a GenericValue + \param len length of the string, excluding the trailing NULL terminator + + \post \ref s == str && \ref length == len + \note Constant complexity. + */ +#endif + GenericStringRef(const CharType* str, SizeType len) + : s(str), length(len) { RAPIDJSON_ASSERT(s != 0); } + + GenericStringRef(const GenericStringRef& rhs) : s(rhs.s), length(rhs.length) {} + + //! implicit conversion to plain CharType pointer + operator const Ch *() const { return s; } + + const Ch* const s; //!< plain CharType pointer + const SizeType length; //!< length of the string (excluding the trailing NULL terminator) + +private: + //! Disallow construction from non-const array + template<SizeType N> + GenericStringRef(CharType (&str)[N]) /* = delete */; + //! Copy assignment operator not permitted - immutable type + GenericStringRef& operator=(const GenericStringRef& rhs) /* = delete */; +}; + +//! Mark a character pointer as constant string +/*! Mark a plain character pointer as a "string literal". This function + can be used to avoid copying a character string to be referenced as a + value in a JSON GenericValue object, if the string's lifetime is known + to be valid long enough. + \tparam CharType Character type of the string + \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue + \return GenericStringRef string reference object + \relatesalso GenericStringRef + + \see GenericValue::GenericValue(StringRefType), GenericValue::operator=(StringRefType), GenericValue::SetString(StringRefType), GenericValue::PushBack(StringRefType, Allocator&), GenericValue::AddMember +*/ +template<typename CharType> +inline GenericStringRef<CharType> StringRef(const CharType* str) { + return GenericStringRef<CharType>(str, internal::StrLen(str)); +} + +//! Mark a character pointer as constant string +/*! Mark a plain character pointer as a "string literal". 
This function + can be used to avoid copying a character string to be referenced as a + value in a JSON GenericValue object, if the string's lifetime is known + to be valid long enough. + + This version has better performance with supplied length, and also + supports string containing null characters. + + \tparam CharType character type of the string + \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue + \param length The length of source string. + \return GenericStringRef string reference object + \relatesalso GenericStringRef +*/ +template<typename CharType> +inline GenericStringRef<CharType> StringRef(const CharType* str, size_t length) { + return GenericStringRef<CharType>(str, SizeType(length)); +} + +#if RAPIDJSON_HAS_STDSTRING +//! Mark a string object as constant string +/*! Mark a string object (e.g. \c std::string) as a "string literal". + This function can be used to avoid copying a string to be referenced as a + value in a JSON GenericValue object, if the string's lifetime is known + to be valid long enough. + + \tparam CharType character type of the string + \param str Constant string, lifetime assumed to be longer than the use of the string in e.g. a GenericValue + \return GenericStringRef string reference object + \relatesalso GenericStringRef + \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. 
*/
template<typename CharType>
inline GenericStringRef<CharType> StringRef(const std::basic_string<CharType>& str) {
    // References the string's own buffer (data()/size()); nothing is copied.
    return GenericStringRef<CharType>(str.data(), SizeType(str.size()));
}
#endif

///////////////////////////////////////////////////////////////////////////////
// GenericValue type traits
namespace internal {

// Primary template: by default T is not treated as a GenericValue.
template <typename T, typename Encoding = void, typename Allocator = void>
struct IsGenericValueImpl : FalseType {};

// select candidates according to nested encoding and allocator types
// (only considered when T declares both EncodingType and AllocatorType;
// the result is then whether T derives from the matching GenericValue)
template <typename T> struct IsGenericValueImpl<T, typename Void<typename T::EncodingType>::Type, typename Void<typename T::AllocatorType>::Type>
    : IsBaseOf<GenericValue<typename T::EncodingType, typename T::AllocatorType>, T>::Type {};

// helper to match arbitrary GenericValue instantiations, including derived classes
template <typename T> struct IsGenericValue : IsGenericValueImpl<T>::Type {};

} // namespace internal

///////////////////////////////////////////////////////////////////////////////
// TypeHelper

namespace internal {

// TypeHelper<ValueType, T> maps a C++ type T onto the corresponding
// IsXxx()/GetXxx()/SetXxx() members of ValueType.  Each specialization offers
// static Is() and Get(), and (where present) Set() with and without an
// allocator argument.  The primary template is intentionally empty.
template <typename ValueType, typename T>
struct TypeHelper {};

// bool -> IsBool / GetBool / SetBool (allocator argument ignored)
template<typename ValueType>
struct TypeHelper<ValueType, bool> {
    static bool Is(const ValueType& v) { return v.IsBool(); }
    static bool Get(const ValueType& v) { return v.GetBool(); }
    static ValueType& Set(ValueType& v, bool data) { return v.SetBool(data); }
    static ValueType& Set(ValueType& v, bool data, typename ValueType::AllocatorType&) { return v.SetBool(data); }
};

// int -> IsInt / GetInt / SetInt (allocator argument ignored)
template<typename ValueType>
struct TypeHelper<ValueType, int> {
    static bool Is(const ValueType& v) { return v.IsInt(); }
    static int Get(const ValueType& v) { return v.GetInt(); }
    static ValueType& Set(ValueType& v, int data) { return v.SetInt(data); }
    static ValueType& Set(ValueType& v, int data, typename ValueType::AllocatorType&) { return v.SetInt(data); }
};

// unsigned -> IsUint / GetUint / SetUint (allocator argument ignored)
template<typename ValueType>
struct TypeHelper<ValueType, unsigned> {
    static bool Is(const ValueType& v) { return v.IsUint(); }
    static unsigned Get(const ValueType& v) { return v.GetUint(); }
    static ValueType& Set(ValueType& v, unsigned data) { return v.SetUint(data); }
    static ValueType& Set(ValueType& v, unsigned data, typename ValueType::AllocatorType&) { return v.SetUint(data); }
};

// int64_t -> IsInt64 / GetInt64 / SetInt64 (allocator argument ignored)
template<typename ValueType>
struct TypeHelper<ValueType, int64_t> {
    static bool Is(const ValueType& v) { return v.IsInt64(); }
    static int64_t Get(const ValueType& v) { return v.GetInt64(); }
    static ValueType& Set(ValueType& v, int64_t data) { return v.SetInt64(data); }
    static ValueType& Set(ValueType& v, int64_t data, typename ValueType::AllocatorType&) { return v.SetInt64(data); }
};

// uint64_t -> IsUint64 / GetUint64 / SetUint64 (allocator argument ignored)
template<typename ValueType>
struct TypeHelper<ValueType, uint64_t> {
    static bool Is(const ValueType& v) { return v.IsUint64(); }
    static uint64_t Get(const ValueType& v) { return v.GetUint64(); }
    static ValueType& Set(ValueType& v, uint64_t data) { return v.SetUint64(data); }
    static ValueType& Set(ValueType& v, uint64_t data, typename ValueType::AllocatorType&) { return v.SetUint64(data); }
};

// double -> IsDouble / GetDouble / SetDouble (allocator argument ignored)
template<typename ValueType>
struct TypeHelper<ValueType, double> {
    static bool Is(const ValueType& v) { return v.IsDouble(); }
    static double Get(const ValueType& v) { return v.GetDouble(); }
    static ValueType& Set(ValueType& v, double data) { return v.SetDouble(data); }
    static ValueType& Set(ValueType& v, double data, typename ValueType::AllocatorType&) { return v.SetDouble(data); }
};

// float -> IsFloat / GetFloat / SetFloat (allocator argument ignored)
template<typename ValueType>
struct TypeHelper<ValueType, float> {
    static bool Is(const ValueType& v) { return v.IsFloat(); }
    static float Get(const ValueType& v) { return v.GetFloat(); }
    static ValueType& Set(ValueType& v, float data) { return v.SetFloat(data); }
    static ValueType& Set(ValueType& v, float data, typename ValueType::AllocatorType&) { return v.SetFloat(data); }
};

// const Ch* -> IsString / GetString / SetString.
// Without an allocator the pointer is wrapped in a StringRefType and passed
// to SetString(); with an allocator the pointer and allocator are forwarded
// to SetString(data, a).
template<typename ValueType>
struct TypeHelper<ValueType, const typename ValueType::Ch*> {
    typedef const typename ValueType::Ch* StringType;
    static bool Is(const ValueType& v) { return v.IsString(); }
    static StringType Get(const ValueType& v) { return v.GetString(); }
    static ValueType& Set(ValueType& v, const StringType data) { return v.SetString(typename ValueType::StringRefType(data)); }
    static ValueType& Set(ValueType& v, const StringType data, typename ValueType::AllocatorType& a) { return v.SetString(data, a); }
};

#if RAPIDJSON_HAS_STDSTRING
// std::basic_string<Ch> -> IsString / GetString+GetStringLength / SetString.
// Get() builds a new string object from pointer and length; only the
// allocator form of Set() is provided.
template<typename ValueType>
struct TypeHelper<ValueType, std::basic_string<typename ValueType::Ch> > {
    typedef std::basic_string<typename ValueType::Ch> StringType;
    static bool Is(const ValueType& v) { return v.IsString(); }
    static StringType Get(const ValueType& v) { return StringType(v.GetString(), v.GetStringLength()); }
    static ValueType& Set(ValueType& v, const StringType& data, typename ValueType::AllocatorType& a) { return v.SetString(data, a); }
};
#endif

// Array -> IsArray / GetArray; Set() assigns the array handle to the value.
// Get() takes a non-const value.
template<typename ValueType>
struct TypeHelper<ValueType, typename ValueType::Array> {
    typedef typename ValueType::Array ArrayType;
    static bool Is(const ValueType& v) { return v.IsArray(); }
    static ArrayType Get(ValueType& v) { return v.GetArray(); }
    static ValueType& Set(ValueType& v, ArrayType data) { return v = data; }
    static ValueType& Set(ValueType& v, ArrayType data, typename ValueType::AllocatorType&) { return v = data; }
};

// ConstArray -> IsArray / GetArray on a const value; no Set() is provided.
template<typename ValueType>
struct TypeHelper<ValueType, typename ValueType::ConstArray> {
    typedef typename ValueType::ConstArray ArrayType;
    static bool Is(const ValueType& v) { return v.IsArray(); }
    static ArrayType Get(const ValueType& v) { return v.GetArray(); }
};

// Object -> IsObject / GetObject; Set() assigns the object handle to the value.
// Get() takes a non-const value.
template<typename ValueType>
struct TypeHelper<ValueType, typename ValueType::Object> {
    typedef typename ValueType::Object ObjectType;
    static bool Is(const ValueType& v) { return v.IsObject(); }
    static ObjectType Get(ValueType& v) { return v.GetObject(); }
    static
ValueType& Set(ValueType& v, ObjectType data) { return v = data; } + static ValueType& Set(ValueType& v, ObjectType data, typename ValueType::AllocatorType&) { v = data; } +}; + +template<typename ValueType> +struct TypeHelper<ValueType, typename ValueType::ConstObject> { + typedef typename ValueType::ConstObject ObjectType; + static bool Is(const ValueType& v) { return v.IsObject(); } + static ObjectType Get(const ValueType& v) { return v.GetObject(); } +}; + +} // namespace internal + +// Forward declarations +template <bool, typename> class GenericArray; +template <bool, typename> class GenericObject; + +/////////////////////////////////////////////////////////////////////////////// +// GenericValue + +//! Represents a JSON value. Use Value for UTF8 encoding and default allocator. +/*! + A JSON value can be one of 7 types. This class is a variant type supporting + these types. + + Use the Value if UTF8 and default allocator + + \tparam Encoding Encoding of the value. (Even non-string values need to have the same encoding in a document) + \tparam Allocator Allocator type for allocating memory of object, array and string. +*/ +template <typename Encoding, typename Allocator = MemoryPoolAllocator<> > +class GenericValue { +public: + //! Name-value pair in an object. + typedef GenericMember<Encoding, Allocator> Member; + typedef Encoding EncodingType; //!< Encoding type from template parameter. + typedef Allocator AllocatorType; //!< Allocator type from template parameter. + typedef typename Encoding::Ch Ch; //!< Character type derived from Encoding. + typedef GenericStringRef<Ch> StringRefType; //!< Reference to a constant string + typedef typename GenericMemberIterator<false,Encoding,Allocator>::Iterator MemberIterator; //!< Member iterator for iterating in object. + typedef typename GenericMemberIterator<true,Encoding,Allocator>::Iterator ConstMemberIterator; //!< Constant member iterator for iterating in object. 
// Array iterators are plain pointers to GenericValue.
    typedef GenericValue* ValueIterator;            //!< Value iterator for iterating in array.
    typedef const GenericValue* ConstValueIterator; //!< Constant value iterator for iterating in array.
    typedef GenericValue<Encoding, Allocator> ValueType;    //!< Value type of itself.
    typedef GenericArray<false, ValueType> Array;           //!< Mutable array handle type.
    typedef GenericArray<true, ValueType> ConstArray;       //!< Read-only array handle type.
    typedef GenericObject<false, ValueType> Object;         //!< Mutable object handle type.
    typedef GenericObject<true, ValueType> ConstObject;     //!< Read-only object handle type.

    //!@name Constructors and destructor.
    //@{

    //! Default constructor creates a null value.
    GenericValue() RAPIDJSON_NOEXCEPT : data_() { data_.f.flags = kNullFlag; }

#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
    //! Move constructor in C++11
    /*! Copies the raw data of \c rhs and then marks \c rhs as null, so only
        this value refers to the moved contents afterwards.
     */
    GenericValue(GenericValue&& rhs) RAPIDJSON_NOEXCEPT : data_(rhs.data_) {
        rhs.data_.f.flags = kNullFlag; // give up contents
    }
#endif

private:
    //! Copy constructor is not permitted.
    /*! Declared private and not defined in this section. */
    GenericValue(const GenericValue& rhs);

#if RAPIDJSON_HAS_CXX11_RVALUE_REFS
    //! Moving from a GenericDocument is not permitted.
    template <typename StackAllocator>
    GenericValue(GenericDocument<Encoding,Allocator,StackAllocator>&& rhs);

    //! Move assignment from a GenericDocument is not permitted.
    template <typename StackAllocator>
    GenericValue& operator=(GenericDocument<Encoding,Allocator,StackAllocator>&& rhs);
#endif

public:

    //! Constructor with JSON value type.
    /*! This creates a Value of specified type with default content.
        \param type Type of the value.
        \note Default content for number is zero.
    */
    explicit GenericValue(Type type) RAPIDJSON_NOEXCEPT : data_() {
        // Lookup table indexed by the Type enumerator; it has one entry per
        // value up to and including kNumberType (checked by the assert below).
        static const uint16_t defaultFlags[7] = {
            kNullFlag, kFalseFlag, kTrueFlag, kObjectFlag, kArrayFlag, kShortStringFlag,
            kNumberAnyFlag
        };
        RAPIDJSON_ASSERT(type <= kNumberType);
        data_.f.flags = defaultFlags[type];

        // Use ShortString to store empty string.
        if (type == kStringType)
            data_.ss.SetLength(0);
    }

    //!
//! Explicit copy constructor (with allocator)
    /*! Creates a copy of a Value by using the given Allocator
        \tparam SourceAllocator allocator of \c rhs
        \param rhs Value to copy from (read-only)
        \param allocator Allocator for allocating copied elements and buffers. Commonly use GenericDocument::GetAllocator().
        \see CopyFrom()
    */
    template <typename SourceAllocator>
    GenericValue(const GenericValue<Encoding,SourceAllocator>& rhs, Allocator& allocator) {
        switch (rhs.GetType()) {
        case kObjectType: {
                // Allocate room for exactly `count` members, then copy-construct
                // each name and value in place (recursively, via this constructor).
                SizeType count = rhs.data_.o.size;
                Member* lm = reinterpret_cast<Member*>(allocator.Malloc(count * sizeof(Member)));
                const typename GenericValue<Encoding,SourceAllocator>::Member* rm = rhs.GetMembersPointer();
                for (SizeType i = 0; i < count; i++) {
                    new (&lm[i].name) GenericValue(rm[i].name, allocator);
                    new (&lm[i].value) GenericValue(rm[i].value, allocator);
                }
                data_.f.flags = kObjectFlag;
                data_.o.size = data_.o.capacity = count;   // capacity is trimmed to the size
                SetMembersPointer(lm);
            }
            break;
        case kArrayType: {
                // Same scheme as for objects: one allocation, then in-place
                // recursive copies of every element.
                SizeType count = rhs.data_.a.size;
                GenericValue* le = reinterpret_cast<GenericValue*>(allocator.Malloc(count * sizeof(GenericValue)));
                const GenericValue<Encoding,SourceAllocator>* re = rhs.GetElementsPointer();
                for (SizeType i = 0; i < count; i++)
                    new (&le[i]) GenericValue(re[i], allocator);
                data_.f.flags = kArrayFlag;
                data_.a.size = data_.a.capacity = count;   // capacity is trimmed to the size
                SetElementsPointer(le);
            }
            break;
        case kStringType:
            if (rhs.data_.f.flags == kConstStringFlag) {
                // Constant-string values are copied as raw data; the characters
                // themselves are not duplicated.
                data_.f.flags = rhs.data_.f.flags;
                data_  = *reinterpret_cast<const Data*>(&rhs.data_);
            }
            else
                // Any other string flavour is duplicated through the allocator.
                SetStringRaw(StringRef(rhs.GetString(), rhs.GetStringLength()), allocator);
            break;
        default:
            // Remaining types are copied as raw data.
            data_.f.flags = rhs.data_.f.flags;
            data_  = *reinterpret_cast<const Data*>(&rhs.data_);
            break;
        }
    }

    //! Constructor for boolean value.
    /*!
\param b Boolean value + \note This constructor is limited to \em real boolean values and rejects + implicitly converted types like arbitrary pointers. Use an explicit cast + to \c bool, if you want to construct a boolean JSON value in such cases. + */ +#ifndef RAPIDJSON_DOXYGEN_RUNNING // hide SFINAE from Doxygen + template <typename T> + explicit GenericValue(T b, RAPIDJSON_ENABLEIF((internal::IsSame<bool, T>))) RAPIDJSON_NOEXCEPT // See #472 +#else + explicit GenericValue(bool b) RAPIDJSON_NOEXCEPT +#endif + : data_() { + // safe-guard against failing SFINAE + RAPIDJSON_STATIC_ASSERT((internal::IsSame<bool,T>::Value)); + data_.f.flags = b ? kTrueFlag : kFalseFlag; + } + + //! Constructor for int value. + explicit GenericValue(int i) RAPIDJSON_NOEXCEPT : data_() { + data_.n.i64 = i; + data_.f.flags = (i >= 0) ? (kNumberIntFlag | kUintFlag | kUint64Flag) : kNumberIntFlag; + } + + //! Constructor for unsigned value. + explicit GenericValue(unsigned u) RAPIDJSON_NOEXCEPT : data_() { + data_.n.u64 = u; + data_.f.flags = (u & 0x80000000) ? kNumberUintFlag : (kNumberUintFlag | kIntFlag | kInt64Flag); + } + + //! Constructor for int64_t value. + explicit GenericValue(int64_t i64) RAPIDJSON_NOEXCEPT : data_() { + data_.n.i64 = i64; + data_.f.flags = kNumberInt64Flag; + if (i64 >= 0) { + data_.f.flags |= kNumberUint64Flag; + if (!(static_cast<uint64_t>(i64) & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x00000000))) + data_.f.flags |= kUintFlag; + if (!(static_cast<uint64_t>(i64) & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000))) + data_.f.flags |= kIntFlag; + } + else if (i64 >= static_cast<int64_t>(RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000))) + data_.f.flags |= kIntFlag; + } + + //! Constructor for uint64_t value. 
+ explicit GenericValue(uint64_t u64) RAPIDJSON_NOEXCEPT : data_() { + data_.n.u64 = u64; + data_.f.flags = kNumberUint64Flag; + if (!(u64 & RAPIDJSON_UINT64_C2(0x80000000, 0x00000000))) + data_.f.flags |= kInt64Flag; + if (!(u64 & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x00000000))) + data_.f.flags |= kUintFlag; + if (!(u64 & RAPIDJSON_UINT64_C2(0xFFFFFFFF, 0x80000000))) + data_.f.flags |= kIntFlag; + } + + //! Constructor for double value. + explicit GenericValue(double d) RAPIDJSON_NOEXCEPT : data_() { data_.n.d = d; data_.f.flags = kNumberDoubleFlag; } + + //! Constructor for float value. + explicit GenericValue(float f) RAPIDJSON_NOEXCEPT : data_() { data_.n.d = static_cast<double>(f); data_.f.flags = kNumberDoubleFlag; } + + //! Constructor for constant string (i.e. do not make a copy of string) + GenericValue(const Ch* s, SizeType length) RAPIDJSON_NOEXCEPT : data_() { SetStringRaw(StringRef(s, length)); } + + //! Constructor for constant string (i.e. do not make a copy of string) + explicit GenericValue(StringRefType s) RAPIDJSON_NOEXCEPT : data_() { SetStringRaw(s); } + + //! Constructor for copy-string (i.e. do make a copy of string) + GenericValue(const Ch* s, SizeType length, Allocator& allocator) : data_() { SetStringRaw(StringRef(s, length), allocator); } + + //! Constructor for copy-string (i.e. do make a copy of string) + GenericValue(const Ch*s, Allocator& allocator) : data_() { SetStringRaw(StringRef(s), allocator); } + +#if RAPIDJSON_HAS_STDSTRING + //! Constructor for copy-string from a string object (i.e. do make a copy of string) + /*! \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. + */ + GenericValue(const std::basic_string<Ch>& s, Allocator& allocator) : data_() { SetStringRaw(StringRef(s), allocator); } +#endif + + //! Constructor for Array. + /*! + \param a An array obtained by \c GetArray(). + \note \c Array is always pass-by-value. 
+ \note the source array is moved into this value and the sourec array becomes empty. + */ + GenericValue(Array a) RAPIDJSON_NOEXCEPT : data_(a.value_.data_) { + a.value_.data_ = Data(); + a.value_.data_.f.flags = kArrayFlag; + } + + //! Constructor for Object. + /*! + \param o An object obtained by \c GetObject(). + \note \c Object is always pass-by-value. + \note the source object is moved into this value and the sourec object becomes empty. + */ + GenericValue(Object o) RAPIDJSON_NOEXCEPT : data_(o.value_.data_) { + o.value_.data_ = Data(); + o.value_.data_.f.flags = kObjectFlag; + } + + //! Destructor. + /*! Need to destruct elements of array, members of object, or copy-string. + */ + ~GenericValue() { + if (Allocator::kNeedFree) { // Shortcut by Allocator's trait + switch(data_.f.flags) { + case kArrayFlag: + { + GenericValue* e = GetElementsPointer(); + for (GenericValue* v = e; v != e + data_.a.size; ++v) + v->~GenericValue(); + Allocator::Free(e); + } + break; + + case kObjectFlag: + for (MemberIterator m = MemberBegin(); m != MemberEnd(); ++m) + m->~Member(); + Allocator::Free(GetMembersPointer()); + break; + + case kCopyStringFlag: + Allocator::Free(const_cast<Ch*>(GetStringPointer())); + break; + + default: + break; // Do nothing for other types. + } + } + } + + //@} + + //!@name Assignment operators + //@{ + + //! Assignment with move semantics. + /*! \param rhs Source of the assignment. It will become a null value after assignment. + */ + GenericValue& operator=(GenericValue& rhs) RAPIDJSON_NOEXCEPT { + RAPIDJSON_ASSERT(this != &rhs); + this->~GenericValue(); + RawAssign(rhs); + return *this; + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! Move assignment in C++11 + GenericValue& operator=(GenericValue&& rhs) RAPIDJSON_NOEXCEPT { + return *this = rhs.Move(); + } +#endif + + //! Assignment of constant string reference (no copy) + /*! 
\param str Constant string reference to be assigned + \note This overload is needed to avoid clashes with the generic primitive type assignment overload below. + \see GenericStringRef, operator=(T) + */ + GenericValue& operator=(StringRefType str) RAPIDJSON_NOEXCEPT { + GenericValue s(str); + return *this = s; + } + + //! Assignment with primitive types. + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t + \param value The value to be assigned. + + \note The source type \c T explicitly disallows all pointer types, + especially (\c const) \ref Ch*. This helps avoiding implicitly + referencing character strings with insufficient lifetime, use + \ref SetString(const Ch*, Allocator&) (for copying) or + \ref StringRef() (to explicitly mark the pointer as constant) instead. + All other pointer types would implicitly convert to \c bool, + use \ref SetBool() instead. + */ + template <typename T> + RAPIDJSON_DISABLEIF_RETURN((internal::IsPointer<T>), (GenericValue&)) + operator=(T value) { + GenericValue v(value); + return *this = v; + } + + //! Deep-copy assignment from Value + /*! Assigns a \b copy of the Value to the current Value object + \tparam SourceAllocator Allocator type of \c rhs + \param rhs Value to copy from (read-only) + \param allocator Allocator to use for copying + */ + template <typename SourceAllocator> + GenericValue& CopyFrom(const GenericValue<Encoding, SourceAllocator>& rhs, Allocator& allocator) { + RAPIDJSON_ASSERT(static_cast<void*>(this) != static_cast<void const*>(&rhs)); + this->~GenericValue(); + new (this) GenericValue(rhs, allocator); + return *this; + } + + //! Exchange the contents of this value with those of other. + /*! + \param other Another value. + \note Constant complexity. + */ + GenericValue& Swap(GenericValue& other) RAPIDJSON_NOEXCEPT { + GenericValue temp; + temp.RawAssign(*this); + RawAssign(other); + other.RawAssign(temp); + return *this; + } + + //! free-standing swap function helper + /*! 
        Helper function to enable support for common swap implementation pattern based on \c std::swap:
        \code
        void swap(MyClass& a, MyClass& b) {
            using std::swap;
            swap(a.value, b.value);
            // ...
        }
        \endcode
        \see Swap()
     */
    friend inline void swap(GenericValue& a, GenericValue& b) RAPIDJSON_NOEXCEPT { a.Swap(b); }

    //! Prepare Value for move semantics
    /*! \return *this */
    GenericValue& Move() RAPIDJSON_NOEXCEPT { return *this; }
    //@}

    //!@name Equal-to and not-equal-to operators
    //@{
    //! Equal-to operator
    /*!
        \tparam SourceAllocator Allocator type of \c rhs (may differ from this value's allocator)
        \param rhs Value to compare with
        \return \c true if both values have the same type and equal contents
        \note If an object contains duplicated named member, comparing equality with any object is always \c false.
        \note Linear time complexity (number of all values in the subtree and total lengths of all strings).
    */
    template <typename SourceAllocator>
    bool operator==(const GenericValue<Encoding, SourceAllocator>& rhs) const {
        typedef GenericValue<Encoding, SourceAllocator> RhsType;
        // Values of different JSON types are never equal.
        if (GetType() != rhs.GetType())
            return false;

        switch (GetType()) {
        case kObjectType: // Warning: O(n^2) inner-loop
            if (data_.o.size != rhs.data_.o.size)
                return false;
            // Every member on the left must be found by name on the right
            // and compare equal by value.
            for (ConstMemberIterator lhsMemberItr = MemberBegin(); lhsMemberItr != MemberEnd(); ++lhsMemberItr) {
                typename RhsType::ConstMemberIterator rhsMemberItr = rhs.FindMember(lhsMemberItr->name);
                if (rhsMemberItr == rhs.MemberEnd() || lhsMemberItr->value != rhsMemberItr->value)
                    return false;
            }
            return true;

        case kArrayType:
            if (data_.a.size != rhs.data_.a.size)
                return false;
            // Element-wise comparison in order.
            for (SizeType i = 0; i < data_.a.size; i++)
                if ((*this)[i] != rhs[i])
                    return false;
            return true;

        case kStringType:
            return StringEqual(rhs);

        case kNumberType:
            // If either side is a double, compare as doubles; otherwise
            // compare the raw 64-bit integer storage.
            if (IsDouble() || rhs.IsDouble()) {
                double a = GetDouble();     // May convert from integer to double.
                double b = rhs.GetDouble(); // Ditto
                return a >= b && a <= b;    // Prevent -Wfloat-equal
            }
            else
                return data_.n.u64 == rhs.data_.n.u64;

        default:
            // Remaining types carry no payload beyond the type already compared.
            return true;
        }
    }

    //!
//! Equal-to operator with const C-string pointer
    /*! Wraps \c rhs in a temporary constant-string value (no copy of the
        characters) and compares against that.
     */
    bool operator==(const Ch* rhs) const { return *this == GenericValue(StringRef(rhs)); }

#if RAPIDJSON_HAS_STDSTRING
    //! Equal-to operator with string object
    /*! \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING.
     */
    bool operator==(const std::basic_string<Ch>& rhs) const { return *this == GenericValue(StringRef(rhs)); }
#endif

    //! Equal-to operator with primitive types
    /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t, \c double, \c true, \c false
        \note Disabled for pointer types and for GenericValue types, which are
            handled by the dedicated overloads.
    */
    template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>,internal::IsGenericValue<T> >), (bool)) operator==(const T& rhs) const { return *this == GenericValue(rhs); }

    //! Not-equal-to operator
    /*! \return !(*this == rhs)
     */
    template <typename SourceAllocator>
    bool operator!=(const GenericValue<Encoding, SourceAllocator>& rhs) const { return !(*this == rhs); }

    //! Not-equal-to operator with const C-string pointer
    /*! \return !(*this == rhs)
     */
    bool operator!=(const Ch* rhs) const { return !(*this == rhs); }

    //! Not-equal-to operator with arbitrary types
    /*! \return !(*this == rhs)
        \note Disabled for GenericValue types, which use the overload above.
     */
    template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue<T>), (bool)) operator!=(const T& rhs) const { return !(*this == rhs); }

    //! Equal-to operator with arbitrary types (symmetric version)
    /*! \return (rhs == lhs)
     */
    template <typename T> friend RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue<T>), (bool)) operator==(const T& lhs, const GenericValue& rhs) { return rhs == lhs; }

    //! Not-Equal-to operator with arbitrary types (symmetric version)
    /*!
\return !(rhs == lhs) + */ + template <typename T> friend RAPIDJSON_DISABLEIF_RETURN((internal::IsGenericValue<T>), (bool)) operator!=(const T& lhs, const GenericValue& rhs) { return !(rhs == lhs); } + //@} + + //!@name Type + //@{ + + Type GetType() const { return static_cast<Type>(data_.f.flags & kTypeMask); } + bool IsNull() const { return data_.f.flags == kNullFlag; } + bool IsFalse() const { return data_.f.flags == kFalseFlag; } + bool IsTrue() const { return data_.f.flags == kTrueFlag; } + bool IsBool() const { return (data_.f.flags & kBoolFlag) != 0; } + bool IsObject() const { return data_.f.flags == kObjectFlag; } + bool IsArray() const { return data_.f.flags == kArrayFlag; } + bool IsNumber() const { return (data_.f.flags & kNumberFlag) != 0; } + bool IsInt() const { return (data_.f.flags & kIntFlag) != 0; } + bool IsUint() const { return (data_.f.flags & kUintFlag) != 0; } + bool IsInt64() const { return (data_.f.flags & kInt64Flag) != 0; } + bool IsUint64() const { return (data_.f.flags & kUint64Flag) != 0; } + bool IsDouble() const { return (data_.f.flags & kDoubleFlag) != 0; } + bool IsString() const { return (data_.f.flags & kStringFlag) != 0; } + + // Checks whether a number can be losslessly converted to a double. + bool IsLosslessDouble() const { + if (!IsNumber()) return false; + if (IsUint64()) { + uint64_t u = GetUint64(); + volatile double d = static_cast<double>(u); + return (d >= 0.0) + && (d < static_cast<double>(std::numeric_limits<uint64_t>::max())) + && (u == static_cast<uint64_t>(d)); + } + if (IsInt64()) { + int64_t i = GetInt64(); + volatile double d = static_cast<double>(i); + return (d >= static_cast<double>(std::numeric_limits<int64_t>::min())) + && (d < static_cast<double>(std::numeric_limits<int64_t>::max())) + && (i == static_cast<int64_t>(d)); + } + return true; // double, int, uint are always lossless + } + + // Checks whether a number is a float (possible lossy). 
+ bool IsFloat() const { + if ((data_.f.flags & kDoubleFlag) == 0) + return false; + double d = GetDouble(); + return d >= -3.4028234e38 && d <= 3.4028234e38; + } + // Checks whether a number can be losslessly converted to a float. + // Returns false for non-numbers and for values outside the finite float range. + bool IsLosslessFloat() const { + if (!IsNumber()) return false; + double a = GetDouble(); + if (a < static_cast<double>(-std::numeric_limits<float>::max()) + || a > static_cast<double>(std::numeric_limits<float>::max())) + return false; + double b = static_cast<double>(static_cast<float>(a)); + return a >= b && a <= b; // Equality test written as two comparisons to prevent -Wfloat-equal + } + + //@} + + //!@name Null + //@{ + + GenericValue& SetNull() { this->~GenericValue(); new (this) GenericValue(); return *this; } + + //@} + + //!@name Bool + //@{ + + //! Get boolean value + /*! \pre IsBool() == true */ + bool GetBool() const { RAPIDJSON_ASSERT(IsBool()); return data_.f.flags == kTrueFlag; } + //! Set boolean value + /*! \post IsBool() == true */ + GenericValue& SetBool(bool b) { this->~GenericValue(); new (this) GenericValue(b); return *this; } + + //@} + + //!@name Object + //@{ + + //! Set this value as an empty object. + /*! \post IsObject() == true */ + GenericValue& SetObject() { this->~GenericValue(); new (this) GenericValue(kObjectType); return *this; } + + //! Get the number of members in the object. + /*! \pre IsObject() == true */ + SizeType MemberCount() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.size; } + + //! Check whether the object is empty. + /*! \pre IsObject() == true */ + bool ObjectEmpty() const { RAPIDJSON_ASSERT(IsObject()); return data_.o.size == 0; } + + //! Get a value from an object associated with the name. + /*! \pre IsObject() == true + \tparam T Either \c Ch or \c const \c Ch (template used for disambiguation with \ref operator[](SizeType)) + \note In version 0.1x, if the member is not found, this function returns a null value. This makes issue 7. + Since 0.2, if the name is not correct, it will assert. + If user is unsure whether a member exists, user should use HasMember() first. + A better approach is to use FindMember(). 
+ \note Linear time complexity. + */ + template <typename T> + RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >),(GenericValue&)) operator[](T* name) { + GenericValue n(StringRef(name)); + return (*this)[n]; + } + template <typename T> + RAPIDJSON_DISABLEIF_RETURN((internal::NotExpr<internal::IsSame<typename internal::RemoveConst<T>::Type, Ch> >),(const GenericValue&)) operator[](T* name) const { return const_cast<GenericValue&>(*this)[name]; } + + //! Get a value from an object associated with the name. + /*! \pre IsObject() == true + \tparam SourceAllocator Allocator of the \c name value + + \note Compared to \ref operator[](T*), this version is faster because it does not need a StrLen(). + And it can also handle strings with embedded null characters. + + \note Linear time complexity. + */ + template <typename SourceAllocator> + GenericValue& operator[](const GenericValue<Encoding, SourceAllocator>& name) { + MemberIterator member = FindMember(name); + if (member != MemberEnd()) + return member->value; + else { + RAPIDJSON_ASSERT(false); // member not found: see the note on operator[](T*) + + // This will generate -Wexit-time-destructors in clang + // static GenericValue NullValue; + // return NullValue; + + // Use static buffer and placement-new to prevent destruction + // NOTE(review): a plain char array carries no alignment guarantee for GenericValue -- confirm this is acceptable. + static char buffer[sizeof(GenericValue)]; + return *new (buffer) GenericValue(); + } + } + template <typename SourceAllocator> + const GenericValue& operator[](const GenericValue<Encoding, SourceAllocator>& name) const { return const_cast<GenericValue&>(*this)[name]; } + +#if RAPIDJSON_HAS_STDSTRING + //! Get a value from an object associated with name (string object). + /*! \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. + */ + GenericValue& operator[](const std::basic_string<Ch>& name) { return (*this)[GenericValue(StringRef(name))]; } + const GenericValue& operator[](const std::basic_string<Ch>& name) const { return (*this)[GenericValue(StringRef(name))]; } +#endif + + //! Const member iterator + /*! 
\pre IsObject() == true */ + ConstMemberIterator MemberBegin() const { RAPIDJSON_ASSERT(IsObject()); return ConstMemberIterator(GetMembersPointer()); } + //! Const \em past-the-end member iterator + /*! \pre IsObject() == true */ + ConstMemberIterator MemberEnd() const { RAPIDJSON_ASSERT(IsObject()); return ConstMemberIterator(GetMembersPointer() + data_.o.size); } + //! Member iterator + /*! \pre IsObject() == true */ + MemberIterator MemberBegin() { RAPIDJSON_ASSERT(IsObject()); return MemberIterator(GetMembersPointer()); } + //! \em Past-the-end member iterator + /*! \pre IsObject() == true */ + MemberIterator MemberEnd() { RAPIDJSON_ASSERT(IsObject()); return MemberIterator(GetMembersPointer() + data_.o.size); } + + //! Check whether a member exists in the object. + /*! + \param name Member name to be searched. + \pre IsObject() == true + \return Whether a member with that name exists. + \note It is better to use FindMember() directly if you need to obtain the value as well. + \note Linear time complexity. + */ + bool HasMember(const Ch* name) const { return FindMember(name) != MemberEnd(); } + +#if RAPIDJSON_HAS_STDSTRING + //! Check whether a member exists in the object with string object. + /*! + \param name Member name to be searched. + \pre IsObject() == true + \return Whether a member with that name exists. + \note It is better to use FindMember() directly if you need to obtain the value as well. + \note Linear time complexity. + */ + bool HasMember(const std::basic_string<Ch>& name) const { return FindMember(name) != MemberEnd(); } +#endif + + //! Check whether a member exists in the object with GenericValue name. + /*! + This version is faster because it does not need a StrLen(). It can also handle string with null character. + \param name Member name to be searched. + \pre IsObject() == true + \return Whether a member with that name exists. + \note It is better to use FindMember() directly if you need to obtain the value as well. 
+ \note Linear time complexity. + */ + template <typename SourceAllocator> + bool HasMember(const GenericValue<Encoding, SourceAllocator>& name) const { return FindMember(name) != MemberEnd(); } + + //! Find member by name. + /*! + \param name Member name to be searched. + \pre IsObject() == true + \return Iterator to member, if it exists. + Otherwise returns \ref MemberEnd(). + + \note Earlier versions of Rapidjson returned a \c NULL pointer, in case + the requested member doesn't exist. For consistency with e.g. + \c std::map, this has been changed to MemberEnd() now. + \note Linear time complexity. + */ + MemberIterator FindMember(const Ch* name) { + GenericValue n(StringRef(name)); + return FindMember(n); + } + + //! Const overload; delegates to the non-const FindMember(const Ch*). + ConstMemberIterator FindMember(const Ch* name) const { return const_cast<GenericValue&>(*this).FindMember(name); } + + //! Find member by name (GenericValue name). + /*! + This version is faster because it does not need a StrLen(). It can also handle string with null character. + \param name Member name to be searched. + \pre IsObject() == true && name.IsString() == true + \return Iterator to member, if it exists. + Otherwise returns \ref MemberEnd(). + + \note Earlier versions of Rapidjson returned a \c NULL pointer, in case + the requested member doesn't exist. For consistency with e.g. + \c std::map, this has been changed to MemberEnd() now. + \note Linear time complexity. + */ + template <typename SourceAllocator> + MemberIterator FindMember(const GenericValue<Encoding, SourceAllocator>& name) { + RAPIDJSON_ASSERT(IsObject()); + RAPIDJSON_ASSERT(name.IsString()); + MemberIterator member = MemberBegin(); + for ( ; member != MemberEnd(); ++member) + if (name.StringEqual(member->name)) + break; + return member; + } + //! Const overload; delegates to the non-const templated FindMember(). + template <typename SourceAllocator> ConstMemberIterator FindMember(const GenericValue<Encoding, SourceAllocator>& name) const { return const_cast<GenericValue&>(*this).FindMember(name); } + +#if RAPIDJSON_HAS_STDSTRING + //! Find member by string object name. + /*! 
+ \param name Member name to be searched. + \pre IsObject() == true + \return Iterator to member, if it exists. + Otherwise returns \ref MemberEnd(). + */ + MemberIterator FindMember(const std::basic_string<Ch>& name) { return FindMember(GenericValue(StringRef(name))); } + ConstMemberIterator FindMember(const std::basic_string<Ch>& name) const { return FindMember(GenericValue(StringRef(name))); } +#endif + + //! Add a member (name-value pair) to the object. + /*! \param name A string value as name of member. + \param value Value of any type. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \note The ownership of \c name and \c value will be transferred to this object on success. + \pre IsObject() && name.IsString() + \post name.IsNull() && value.IsNull() + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(GenericValue& name, GenericValue& value, Allocator& allocator) { + RAPIDJSON_ASSERT(IsObject()); + RAPIDJSON_ASSERT(name.IsString()); + + ObjectData& o = data_.o; + // Member buffer is full: allocate it (first insertion) or grow it before appending. + if (o.size >= o.capacity) { + if (o.capacity == 0) { + o.capacity = kDefaultObjectCapacity; + SetMembersPointer(reinterpret_cast<Member*>(allocator.Malloc(o.capacity * sizeof(Member)))); + } + else { + SizeType oldCapacity = o.capacity; + o.capacity += (oldCapacity + 1) / 2; // grow by a factor of about 1.5 (increment rounded up) + SetMembersPointer(reinterpret_cast<Member*>(allocator.Realloc(GetMembersPointer(), oldCapacity * sizeof(Member), o.capacity * sizeof(Member)))); + } + } + // Raw-assign name and value into the next free slot, then publish it by bumping the size. + Member* members = GetMembersPointer(); + members[o.size].name.RawAssign(name); + members[o.size].value.RawAssign(value); + o.size++; + return *this; + } + + //! Add a constant string value as member (name-value pair) to the object. + /*! \param name A string value as name of member. + \param value constant string reference as value of member. + \param allocator Allocator for reallocating memory. 
It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + \note This overload is needed to avoid clashes with the generic primitive type AddMember(GenericValue&,T,Allocator&) overload below. + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(GenericValue& name, StringRefType value, Allocator& allocator) { + GenericValue v(value); + return AddMember(name, v, allocator); + } + +#if RAPIDJSON_HAS_STDSTRING + //! Add a string object as member (name-value pair) to the object. + /*! \param name A string value as name of member. + \param value string object as value of member; it is passed to the GenericValue constructor together with \c allocator. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + \note This overload is needed to avoid clashes with the generic primitive type AddMember(GenericValue&,T,Allocator&) overload below. + \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(GenericValue& name, std::basic_string<Ch>& value, Allocator& allocator) { + GenericValue v(value, allocator); + return AddMember(name, v, allocator); + } +#endif + + //! Add any primitive value as member (name-value pair) to the object. + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t + \param name A string value as name of member. + \param value Value of primitive type \c T as value of member + \param allocator Allocator for reallocating memory. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + + \note The source type \c T explicitly disallows all pointer types, + especially (\c const) \ref Ch*. 
This helps avoiding implicitly + referencing character strings with insufficient lifetime, use + \ref AddMember(StringRefType, GenericValue&, Allocator&) or \ref + AddMember(StringRefType, StringRefType, Allocator&). + All other pointer types would implicitly convert to \c bool, + use an explicit cast instead, if needed. + \note Amortized Constant time complexity. + */ + template <typename T> + RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&)) + AddMember(GenericValue& name, T value, Allocator& allocator) { + GenericValue v(value); + return AddMember(name, v, allocator); + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericValue& AddMember(GenericValue&& name, GenericValue&& value, Allocator& allocator) { + return AddMember(name, value, allocator); + } + GenericValue& AddMember(GenericValue&& name, GenericValue& value, Allocator& allocator) { + return AddMember(name, value, allocator); + } + GenericValue& AddMember(GenericValue& name, GenericValue&& value, Allocator& allocator) { + return AddMember(name, value, allocator); + } + GenericValue& AddMember(StringRefType name, GenericValue&& value, Allocator& allocator) { + GenericValue n(name); + return AddMember(n, value, allocator); + } +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + + + //! Add a member (name-value pair) to the object. + /*! \param name A constant string reference as name of member. + \param value Value of any type. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \note The ownership of \c value will be transferred to this object on success. + \pre IsObject() + \post value.IsNull() + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(StringRefType name, GenericValue& value, Allocator& allocator) { + // Wrap the name reference in a local value, then reuse the (GenericValue&, GenericValue&) overload. + GenericValue n(name); + return AddMember(n, value, allocator); + } + + //! 
Add a constant string value as member (name-value pair) to the object. + /*! \param name A constant string reference as name of member. + \param value constant string reference as value of member. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + \note This overload is needed to avoid clashes with the generic primitive type AddMember(StringRefType,T,Allocator&) overload below. + \note Amortized Constant time complexity. + */ + GenericValue& AddMember(StringRefType name, StringRefType value, Allocator& allocator) { + GenericValue v(value); + return AddMember(name, v, allocator); + } + + //! Add any primitive value as member (name-value pair) to the object. + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t + \param name A constant string reference as name of member. + \param value Value of primitive type \c T as value of member + \param allocator Allocator for reallocating memory. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \pre IsObject() + + \note The source type \c T explicitly disallows all pointer types, + especially (\c const) \ref Ch*. This helps avoiding implicitly + referencing character strings with insufficient lifetime, use + \ref AddMember(StringRefType, GenericValue&, Allocator&) or \ref + AddMember(StringRefType, StringRefType, Allocator&). + All other pointer types would implicitly convert to \c bool, + use an explicit cast instead, if needed. + \note Amortized Constant time complexity. + */ + template <typename T> + RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&)) + AddMember(StringRefType name, T value, Allocator& allocator) { + // Wrap the name reference in a local value, then defer to an AddMember overload taking a GenericValue name. + GenericValue n(name); + return AddMember(n, value, allocator); + } + + //! Remove all members in the object. + /*! 
This function do not deallocate memory in the object, i.e. the capacity is unchanged. + \note Linear time complexity. + */ + void RemoveAllMembers() { + RAPIDJSON_ASSERT(IsObject()); + for (MemberIterator m = MemberBegin(); m != MemberEnd(); ++m) + m->~Member(); + data_.o.size = 0; + } + + //! Remove a member in object by its name. + /*! \param name Name of member to be removed. + \return Whether the member existed. + \note This function may reorder the object members. Use \ref + EraseMember(ConstMemberIterator) if you need to preserve the + relative order of the remaining members. + \note Linear time complexity. + */ + bool RemoveMember(const Ch* name) { + GenericValue n(StringRef(name)); + return RemoveMember(n); + } + +#if RAPIDJSON_HAS_STDSTRING + //! Remove a member in object by its name (string object); forwards to the GenericValue overload. + bool RemoveMember(const std::basic_string<Ch>& name) { return RemoveMember(GenericValue(StringRef(name))); } +#endif + + //! Remove a member in object by its name (GenericValue name). + /*! \return Whether the member existed. + */ + template <typename SourceAllocator> + bool RemoveMember(const GenericValue<Encoding, SourceAllocator>& name) { + MemberIterator m = FindMember(name); + if (m != MemberEnd()) { + RemoveMember(m); + return true; + } + else + return false; + } + + //! Remove a member in object by iterator. + /*! \param m member iterator (obtained by FindMember() or MemberBegin()). + \return the new iterator after removal. + \note This function may reorder the object members. Use \ref + EraseMember(ConstMemberIterator) if you need to preserve the + relative order of the remaining members. + \note Constant time complexity. + */ + MemberIterator RemoveMember(MemberIterator m) { + RAPIDJSON_ASSERT(IsObject()); + RAPIDJSON_ASSERT(data_.o.size > 0); + RAPIDJSON_ASSERT(GetMembersPointer() != 0); + RAPIDJSON_ASSERT(m >= MemberBegin() && m < MemberEnd()); + + MemberIterator last(GetMembersPointer() + (data_.o.size - 1)); + if (data_.o.size > 1 && m != last) + *m = *last; // Move the last one to this place + else + m->~Member(); // m is the last member (possibly the only one): just destroy it + --data_.o.size; + return m; + } + + //! 
Remove a member from an object by iterator. + /*! \param pos iterator to the member to remove + \pre IsObject() == true && \ref MemberBegin() <= \c pos < \ref MemberEnd() + \return Iterator following the removed element. + If the iterator \c pos refers to the last element, the \ref MemberEnd() iterator is returned. + \note This function preserves the relative order of the remaining object + members. If you do not need this, use the more efficient \ref RemoveMember(MemberIterator). + \note Linear time complexity. + */ + MemberIterator EraseMember(ConstMemberIterator pos) { + return EraseMember(pos, pos +1); + } + + //! Remove members in the range [first, last) from an object. + /*! \param first iterator to the first member to remove + \param last iterator following the last member to remove + \pre IsObject() == true && \ref MemberBegin() <= \c first <= \c last <= \ref MemberEnd() + \return Iterator following the last removed element. + \note This function preserves the relative order of the remaining object + members. + \note Linear time complexity. + */ + MemberIterator EraseMember(ConstMemberIterator first, ConstMemberIterator last) { + RAPIDJSON_ASSERT(IsObject()); + RAPIDJSON_ASSERT(data_.o.size > 0); + RAPIDJSON_ASSERT(GetMembersPointer() != 0); + RAPIDJSON_ASSERT(first >= MemberBegin()); + RAPIDJSON_ASSERT(first <= last); + RAPIDJSON_ASSERT(last <= MemberEnd()); + + // Rebuild a mutable iterator at the same offset as the const iterator 'first'. + MemberIterator pos = MemberBegin() + (first - MemberBegin()); + // Destroy the erased members, then shift the trailing members down over the gap. + for (MemberIterator itr = pos; itr != last; ++itr) + itr->~Member(); + std::memmove(&*pos, &*last, static_cast<size_t>(MemberEnd() - last) * sizeof(Member)); + data_.o.size -= static_cast<SizeType>(last - first); + return pos; + } + + //! Erase a member in object by its name. + /*! \param name Name of member to be removed. + \return Whether the member existed. + \note Linear time complexity. 
+ */ + bool EraseMember(const Ch* name) { + GenericValue n(StringRef(name)); + return EraseMember(n); + } + +#if RAPIDJSON_HAS_STDSTRING + bool EraseMember(const std::basic_string<Ch>& name) { return EraseMember(GenericValue(StringRef(name))); } +#endif + + template <typename SourceAllocator> + bool EraseMember(const GenericValue<Encoding, SourceAllocator>& name) { + MemberIterator m = FindMember(name); + if (m != MemberEnd()) { + EraseMember(m); + return true; + } + else + return false; + } + + Object GetObject() { RAPIDJSON_ASSERT(IsObject()); return Object(*this); } + ConstObject GetObject() const { RAPIDJSON_ASSERT(IsObject()); return ConstObject(*this); } + + //@} + + //!@name Array + //@{ + + //! Set this value as an empty array. + /*! \post IsArray() == true */ + GenericValue& SetArray() { this->~GenericValue(); new (this) GenericValue(kArrayType); return *this; } + + //! Get the number of elements in array. + /*! \pre IsArray() == true */ + SizeType Size() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size; } + + //! Get the capacity of array. + /*! \pre IsArray() == true */ + SizeType Capacity() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.capacity; } + + //! Check whether the array is empty. + /*! \pre IsArray() == true */ + bool Empty() const { RAPIDJSON_ASSERT(IsArray()); return data_.a.size == 0; } + + //! Remove all elements in the array. + /*! This function does not deallocate memory in the array, i.e. the capacity is unchanged. + \pre IsArray() == true + \note Linear time complexity. + */ + void Clear() { + RAPIDJSON_ASSERT(IsArray()); + GenericValue* e = GetElementsPointer(); + for (GenericValue* v = e; v != e + data_.a.size; ++v) + v->~GenericValue(); + data_.a.size = 0; + } + + //! Get an element from array by index. + /*! \pre IsArray() == true + \param index Zero-based index of element. 
+ \see operator[](T*) + */ + GenericValue& operator[](SizeType index) { + RAPIDJSON_ASSERT(IsArray()); + RAPIDJSON_ASSERT(index < data_.a.size); + return GetElementsPointer()[index]; + } + const GenericValue& operator[](SizeType index) const { return const_cast<GenericValue&>(*this)[index]; } + + //! Element iterator + /*! \pre IsArray() == true */ + ValueIterator Begin() { RAPIDJSON_ASSERT(IsArray()); return GetElementsPointer(); } + //! \em Past-the-end element iterator + /*! \pre IsArray() == true */ + ValueIterator End() { RAPIDJSON_ASSERT(IsArray()); return GetElementsPointer() + data_.a.size; } + //! Constant element iterator + /*! \pre IsArray() == true */ + ConstValueIterator Begin() const { return const_cast<GenericValue&>(*this).Begin(); } + //! Constant \em past-the-end element iterator + /*! \pre IsArray() == true */ + ConstValueIterator End() const { return const_cast<GenericValue&>(*this).End(); } + + //! Request the array to have enough capacity to store elements. + /*! \param newCapacity The capacity that the array needs to have at least. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \pre IsArray() == true + \return The value itself for fluent API. + \note Nothing is reallocated when \c newCapacity does not exceed the current capacity. + \note Linear time complexity. + */ + GenericValue& Reserve(SizeType newCapacity, Allocator &allocator) { + RAPIDJSON_ASSERT(IsArray()); + if (newCapacity > data_.a.capacity) { + SetElementsPointer(reinterpret_cast<GenericValue*>(allocator.Realloc(GetElementsPointer(), data_.a.capacity * sizeof(GenericValue), newCapacity * sizeof(GenericValue)))); + data_.a.capacity = newCapacity; + } + return *this; + } + + //! Append a GenericValue at the end of the array. + /*! \param value Value to be appended. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \pre IsArray() == true + \post value.IsNull() == true + \return The value itself for fluent API. 
+ \note The ownership of \c value will be transferred to this array on success. + \note If the number of elements to be appended is known, calls Reserve() once first may be more efficient. + \note Amortized constant time complexity. + */ + GenericValue& PushBack(GenericValue& value, Allocator& allocator) { + RAPIDJSON_ASSERT(IsArray()); + if (data_.a.size >= data_.a.capacity) + Reserve(data_.a.capacity == 0 ? kDefaultArrayCapacity : (data_.a.capacity + (data_.a.capacity + 1) / 2), allocator); + GetElementsPointer()[data_.a.size++].RawAssign(value); + return *this; + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericValue& PushBack(GenericValue&& value, Allocator& allocator) { + return PushBack(value, allocator); + } +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + + //! Append a constant string reference at the end of the array. + /*! \param value Constant string reference to be appended. + \param allocator Allocator for reallocating memory. It must be the same one used previously. Commonly use GenericDocument::GetAllocator(). + \pre IsArray() == true + \return The value itself for fluent API. + \note If the number of elements to be appended is known, calling Reserve() once first may be more efficient. + \note Amortized constant time complexity. + \see GenericStringRef + */ + GenericValue& PushBack(StringRefType value, Allocator& allocator) { + // Name the templated overload explicitly with T = StringRefType. + return (*this).template PushBack<StringRefType>(value, allocator); + } + + //! Append a primitive value at the end of the array. + /*! \tparam T Either \ref Type, \c int, \c unsigned, \c int64_t, \c uint64_t + \param value Value of primitive type T to be appended. + \param allocator Allocator for reallocating memory. It must be the same one as used before. Commonly use GenericDocument::GetAllocator(). + \pre IsArray() == true + \return The value itself for fluent API. + \note If the number of elements to be appended is known, calling Reserve() once first may be more efficient. 
+ + \note The source type \c T explicitly disallows all pointer types, + especially (\c const) \ref Ch*. This helps avoiding implicitly + referencing character strings with insufficient lifetime, use + \ref PushBack(GenericValue&, Allocator&) or \ref + PushBack(StringRefType, Allocator&). + All other pointer types would implicitly convert to \c bool, + use an explicit cast instead, if needed. + \note Amortized constant time complexity. + */ + template <typename T> + RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericValue&)) + PushBack(T value, Allocator& allocator) { + GenericValue v(value); + return PushBack(v, allocator); + } + + //! Remove the last element in the array. + /*! + \pre IsArray() == true && Empty() == false + \return The value itself for fluent API. + \note Constant time complexity. + */ + GenericValue& PopBack() { + RAPIDJSON_ASSERT(IsArray()); + RAPIDJSON_ASSERT(!Empty()); + GetElementsPointer()[--data_.a.size].~GenericValue(); + return *this; + } + + //! Remove an element of array by iterator. + /*! + \param pos iterator to the element to remove + \pre IsArray() == true && \ref Begin() <= \c pos < \ref End() + \return Iterator following the removed element. If the iterator pos refers to the last element, the End() iterator is returned. + \note Linear time complexity. + */ + ValueIterator Erase(ConstValueIterator pos) { + return Erase(pos, pos + 1); + } + + //! Remove elements in the range [first, last) of the array. + /*! + \param first iterator to the first element to remove + \param last iterator following the last element to remove + \pre IsArray() == true && \ref Begin() <= \c first <= \c last <= \ref End() + \return Iterator following the last removed element. + \note Linear time complexity. 
+ */ + ValueIterator Erase(ConstValueIterator first, ConstValueIterator last) { + RAPIDJSON_ASSERT(IsArray()); + RAPIDJSON_ASSERT(data_.a.size > 0); + RAPIDJSON_ASSERT(GetElementsPointer() != 0); + RAPIDJSON_ASSERT(first >= Begin()); + RAPIDJSON_ASSERT(first <= last); + RAPIDJSON_ASSERT(last <= End()); + ValueIterator pos = Begin() + (first - Begin()); + for (ValueIterator itr = pos; itr != last; ++itr) + itr->~GenericValue(); + std::memmove(pos, last, static_cast<size_t>(End() - last) * sizeof(GenericValue)); + data_.a.size -= static_cast<SizeType>(last - first); + return pos; + } + + Array GetArray() { RAPIDJSON_ASSERT(IsArray()); return Array(*this); } + ConstArray GetArray() const { RAPIDJSON_ASSERT(IsArray()); return ConstArray(*this); } + + //@} + + //!@name Number + //@{ + + // Typed getters: each asserts that the matching type flag is set and returns the stored field unconverted. + int GetInt() const { RAPIDJSON_ASSERT(data_.f.flags & kIntFlag); return data_.n.i.i; } + unsigned GetUint() const { RAPIDJSON_ASSERT(data_.f.flags & kUintFlag); return data_.n.u.u; } + int64_t GetInt64() const { RAPIDJSON_ASSERT(data_.f.flags & kInt64Flag); return data_.n.i64; } + uint64_t GetUint64() const { RAPIDJSON_ASSERT(data_.f.flags & kUint64Flag); return data_.n.u64; } + + //! Get the value as double type. + /*! \pre IsNumber() == true + \note If the value is 64-bit integer type, it may lose precision. Use \c IsLosslessDouble() to check whether the conversion is lossless. + */ + double GetDouble() const { + RAPIDJSON_ASSERT(IsNumber()); + if ((data_.f.flags & kDoubleFlag) != 0) return data_.n.d; // exact type, no conversion. + if ((data_.f.flags & kIntFlag) != 0) return data_.n.i.i; // int -> double + if ((data_.f.flags & kUintFlag) != 0) return data_.n.u.u; // unsigned -> double + if ((data_.f.flags & kInt64Flag) != 0) return static_cast<double>(data_.n.i64); // int64_t -> double (may lose precision) + RAPIDJSON_ASSERT((data_.f.flags & kUint64Flag) != 0); return static_cast<double>(data_.n.u64); // uint64_t -> double (may lose precision) + } + + //! Get the value as float type. + /*! 
\note If the value is 64-bit integer type, it may lose precision. Use \c IsLosslessFloat() to check whether the converison is lossless. + */ + float GetFloat() const { + return static_cast<float>(GetDouble()); + } + + GenericValue& SetInt(int i) { this->~GenericValue(); new (this) GenericValue(i); return *this; } + GenericValue& SetUint(unsigned u) { this->~GenericValue(); new (this) GenericValue(u); return *this; } + GenericValue& SetInt64(int64_t i64) { this->~GenericValue(); new (this) GenericValue(i64); return *this; } + GenericValue& SetUint64(uint64_t u64) { this->~GenericValue(); new (this) GenericValue(u64); return *this; } + GenericValue& SetDouble(double d) { this->~GenericValue(); new (this) GenericValue(d); return *this; } + GenericValue& SetFloat(float f) { this->~GenericValue(); new (this) GenericValue(static_cast<double>(f)); return *this; } + + //@} + + //!@name String + //@{ + + //! Get the string pointer. + /*! Returns the inline buffer when kInlineStrFlag is set, otherwise the stored string pointer. + \pre IsString() == true + */ + const Ch* GetString() const { RAPIDJSON_ASSERT(IsString()); return (data_.f.flags & kInlineStrFlag) ? data_.ss.str : GetStringPointer(); } + + //! Get the length of string. + /*! Since rapidjson permits "\\u0000" in the json string, strlen(v.GetString()) may not be equal to v.GetStringLength(). + \pre IsString() == true + */ + SizeType GetStringLength() const { RAPIDJSON_ASSERT(IsString()); return ((data_.f.flags & kInlineStrFlag) ? (data_.ss.GetLength()) : data_.s.length); } + + //! Set this value as a string without copying source string. + /*! This version has better performance with supplied length, and also supports strings containing null characters. + \param s source string pointer. + \param length The length of source string, excluding the trailing null terminator. + \return The value itself for fluent API. + \post IsString() == true && GetString() == s && GetStringLength() == length + \see SetString(StringRefType) + */ + GenericValue& SetString(const Ch* s, SizeType length) { return SetString(StringRef(s, length)); } + + //! Set this value as a string without copying source string. + /*! 
\param s source string reference + \return The value itself for fluent API. + \post IsString() == true && GetString() == s && GetStringLength() == s.length + */ + GenericValue& SetString(StringRefType s) { this->~GenericValue(); SetStringRaw(s); return *this; } + + //! Set this value as a string by copying from source string. + /*! This version has better performance with supplied length, and also supports strings containing null characters. + \param s source string. + \param length The length of source string, excluding the trailing null terminator. + \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \post IsString() == true && GetString() != s && strcmp(GetString(),s) == 0 && GetStringLength() == length + */ + GenericValue& SetString(const Ch* s, SizeType length, Allocator& allocator) { this->~GenericValue(); SetStringRaw(StringRef(s, length), allocator); return *this; } + + //! Set this value as a string by copying from source string. + /*! The length is computed with internal::StrLen(s). + \param s source string. + \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \post IsString() == true && GetString() != s && strcmp(GetString(),s) == 0 && GetStringLength() == internal::StrLen(s) + */ + GenericValue& SetString(const Ch* s, Allocator& allocator) { return SetString(s, internal::StrLen(s), allocator); } + +#if RAPIDJSON_HAS_STDSTRING + //! Set this value as a string by copying from source string. + /*! \param s source string. + \param allocator Allocator for allocating copied buffer. Commonly use GenericDocument::GetAllocator(). + \return The value itself for fluent API. + \post IsString() == true && GetString() != s.data() && strcmp(GetString(),s.data()) == 0 && GetStringLength() == s.size() + \note Requires the definition of the preprocessor symbol \ref RAPIDJSON_HAS_STDSTRING. 
+ */ + GenericValue& SetString(const std::basic_string<Ch>& s, Allocator& allocator) { return SetString(s.data(), SizeType(s.size()), allocator); } +#endif + + //@} + + //!@name Array + //@{ + + //! Templated version for checking whether this value is type T. + /*! + \tparam T Either \c bool, \c int, \c unsigned, \c int64_t, \c uint64_t, \c double, \c float, \c const \c char*, \c std::basic_string<Ch> + */ + template <typename T> + bool Is() const { return internal::TypeHelper<ValueType, T>::Is(*this); } + + template <typename T> + T Get() const { return internal::TypeHelper<ValueType, T>::Get(*this); } + + template <typename T> + T Get() { return internal::TypeHelper<ValueType, T>::Get(*this); } + + template<typename T> + ValueType& Set(const T& data) { return internal::TypeHelper<ValueType, T>::Set(*this, data); } + + template<typename T> + ValueType& Set(const T& data, AllocatorType& allocator) { return internal::TypeHelper<ValueType, T>::Set(*this, data, allocator); } + + //@} + + //! Generate events of this value to a Handler. + /*! This function adopts the GoF visitor pattern. + Typical usage is to output this JSON value as JSON text via Writer, which is a Handler. + It can also be used to deep clone this value via GenericDocument, which is also a Handler. + \tparam Handler type of handler. + \param handler An object implementing concept Handler. + */ + template <typename Handler> + bool Accept(Handler& handler) const { + switch(GetType()) { + case kNullType: return handler.Null(); + case kFalseType: return handler.Bool(false); + case kTrueType: return handler.Bool(true); + + case kObjectType: + if (RAPIDJSON_UNLIKELY(!handler.StartObject())) + return false; + for (ConstMemberIterator m = MemberBegin(); m != MemberEnd(); ++m) { + RAPIDJSON_ASSERT(m->name.IsString()); // User may change the type of name by MemberIterator. 
+ if (RAPIDJSON_UNLIKELY(!handler.Key(m->name.GetString(), m->name.GetStringLength(), (m->name.data_.f.flags & kCopyFlag) != 0))) + return false; + if (RAPIDJSON_UNLIKELY(!m->value.Accept(handler))) + return false; + } + return handler.EndObject(data_.o.size); + + case kArrayType: + if (RAPIDJSON_UNLIKELY(!handler.StartArray())) + return false; + for (const GenericValue* v = Begin(); v != End(); ++v) + if (RAPIDJSON_UNLIKELY(!v->Accept(handler))) + return false; + return handler.EndArray(data_.a.size); + + case kStringType: + return handler.String(GetString(), GetStringLength(), (data_.f.flags & kCopyFlag) != 0); + + default: + RAPIDJSON_ASSERT(GetType() == kNumberType); + if (IsDouble()) return handler.Double(data_.n.d); + else if (IsInt()) return handler.Int(data_.n.i.i); + else if (IsUint()) return handler.Uint(data_.n.u.u); + else if (IsInt64()) return handler.Int64(data_.n.i64); + else return handler.Uint64(data_.n.u64); + } + } + +private: + template <typename, typename> friend class GenericValue; + template <typename, typename, typename> friend class GenericDocument; + + enum { + kBoolFlag = 0x0008, + kNumberFlag = 0x0010, + kIntFlag = 0x0020, + kUintFlag = 0x0040, + kInt64Flag = 0x0080, + kUint64Flag = 0x0100, + kDoubleFlag = 0x0200, + kStringFlag = 0x0400, + kCopyFlag = 0x0800, + kInlineStrFlag = 0x1000, + + // Initial flags of different types. 
+ kNullFlag = kNullType, + kTrueFlag = kTrueType | kBoolFlag, + kFalseFlag = kFalseType | kBoolFlag, + kNumberIntFlag = kNumberType | kNumberFlag | kIntFlag | kInt64Flag, + kNumberUintFlag = kNumberType | kNumberFlag | kUintFlag | kUint64Flag | kInt64Flag, + kNumberInt64Flag = kNumberType | kNumberFlag | kInt64Flag, + kNumberUint64Flag = kNumberType | kNumberFlag | kUint64Flag, + kNumberDoubleFlag = kNumberType | kNumberFlag | kDoubleFlag, + kNumberAnyFlag = kNumberType | kNumberFlag | kIntFlag | kInt64Flag | kUintFlag | kUint64Flag | kDoubleFlag, + kConstStringFlag = kStringType | kStringFlag, + kCopyStringFlag = kStringType | kStringFlag | kCopyFlag, + kShortStringFlag = kStringType | kStringFlag | kCopyFlag | kInlineStrFlag, + kObjectFlag = kObjectType, + kArrayFlag = kArrayType, + + kTypeMask = 0x07 + }; + + static const SizeType kDefaultArrayCapacity = 16; + static const SizeType kDefaultObjectCapacity = 16; + + struct Flag { +#if RAPIDJSON_48BITPOINTER_OPTIMIZATION + char payload[sizeof(SizeType) * 2 + 6]; // 2 x SizeType + lower 48-bit pointer +#elif RAPIDJSON_64BIT + char payload[sizeof(SizeType) * 2 + sizeof(void*) + 6]; // 6 padding bytes +#else + char payload[sizeof(SizeType) * 2 + sizeof(void*) + 2]; // 2 padding bytes +#endif + uint16_t flags; + }; + + struct String { + SizeType length; + SizeType hashcode; //!< reserved + const Ch* str; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + // implementation detail: ShortString can represent zero-terminated strings up to MaxSize chars + // (excluding the terminating zero) and store a value to determine the length of the contained + // string in the last character str[LenPos] by storing "MaxSize - length" there. If the string + // to store has the maximal length of MaxSize then str[LenPos] will be 0 and therefore act as + // the string terminator as well. For getting the string length back from that value just use + // "MaxSize - str[LenPos]". 
+ // This allows to store 13-chars strings in 32-bit mode, 21-chars strings in 64-bit mode, + // 13-chars strings for RAPIDJSON_48BITPOINTER_OPTIMIZATION=1 inline (for `UTF8`-encoded strings). + struct ShortString { + enum { MaxChars = sizeof(static_cast<Flag*>(0)->payload) / sizeof(Ch), MaxSize = MaxChars - 1, LenPos = MaxSize }; + Ch str[MaxChars]; + + inline static bool Usable(SizeType len) { return (MaxSize >= len); } + inline void SetLength(SizeType len) { str[LenPos] = static_cast<Ch>(MaxSize - len); } + inline SizeType GetLength() const { return static_cast<SizeType>(MaxSize - str[LenPos]); } + }; // at most as many bytes as "String" above => 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + // By using proper binary layout, retrieval of different integer types do not need conversions. + union Number { +#if RAPIDJSON_ENDIAN == RAPIDJSON_LITTLEENDIAN + struct I { + int i; + char padding[4]; + }i; + struct U { + unsigned u; + char padding2[4]; + }u; +#else + struct I { + char padding[4]; + int i; + }i; + struct U { + char padding2[4]; + unsigned u; + }u; +#endif + int64_t i64; + uint64_t u64; + double d; + }; // 8 bytes + + struct ObjectData { + SizeType size; + SizeType capacity; + Member* members; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + struct ArrayData { + SizeType size; + SizeType capacity; + GenericValue* elements; + }; // 12 bytes in 32-bit mode, 16 bytes in 64-bit mode + + union Data { + String s; + ShortString ss; + Number n; + ObjectData o; + ArrayData a; + Flag f; + }; // 16 bytes in 32-bit mode, 24 bytes in 64-bit mode, 16 bytes in 64-bit with RAPIDJSON_48BITPOINTER_OPTIMIZATION + + RAPIDJSON_FORCEINLINE const Ch* GetStringPointer() const { return RAPIDJSON_GETPOINTER(Ch, data_.s.str); } + RAPIDJSON_FORCEINLINE const Ch* SetStringPointer(const Ch* str) { return RAPIDJSON_SETPOINTER(Ch, data_.s.str, str); } + RAPIDJSON_FORCEINLINE GenericValue* GetElementsPointer() const { return RAPIDJSON_GETPOINTER(GenericValue, 
data_.a.elements); } + RAPIDJSON_FORCEINLINE GenericValue* SetElementsPointer(GenericValue* elements) { return RAPIDJSON_SETPOINTER(GenericValue, data_.a.elements, elements); } + RAPIDJSON_FORCEINLINE Member* GetMembersPointer() const { return RAPIDJSON_GETPOINTER(Member, data_.o.members); } + RAPIDJSON_FORCEINLINE Member* SetMembersPointer(Member* members) { return RAPIDJSON_SETPOINTER(Member, data_.o.members, members); } + + // Initialize this value as array with initial data, without calling destructor. + void SetArrayRaw(GenericValue* values, SizeType count, Allocator& allocator) { + data_.f.flags = kArrayFlag; + if (count) { + GenericValue* e = static_cast<GenericValue*>(allocator.Malloc(count * sizeof(GenericValue))); + SetElementsPointer(e); + std::memcpy(e, values, count * sizeof(GenericValue)); + } + else + SetElementsPointer(0); + data_.a.size = data_.a.capacity = count; + } + + //! Initialize this value as object with initial data, without calling destructor. + void SetObjectRaw(Member* members, SizeType count, Allocator& allocator) { + data_.f.flags = kObjectFlag; + if (count) { + Member* m = static_cast<Member*>(allocator.Malloc(count * sizeof(Member))); + SetMembersPointer(m); + std::memcpy(m, members, count * sizeof(Member)); + } + else + SetMembersPointer(0); + data_.o.size = data_.o.capacity = count; + } + + //! Initialize this value as constant string, without calling destructor. + void SetStringRaw(StringRefType s) RAPIDJSON_NOEXCEPT { + data_.f.flags = kConstStringFlag; + SetStringPointer(s); + data_.s.length = s.length; + } + + //! Initialize this value as copy string with initial data, without calling destructor. 
+ void SetStringRaw(StringRefType s, Allocator& allocator) { + Ch* str = 0; + if (ShortString::Usable(s.length)) { + data_.f.flags = kShortStringFlag; + data_.ss.SetLength(s.length); + str = data_.ss.str; + } else { + data_.f.flags = kCopyStringFlag; + data_.s.length = s.length; + str = static_cast<Ch *>(allocator.Malloc((s.length + 1) * sizeof(Ch))); + SetStringPointer(str); + } + std::memcpy(str, s, s.length * sizeof(Ch)); + str[s.length] = '\0'; + } + + //! Assignment without calling destructor + void RawAssign(GenericValue& rhs) RAPIDJSON_NOEXCEPT { + data_ = rhs.data_; + // data_.f.flags = rhs.data_.f.flags; + rhs.data_.f.flags = kNullFlag; + } + + template <typename SourceAllocator> + bool StringEqual(const GenericValue<Encoding, SourceAllocator>& rhs) const { + RAPIDJSON_ASSERT(IsString()); + RAPIDJSON_ASSERT(rhs.IsString()); + + const SizeType len1 = GetStringLength(); + const SizeType len2 = rhs.GetStringLength(); + if(len1 != len2) { return false; } + + const Ch* const str1 = GetString(); + const Ch* const str2 = rhs.GetString(); + if(str1 == str2) { return true; } // fast path for constant string + + return (std::memcmp(str1, str2, sizeof(Ch) * len1) == 0); + } + + Data data_; +}; + +//! GenericValue with UTF8 encoding +typedef GenericValue<UTF8<> > Value; + +/////////////////////////////////////////////////////////////////////////////// +// GenericDocument + +//! A document for parsing JSON text as DOM. +/*! + \note implements Handler concept + \tparam Encoding Encoding for both parsing and string storage. + \tparam Allocator Allocator for allocating memory for the DOM + \tparam StackAllocator Allocator for allocating memory for stack during parsing. + \warning Although GenericDocument inherits from GenericValue, the API does \b not provide any virtual functions, especially no virtual destructor. To avoid memory leaks, do not \c delete a GenericDocument object via a pointer to a GenericValue. 
+*/ +template <typename Encoding, typename Allocator = MemoryPoolAllocator<>, typename StackAllocator = CrtAllocator> +class GenericDocument : public GenericValue<Encoding, Allocator> { +public: + typedef typename Encoding::Ch Ch; //!< Character type derived from Encoding. + typedef GenericValue<Encoding, Allocator> ValueType; //!< Value type of the document. + typedef Allocator AllocatorType; //!< Allocator type from template parameter. + + //! Constructor + /*! Creates an empty document of specified type. + \param type Mandatory type of object to create. + \param allocator Optional allocator for allocating memory. + \param stackCapacity Optional initial capacity of stack in bytes. + \param stackAllocator Optional allocator for allocating memory for stack. + */ + explicit GenericDocument(Type type, Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) : + GenericValue<Encoding, Allocator>(type), allocator_(allocator), ownAllocator_(0), stack_(stackAllocator, stackCapacity), parseResult_() + { + if (!allocator_) + ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)(); + } + + //! Constructor + /*! Creates an empty document which type is Null. + \param allocator Optional allocator for allocating memory. + \param stackCapacity Optional initial capacity of stack in bytes. + \param stackAllocator Optional allocator for allocating memory for stack. + */ + GenericDocument(Allocator* allocator = 0, size_t stackCapacity = kDefaultStackCapacity, StackAllocator* stackAllocator = 0) : + allocator_(allocator), ownAllocator_(0), stack_(stackAllocator, stackCapacity), parseResult_() + { + if (!allocator_) + ownAllocator_ = allocator_ = RAPIDJSON_NEW(Allocator)(); + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! 
Move constructor in C++11 + GenericDocument(GenericDocument&& rhs) RAPIDJSON_NOEXCEPT + : ValueType(std::forward<ValueType>(rhs)), // explicit cast to avoid prohibited move from Document + allocator_(rhs.allocator_), + ownAllocator_(rhs.ownAllocator_), + stack_(std::move(rhs.stack_)), + parseResult_(rhs.parseResult_) + { + rhs.allocator_ = 0; + rhs.ownAllocator_ = 0; + rhs.parseResult_ = ParseResult(); + } +#endif + + ~GenericDocument() { + Destroy(); + } + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + //! Move assignment in C++11 + GenericDocument& operator=(GenericDocument&& rhs) RAPIDJSON_NOEXCEPT + { + // The cast to ValueType is necessary here, because otherwise it would + // attempt to call GenericValue's templated assignment operator. + ValueType::operator=(std::forward<ValueType>(rhs)); + + // Calling the destructor here would prematurely call stack_'s destructor + Destroy(); + + allocator_ = rhs.allocator_; + ownAllocator_ = rhs.ownAllocator_; + stack_ = std::move(rhs.stack_); + parseResult_ = rhs.parseResult_; + + rhs.allocator_ = 0; + rhs.ownAllocator_ = 0; + rhs.parseResult_ = ParseResult(); + + return *this; + } +#endif + + //! Exchange the contents of this document with those of another. + /*! + \param rhs Another document. + \note Constant complexity. + \see GenericValue::Swap + */ + GenericDocument& Swap(GenericDocument& rhs) RAPIDJSON_NOEXCEPT { + ValueType::Swap(rhs); + stack_.Swap(rhs.stack_); + internal::Swap(allocator_, rhs.allocator_); + internal::Swap(ownAllocator_, rhs.ownAllocator_); + internal::Swap(parseResult_, rhs.parseResult_); + return *this; + } + + //! free-standing swap function helper + /*! + Helper function to enable support for common swap implementation pattern based on \c std::swap: + \code + void swap(MyClass& a, MyClass& b) { + using std::swap; + swap(a.doc, b.doc); + // ... + } + \endcode + \see Swap() + */ + friend inline void swap(GenericDocument& a, GenericDocument& b) RAPIDJSON_NOEXCEPT { a.Swap(b); } + + //! 
Populate this document by a generator which produces SAX events. + /*! \tparam Generator A functor with <tt>bool f(Handler)</tt> prototype. + \param g Generator functor which sends SAX events to the parameter. + \return The document itself for fluent API. + */ + template <typename Generator> + GenericDocument& Populate(Generator& g) { + ClearStackOnExit scope(*this); + if (g(*this)) { + RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object + ValueType::operator=(*stack_.template Pop<ValueType>(1));// Move value from stack to document + } + return *this; + } + + //!@name Parse from stream + //!@{ + + //! Parse JSON text from an input stream (with Encoding conversion) + /*! \tparam parseFlags Combination of \ref ParseFlag. + \tparam SourceEncoding Encoding of input stream + \tparam InputStream Type of input stream, implementing Stream concept + \param is Input stream to be parsed. + \return The document itself for fluent API. + */ + template <unsigned parseFlags, typename SourceEncoding, typename InputStream> + GenericDocument& ParseStream(InputStream& is) { + GenericReader<SourceEncoding, Encoding, StackAllocator> reader( + stack_.HasAllocator() ? &stack_.GetAllocator() : 0); + ClearStackOnExit scope(*this); + parseResult_ = reader.template Parse<parseFlags>(is, *this); + if (parseResult_) { + RAPIDJSON_ASSERT(stack_.GetSize() == sizeof(ValueType)); // Got one and only one root object + ValueType::operator=(*stack_.template Pop<ValueType>(1));// Move value from stack to document + } + return *this; + } + + //! Parse JSON text from an input stream + /*! \tparam parseFlags Combination of \ref ParseFlag. + \tparam InputStream Type of input stream, implementing Stream concept + \param is Input stream to be parsed. + \return The document itself for fluent API. 
+ */ + template <unsigned parseFlags, typename InputStream> + GenericDocument& ParseStream(InputStream& is) { + return ParseStream<parseFlags, Encoding, InputStream>(is); + } + + //! Parse JSON text from an input stream (with \ref kParseDefaultFlags) + /*! \tparam InputStream Type of input stream, implementing Stream concept + \param is Input stream to be parsed. + \return The document itself for fluent API. + */ + template <typename InputStream> + GenericDocument& ParseStream(InputStream& is) { + return ParseStream<kParseDefaultFlags, Encoding, InputStream>(is); + } + //!@} + + //!@name Parse in-place from mutable string + //!@{ + + //! Parse JSON text from a mutable string + /*! \tparam parseFlags Combination of \ref ParseFlag. + \param str Mutable zero-terminated string to be parsed. + \return The document itself for fluent API. + */ + template <unsigned parseFlags> + GenericDocument& ParseInsitu(Ch* str) { + GenericInsituStringStream<Encoding> s(str); + return ParseStream<parseFlags | kParseInsituFlag>(s); + } + + //! Parse JSON text from a mutable string (with \ref kParseDefaultFlags) + /*! \param str Mutable zero-terminated string to be parsed. + \return The document itself for fluent API. + */ + GenericDocument& ParseInsitu(Ch* str) { + return ParseInsitu<kParseDefaultFlags>(str); + } + //!@} + + //!@name Parse from read-only string + //!@{ + + //! Parse JSON text from a read-only string (with Encoding conversion) + /*! \tparam parseFlags Combination of \ref ParseFlag (must not contain \ref kParseInsituFlag). + \tparam SourceEncoding Transcoding from input Encoding + \param str Read-only zero-terminated string to be parsed. + */ + template <unsigned parseFlags, typename SourceEncoding> + GenericDocument& Parse(const typename SourceEncoding::Ch* str) { + RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); + GenericStringStream<SourceEncoding> s(str); + return ParseStream<parseFlags, SourceEncoding>(s); + } + + //! Parse JSON text from a read-only string + /*! 
\tparam parseFlags Combination of \ref ParseFlag (must not contain \ref kParseInsituFlag). + \param str Read-only zero-terminated string to be parsed. + */ + template <unsigned parseFlags> + GenericDocument& Parse(const Ch* str) { + return Parse<parseFlags, Encoding>(str); + } + + //! Parse JSON text from a read-only string (with \ref kParseDefaultFlags) + /*! \param str Read-only zero-terminated string to be parsed. + */ + GenericDocument& Parse(const Ch* str) { + return Parse<kParseDefaultFlags>(str); + } + + template <unsigned parseFlags, typename SourceEncoding> + GenericDocument& Parse(const typename SourceEncoding::Ch* str, size_t length) { + RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); + MemoryStream ms(reinterpret_cast<const char*>(str), length * sizeof(typename SourceEncoding::Ch)); + EncodedInputStream<SourceEncoding, MemoryStream> is(ms); + ParseStream<parseFlags, SourceEncoding>(is); + return *this; + } + + template <unsigned parseFlags> + GenericDocument& Parse(const Ch* str, size_t length) { + return Parse<parseFlags, Encoding>(str, length); + } + + GenericDocument& Parse(const Ch* str, size_t length) { + return Parse<kParseDefaultFlags>(str, length); + } + +#if RAPIDJSON_HAS_STDSTRING + template <unsigned parseFlags, typename SourceEncoding> + GenericDocument& Parse(const std::basic_string<typename SourceEncoding::Ch>& str) { + // c_str() is constant complexity according to standard. Should be faster than Parse(const char*, size_t) + return Parse<parseFlags, SourceEncoding>(str.c_str()); + } + + template <unsigned parseFlags> + GenericDocument& Parse(const std::basic_string<Ch>& str) { + return Parse<parseFlags, Encoding>(str.c_str()); + } + + GenericDocument& Parse(const std::basic_string<Ch>& str) { + return Parse<kParseDefaultFlags>(str); + } +#endif // RAPIDJSON_HAS_STDSTRING + + //!@} + + //!@name Handling parse errors + //!@{ + + //! Whether a parse error has occured in the last parsing. 
+ bool HasParseError() const { return parseResult_.IsError(); } + + //! Get the \ref ParseErrorCode of last parsing. + ParseErrorCode GetParseError() const { return parseResult_.Code(); } + + //! Get the position of last parsing error in input, 0 otherwise. + size_t GetErrorOffset() const { return parseResult_.Offset(); } + + //! Implicit conversion to get the last parse result +#ifndef __clang // -Wdocumentation + /*! \return \ref ParseResult of the last parse operation + + \code + Document doc; + ParseResult ok = doc.Parse(json); + if (!ok) + printf( "JSON parse error: %s (%u)\n", GetParseError_En(ok.Code()), ok.Offset()); + \endcode + */ +#endif + operator ParseResult() const { return parseResult_; } + //!@} + + //! Get the allocator of this document. + Allocator& GetAllocator() { + RAPIDJSON_ASSERT(allocator_); + return *allocator_; + } + + //! Get the capacity of stack in bytes. + size_t GetStackCapacity() const { return stack_.GetCapacity(); } + +private: + // clear stack on any exit from ParseStream, e.g. 
due to exception + struct ClearStackOnExit { + explicit ClearStackOnExit(GenericDocument& d) : d_(d) {} + ~ClearStackOnExit() { d_.ClearStack(); } + private: + ClearStackOnExit(const ClearStackOnExit&); + ClearStackOnExit& operator=(const ClearStackOnExit&); + GenericDocument& d_; + }; + + // callers of the following private Handler functions + // template <typename,typename,typename> friend class GenericReader; // for parsing + template <typename, typename> friend class GenericValue; // for deep copying + +public: + // Implementation of Handler + bool Null() { new (stack_.template Push<ValueType>()) ValueType(); return true; } + bool Bool(bool b) { new (stack_.template Push<ValueType>()) ValueType(b); return true; } + bool Int(int i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; } + bool Uint(unsigned i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; } + bool Int64(int64_t i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; } + bool Uint64(uint64_t i) { new (stack_.template Push<ValueType>()) ValueType(i); return true; } + bool Double(double d) { new (stack_.template Push<ValueType>()) ValueType(d); return true; } + + bool RawNumber(const Ch* str, SizeType length, bool copy) { + if (copy) + new (stack_.template Push<ValueType>()) ValueType(str, length, GetAllocator()); + else + new (stack_.template Push<ValueType>()) ValueType(str, length); + return true; + } + + bool String(const Ch* str, SizeType length, bool copy) { + if (copy) + new (stack_.template Push<ValueType>()) ValueType(str, length, GetAllocator()); + else + new (stack_.template Push<ValueType>()) ValueType(str, length); + return true; + } + + bool StartObject() { new (stack_.template Push<ValueType>()) ValueType(kObjectType); return true; } + + bool Key(const Ch* str, SizeType length, bool copy) { return String(str, length, copy); } + + bool EndObject(SizeType memberCount) { + typename ValueType::Member* members = stack_.template 
Pop<typename ValueType::Member>(memberCount); + stack_.template Top<ValueType>()->SetObjectRaw(members, memberCount, GetAllocator()); + return true; + } + + bool StartArray() { new (stack_.template Push<ValueType>()) ValueType(kArrayType); return true; } + + bool EndArray(SizeType elementCount) { + ValueType* elements = stack_.template Pop<ValueType>(elementCount); + stack_.template Top<ValueType>()->SetArrayRaw(elements, elementCount, GetAllocator()); + return true; + } + +private: + //! Prohibit copying + GenericDocument(const GenericDocument&); + //! Prohibit assignment + GenericDocument& operator=(const GenericDocument&); + + void ClearStack() { + if (Allocator::kNeedFree) + while (stack_.GetSize() > 0) // Here assumes all elements in stack array are GenericValue (Member is actually 2 GenericValue objects) + (stack_.template Pop<ValueType>(1))->~ValueType(); + else + stack_.Clear(); + stack_.ShrinkToFit(); + } + + void Destroy() { + RAPIDJSON_DELETE(ownAllocator_); + } + + static const size_t kDefaultStackCapacity = 1024; + Allocator* allocator_; + Allocator* ownAllocator_; + internal::Stack<StackAllocator> stack_; + ParseResult parseResult_; +}; + +//! GenericDocument with UTF8 encoding +typedef GenericDocument<UTF8<> > Document; + +//! Helper class for accessing Value of array type. +/*! + Instance of this helper class is obtained by \c GenericValue::GetArray(). + In addition to all APIs for array type, it provides range-based for loop if \c RAPIDJSON_HAS_CXX11_RANGE_FOR=1. 
+*/ +template <bool Const, typename ValueT> +class GenericArray { +public: + typedef GenericArray<true, ValueT> ConstArray; + typedef GenericArray<false, ValueT> Array; + typedef ValueT PlainType; + typedef typename internal::MaybeAddConst<Const,PlainType>::Type ValueType; + typedef ValueType* ValueIterator; // This may be const or non-const iterator + typedef const ValueT* ConstValueIterator; + typedef typename ValueType::AllocatorType AllocatorType; + typedef typename ValueType::StringRefType StringRefType; + + template <typename, typename> + friend class GenericValue; + + GenericArray(const GenericArray& rhs) : value_(rhs.value_) {} + GenericArray& operator=(const GenericArray& rhs) { value_ = rhs.value_; return *this; } + ~GenericArray() {} + + SizeType Size() const { return value_.Size(); } + SizeType Capacity() const { return value_.Capacity(); } + bool Empty() const { return value_.Empty(); } + void Clear() const { value_.Clear(); } + ValueType& operator[](SizeType index) const { return value_[index]; } + ValueIterator Begin() const { return value_.Begin(); } + ValueIterator End() const { return value_.End(); } + GenericArray Reserve(SizeType newCapacity, AllocatorType &allocator) const { value_.Reserve(newCapacity, allocator); return *this; } + GenericArray PushBack(ValueType& value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericArray PushBack(ValueType&& value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericArray PushBack(StringRefType value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } + template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (const GenericArray&)) PushBack(T value, AllocatorType& allocator) const { value_.PushBack(value, allocator); return *this; } + 
GenericArray PopBack() const { value_.PopBack(); return *this; } + ValueIterator Erase(ConstValueIterator pos) const { return value_.Erase(pos); } + ValueIterator Erase(ConstValueIterator first, ConstValueIterator last) const { return value_.Erase(first, last); } + +#if RAPIDJSON_HAS_CXX11_RANGE_FOR + ValueIterator begin() const { return value_.Begin(); } + ValueIterator end() const { return value_.End(); } +#endif + +private: + GenericArray(); + GenericArray(ValueType& value) : value_(value) {} + ValueType& value_; +}; + +//! Helper class for accessing Value of object type. +/*! + Instance of this helper class is obtained by \c GenericValue::GetObject(). + In addition to all APIs for array type, it provides range-based for loop if \c RAPIDJSON_HAS_CXX11_RANGE_FOR=1. +*/ +template <bool Const, typename ValueT> +class GenericObject { +public: + typedef GenericObject<true, ValueT> ConstObject; + typedef GenericObject<false, ValueT> Object; + typedef ValueT PlainType; + typedef typename internal::MaybeAddConst<Const,PlainType>::Type ValueType; + typedef GenericMemberIterator<Const, typename ValueT::EncodingType, typename ValueT::AllocatorType> MemberIterator; // This may be const or non-const iterator + typedef GenericMemberIterator<true, typename ValueT::EncodingType, typename ValueT::AllocatorType> ConstMemberIterator; + typedef typename ValueType::AllocatorType AllocatorType; + typedef typename ValueType::StringRefType StringRefType; + typedef typename ValueType::EncodingType EncodingType; + typedef typename ValueType::Ch Ch; + + template <typename, typename> + friend class GenericValue; + + GenericObject(const GenericObject& rhs) : value_(rhs.value_) {} + GenericObject& operator=(const GenericObject& rhs) { value_ = rhs.value_; return *this; } + ~GenericObject() {} + + SizeType MemberCount() const { return value_.MemberCount(); } + bool ObjectEmpty() const { return value_.ObjectEmpty(); } + template <typename T> ValueType& operator[](T* name) const { return 
value_[name]; } + template <typename SourceAllocator> ValueType& operator[](const GenericValue<EncodingType, SourceAllocator>& name) const { return value_[name]; } +#if RAPIDJSON_HAS_STDSTRING + ValueType& operator[](const std::basic_string<Ch>& name) const { return value_[name]; } +#endif + MemberIterator MemberBegin() const { return value_.MemberBegin(); } + MemberIterator MemberEnd() const { return value_.MemberEnd(); } + bool HasMember(const Ch* name) const { return value_.HasMember(name); } +#if RAPIDJSON_HAS_STDSTRING + bool HasMember(const std::basic_string<Ch>& name) const { return value_.HasMember(name); } +#endif + template <typename SourceAllocator> bool HasMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.HasMember(name); } + MemberIterator FindMember(const Ch* name) const { return value_.FindMember(name); } + template <typename SourceAllocator> MemberIterator FindMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.FindMember(name); } +#if RAPIDJSON_HAS_STDSTRING + MemberIterator FindMember(const std::basic_string<Ch>& name) const { return value_.FindMember(name); } +#endif + GenericObject AddMember(ValueType& name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(ValueType& name, StringRefType value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } +#if RAPIDJSON_HAS_STDSTRING + GenericObject AddMember(ValueType& name, std::basic_string<Ch>& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } +#endif + template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (ValueType&)) AddMember(ValueType& name, T value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + 
GenericObject AddMember(ValueType&& name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(ValueType&& name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(ValueType& name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(StringRefType name, ValueType&& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } +#endif // RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericObject AddMember(StringRefType name, ValueType& value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + GenericObject AddMember(StringRefType name, StringRefType value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + template <typename T> RAPIDJSON_DISABLEIF_RETURN((internal::OrExpr<internal::IsPointer<T>, internal::IsGenericValue<T> >), (GenericObject)) AddMember(StringRefType name, T value, AllocatorType& allocator) const { value_.AddMember(name, value, allocator); return *this; } + void RemoveAllMembers() { value_.RemoveAllMembers(); } + bool RemoveMember(const Ch* name) const { return value_.RemoveMember(name); } +#if RAPIDJSON_HAS_STDSTRING + bool RemoveMember(const std::basic_string<Ch>& name) const { return value_.RemoveMember(name); } +#endif + template <typename SourceAllocator> bool RemoveMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.RemoveMember(name); } + MemberIterator RemoveMember(MemberIterator m) const { return value_.RemoveMember(m); } + MemberIterator EraseMember(ConstMemberIterator pos) const { return value_.EraseMember(pos); } + MemberIterator EraseMember(ConstMemberIterator first, ConstMemberIterator last) const { return value_.EraseMember(first, last); } + 
bool EraseMember(const Ch* name) const { return value_.EraseMember(name); } +#if RAPIDJSON_HAS_STDSTRING + bool EraseMember(const std::basic_string<Ch>& name) const { return EraseMember(ValueType(StringRef(name))); } +#endif + template <typename SourceAllocator> bool EraseMember(const GenericValue<EncodingType, SourceAllocator>& name) const { return value_.EraseMember(name); } + +#if RAPIDJSON_HAS_CXX11_RANGE_FOR + MemberIterator begin() const { return value_.MemberBegin(); } + MemberIterator end() const { return value_.MemberEnd(); } +#endif + +private: + GenericObject(); + GenericObject(ValueType& value) : value_(value) {} + ValueType& value_; +}; + +RAPIDJSON_NAMESPACE_END +#ifdef _MINWINDEF_ // see: http://stackoverflow.com/questions/22744262/cant-call-stdmax-because-minwindef-h-defines-max +#ifndef NOMINMAX +#pragma pop_macro("min") +#pragma pop_macro("max") +#endif +#endif +RAPIDJSON_DIAG_POP + +#endif // RAPIDJSON_DOCUMENT_H_ diff --git a/contrib/libs/rapidjson/include/rapidjson/internal/dtoa.h b/contrib/libs/rapidjson/include/rapidjson/internal/dtoa.h new file mode 100644 index 0000000000..bf2e9b2e59 --- /dev/null +++ b/contrib/libs/rapidjson/include/rapidjson/internal/dtoa.h @@ -0,0 +1,245 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +// This is a C++ header-only implementation of Grisu2 algorithm from the publication: +// Loitsch, Florian. "Printing floating-point numbers quickly and accurately with +// integers." ACM Sigplan Notices 45.6 (2010): 233-243. + +#ifndef RAPIDJSON_DTOA_ +#define RAPIDJSON_DTOA_ + +#include "itoa.h" // GetDigitsLut() +#include "diyfp.h" +#include "ieee754.h" + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +#ifdef __GNUC__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(effc++) +RAPIDJSON_DIAG_OFF(array-bounds) // some gcc versions generate wrong warnings https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 +#endif + +inline void GrisuRound(char* buffer, int len, uint64_t delta, uint64_t rest, uint64_t ten_kappa, uint64_t wp_w) { + while (rest < wp_w && delta - rest >= ten_kappa && + (rest + ten_kappa < wp_w || /// closer + wp_w - rest > rest + ten_kappa - wp_w)) { + buffer[len - 1]--; + rest += ten_kappa; + } +} + +inline int CountDecimalDigit32(uint32_t n) { + // Simple pure C++ implementation was faster than __builtin_clz version in this situation. 
+ if (n < 10) return 1; + if (n < 100) return 2; + if (n < 1000) return 3; + if (n < 10000) return 4; + if (n < 100000) return 5; + if (n < 1000000) return 6; + if (n < 10000000) return 7; + if (n < 100000000) return 8; + // Will not reach 10 digits in DigitGen() + //if (n < 1000000000) return 9; + //return 10; + return 9; +} + +inline void DigitGen(const DiyFp& W, const DiyFp& Mp, uint64_t delta, char* buffer, int* len, int* K) { + static const uint32_t kPow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 }; + const DiyFp one(uint64_t(1) << -Mp.e, Mp.e); + const DiyFp wp_w = Mp - W; + uint32_t p1 = static_cast<uint32_t>(Mp.f >> -one.e); + uint64_t p2 = Mp.f & (one.f - 1); + int kappa = CountDecimalDigit32(p1); // kappa in [0, 9] + *len = 0; + + while (kappa > 0) { + uint32_t d = 0; + switch (kappa) { + case 9: d = p1 / 100000000; p1 %= 100000000; break; + case 8: d = p1 / 10000000; p1 %= 10000000; break; + case 7: d = p1 / 1000000; p1 %= 1000000; break; + case 6: d = p1 / 100000; p1 %= 100000; break; + case 5: d = p1 / 10000; p1 %= 10000; break; + case 4: d = p1 / 1000; p1 %= 1000; break; + case 3: d = p1 / 100; p1 %= 100; break; + case 2: d = p1 / 10; p1 %= 10; break; + case 1: d = p1; p1 = 0; break; + default:; + } + if (d || *len) + buffer[(*len)++] = static_cast<char>('0' + static_cast<char>(d)); + kappa--; + uint64_t tmp = (static_cast<uint64_t>(p1) << -one.e) + p2; + if (tmp <= delta) { + *K += kappa; + GrisuRound(buffer, *len, delta, tmp, static_cast<uint64_t>(kPow10[kappa]) << -one.e, wp_w.f); + return; + } + } + + // kappa = 0 + for (;;) { + p2 *= 10; + delta *= 10; + char d = static_cast<char>(p2 >> -one.e); + if (d || *len) + buffer[(*len)++] = static_cast<char>('0' + d); + p2 &= one.f - 1; + kappa--; + if (p2 < delta) { + *K += kappa; + int index = -kappa; + GrisuRound(buffer, *len, delta, p2, one.f, wp_w.f * (index < 9 ? 
kPow10[index] : 0)); + return; + } + } +} + +inline void Grisu2(double value, char* buffer, int* length, int* K) { + const DiyFp v(value); + DiyFp w_m, w_p; + v.NormalizedBoundaries(&w_m, &w_p); + + const DiyFp c_mk = GetCachedPower(w_p.e, K); + const DiyFp W = v.Normalize() * c_mk; + DiyFp Wp = w_p * c_mk; + DiyFp Wm = w_m * c_mk; + Wm.f++; + Wp.f--; + DigitGen(W, Wp, Wp.f - Wm.f, buffer, length, K); +} + +inline char* WriteExponent(int K, char* buffer) { + if (K < 0) { + *buffer++ = '-'; + K = -K; + } + + if (K >= 100) { + *buffer++ = static_cast<char>('0' + static_cast<char>(K / 100)); + K %= 100; + const char* d = GetDigitsLut() + K * 2; + *buffer++ = d[0]; + *buffer++ = d[1]; + } + else if (K >= 10) { + const char* d = GetDigitsLut() + K * 2; + *buffer++ = d[0]; + *buffer++ = d[1]; + } + else + *buffer++ = static_cast<char>('0' + static_cast<char>(K)); + + return buffer; +} + +inline char* Prettify(char* buffer, int length, int k, int maxDecimalPlaces) { + const int kk = length + k; // 10^(kk-1) <= v < 10^kk + + if (0 <= k && kk <= 21) { + // 1234e7 -> 12340000000 + for (int i = length; i < kk; i++) + buffer[i] = '0'; + buffer[kk] = '.'; + buffer[kk + 1] = '0'; + return &buffer[kk + 2]; + } + else if (0 < kk && kk <= 21) { + // 1234e-2 -> 12.34 + std::memmove(&buffer[kk + 1], &buffer[kk], static_cast<size_t>(length - kk)); + buffer[kk] = '.'; + if (0 > k + maxDecimalPlaces) { + // When maxDecimalPlaces = 2, 1.2345 -> 1.23, 1.102 -> 1.1 + // Remove extra trailing zeros (at least one) after truncation. 
+ for (int i = kk + maxDecimalPlaces; i > kk + 1; i--) + if (buffer[i] != '0') + return &buffer[i + 1]; + return &buffer[kk + 2]; // Reserve one zero + } + else + return &buffer[length + 1]; + } + else if (-6 < kk && kk <= 0) { + // 1234e-6 -> 0.001234 + const int offset = 2 - kk; + std::memmove(&buffer[offset], &buffer[0], static_cast<size_t>(length)); + buffer[0] = '0'; + buffer[1] = '.'; + for (int i = 2; i < offset; i++) + buffer[i] = '0'; + if (length - kk > maxDecimalPlaces) { + // When maxDecimalPlaces = 2, 0.123 -> 0.12, 0.102 -> 0.1 + // Remove extra trailing zeros (at least one) after truncation. + for (int i = maxDecimalPlaces + 1; i > 2; i--) + if (buffer[i] != '0') + return &buffer[i + 1]; + return &buffer[3]; // Reserve one zero + } + else + return &buffer[length + offset]; + } + else if (kk < -maxDecimalPlaces) { + // Truncate to zero + buffer[0] = '0'; + buffer[1] = '.'; + buffer[2] = '0'; + return &buffer[3]; + } + else if (length == 1) { + // 1e30 + buffer[1] = 'e'; + return WriteExponent(kk - 1, &buffer[2]); + } + else { + // 1234e30 -> 1.234e33 + std::memmove(&buffer[2], &buffer[1], static_cast<size_t>(length - 1)); + buffer[1] = '.'; + buffer[length + 1] = 'e'; + return WriteExponent(kk - 1, &buffer[0 + length + 2]); + } +} + +inline char* dtoa(double value, char* buffer, int maxDecimalPlaces = 324) { + RAPIDJSON_ASSERT(maxDecimalPlaces >= 1); + Double d(value); + if (d.IsZero()) { + if (d.Sign()) + *buffer++ = '-'; // -0.0, Issue #289 + buffer[0] = '0'; + buffer[1] = '.'; + buffer[2] = '0'; + return &buffer[3]; + } + else { + if (value < 0) { + *buffer++ = '-'; + value = -value; + } + int length, K; + Grisu2(value, buffer, &length, &K); + return Prettify(buffer, length, K, maxDecimalPlaces); + } +} + +#ifdef __GNUC__ +RAPIDJSON_DIAG_POP +#endif + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_DTOA_ diff --git a/contrib/libs/rapidjson/include/rapidjson/internal/itoa.h 
b/contrib/libs/rapidjson/include/rapidjson/internal/itoa.h new file mode 100644 index 0000000000..01a4e7e72d --- /dev/null +++ b/contrib/libs/rapidjson/include/rapidjson/internal/itoa.h @@ -0,0 +1,304 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_ITOA_ +#define RAPIDJSON_ITOA_ + +#include "../rapidjson.h" + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +inline const char* GetDigitsLut() { + static const char cDigitsLut[200] = { + '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9', + '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9', + '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9', + '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9', + '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9', + '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9', + '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9', + '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9', + '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9', + '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9' + }; + return cDigitsLut; +} + 
+inline char* u32toa(uint32_t value, char* buffer) { + const char* cDigitsLut = GetDigitsLut(); + + if (value < 10000) { + const uint32_t d1 = (value / 100) << 1; + const uint32_t d2 = (value % 100) << 1; + + if (value >= 1000) + *buffer++ = cDigitsLut[d1]; + if (value >= 100) + *buffer++ = cDigitsLut[d1 + 1]; + if (value >= 10) + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + } + else if (value < 100000000) { + // value = bbbbcccc + const uint32_t b = value / 10000; + const uint32_t c = value % 10000; + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + if (value >= 10000000) + *buffer++ = cDigitsLut[d1]; + if (value >= 1000000) + *buffer++ = cDigitsLut[d1 + 1]; + if (value >= 100000) + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + + *buffer++ = cDigitsLut[d3]; + *buffer++ = cDigitsLut[d3 + 1]; + *buffer++ = cDigitsLut[d4]; + *buffer++ = cDigitsLut[d4 + 1]; + } + else { + // value = aabbbbcccc in decimal + + const uint32_t a = value / 100000000; // 1 to 42 + value %= 100000000; + + if (a >= 10) { + const unsigned i = a << 1; + *buffer++ = cDigitsLut[i]; + *buffer++ = cDigitsLut[i + 1]; + } + else + *buffer++ = static_cast<char>('0' + static_cast<char>(a)); + + const uint32_t b = value / 10000; // 0 to 9999 + const uint32_t c = value % 10000; // 0 to 9999 + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + *buffer++ = cDigitsLut[d1]; + *buffer++ = cDigitsLut[d1 + 1]; + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + *buffer++ = cDigitsLut[d3]; + *buffer++ = cDigitsLut[d3 + 1]; + *buffer++ = cDigitsLut[d4]; + *buffer++ = cDigitsLut[d4 + 1]; + } + return buffer; +} + +inline char* i32toa(int32_t value, char* buffer) { + uint32_t u = static_cast<uint32_t>(value); + if (value < 0) { + *buffer++ = '-'; + u = 
~u + 1; + } + + return u32toa(u, buffer); +} + +inline char* u64toa(uint64_t value, char* buffer) { + const char* cDigitsLut = GetDigitsLut(); + const uint64_t kTen8 = 100000000; + const uint64_t kTen9 = kTen8 * 10; + const uint64_t kTen10 = kTen8 * 100; + const uint64_t kTen11 = kTen8 * 1000; + const uint64_t kTen12 = kTen8 * 10000; + const uint64_t kTen13 = kTen8 * 100000; + const uint64_t kTen14 = kTen8 * 1000000; + const uint64_t kTen15 = kTen8 * 10000000; + const uint64_t kTen16 = kTen8 * kTen8; + + if (value < kTen8) { + uint32_t v = static_cast<uint32_t>(value); + if (v < 10000) { + const uint32_t d1 = (v / 100) << 1; + const uint32_t d2 = (v % 100) << 1; + + if (v >= 1000) + *buffer++ = cDigitsLut[d1]; + if (v >= 100) + *buffer++ = cDigitsLut[d1 + 1]; + if (v >= 10) + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + } + else { + // value = bbbbcccc + const uint32_t b = v / 10000; + const uint32_t c = v % 10000; + + const uint32_t d1 = (b / 100) << 1; + const uint32_t d2 = (b % 100) << 1; + + const uint32_t d3 = (c / 100) << 1; + const uint32_t d4 = (c % 100) << 1; + + if (value >= 10000000) + *buffer++ = cDigitsLut[d1]; + if (value >= 1000000) + *buffer++ = cDigitsLut[d1 + 1]; + if (value >= 100000) + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + + *buffer++ = cDigitsLut[d3]; + *buffer++ = cDigitsLut[d3 + 1]; + *buffer++ = cDigitsLut[d4]; + *buffer++ = cDigitsLut[d4 + 1]; + } + } + else if (value < kTen16) { + const uint32_t v0 = static_cast<uint32_t>(value / kTen8); + const uint32_t v1 = static_cast<uint32_t>(value % kTen8); + + const uint32_t b0 = v0 / 10000; + const uint32_t c0 = v0 % 10000; + + const uint32_t d1 = (b0 / 100) << 1; + const uint32_t d2 = (b0 % 100) << 1; + + const uint32_t d3 = (c0 / 100) << 1; + const uint32_t d4 = (c0 % 100) << 1; + + const uint32_t b1 = v1 / 10000; + const uint32_t c1 = v1 % 10000; + + const uint32_t d5 = (b1 / 100) << 1; + const uint32_t d6 = (b1 % 100) << 1; + + const uint32_t d7 = 
(c1 / 100) << 1; + const uint32_t d8 = (c1 % 100) << 1; + + if (value >= kTen15) + *buffer++ = cDigitsLut[d1]; + if (value >= kTen14) + *buffer++ = cDigitsLut[d1 + 1]; + if (value >= kTen13) + *buffer++ = cDigitsLut[d2]; + if (value >= kTen12) + *buffer++ = cDigitsLut[d2 + 1]; + if (value >= kTen11) + *buffer++ = cDigitsLut[d3]; + if (value >= kTen10) + *buffer++ = cDigitsLut[d3 + 1]; + if (value >= kTen9) + *buffer++ = cDigitsLut[d4]; + if (value >= kTen8) + *buffer++ = cDigitsLut[d4 + 1]; + + *buffer++ = cDigitsLut[d5]; + *buffer++ = cDigitsLut[d5 + 1]; + *buffer++ = cDigitsLut[d6]; + *buffer++ = cDigitsLut[d6 + 1]; + *buffer++ = cDigitsLut[d7]; + *buffer++ = cDigitsLut[d7 + 1]; + *buffer++ = cDigitsLut[d8]; + *buffer++ = cDigitsLut[d8 + 1]; + } + else { + const uint32_t a = static_cast<uint32_t>(value / kTen16); // 1 to 1844 + value %= kTen16; + + if (a < 10) + *buffer++ = static_cast<char>('0' + static_cast<char>(a)); + else if (a < 100) { + const uint32_t i = a << 1; + *buffer++ = cDigitsLut[i]; + *buffer++ = cDigitsLut[i + 1]; + } + else if (a < 1000) { + *buffer++ = static_cast<char>('0' + static_cast<char>(a / 100)); + + const uint32_t i = (a % 100) << 1; + *buffer++ = cDigitsLut[i]; + *buffer++ = cDigitsLut[i + 1]; + } + else { + const uint32_t i = (a / 100) << 1; + const uint32_t j = (a % 100) << 1; + *buffer++ = cDigitsLut[i]; + *buffer++ = cDigitsLut[i + 1]; + *buffer++ = cDigitsLut[j]; + *buffer++ = cDigitsLut[j + 1]; + } + + const uint32_t v0 = static_cast<uint32_t>(value / kTen8); + const uint32_t v1 = static_cast<uint32_t>(value % kTen8); + + const uint32_t b0 = v0 / 10000; + const uint32_t c0 = v0 % 10000; + + const uint32_t d1 = (b0 / 100) << 1; + const uint32_t d2 = (b0 % 100) << 1; + + const uint32_t d3 = (c0 / 100) << 1; + const uint32_t d4 = (c0 % 100) << 1; + + const uint32_t b1 = v1 / 10000; + const uint32_t c1 = v1 % 10000; + + const uint32_t d5 = (b1 / 100) << 1; + const uint32_t d6 = (b1 % 100) << 1; + + const uint32_t d7 = (c1 / 100) << 
1; + const uint32_t d8 = (c1 % 100) << 1; + + *buffer++ = cDigitsLut[d1]; + *buffer++ = cDigitsLut[d1 + 1]; + *buffer++ = cDigitsLut[d2]; + *buffer++ = cDigitsLut[d2 + 1]; + *buffer++ = cDigitsLut[d3]; + *buffer++ = cDigitsLut[d3 + 1]; + *buffer++ = cDigitsLut[d4]; + *buffer++ = cDigitsLut[d4 + 1]; + *buffer++ = cDigitsLut[d5]; + *buffer++ = cDigitsLut[d5 + 1]; + *buffer++ = cDigitsLut[d6]; + *buffer++ = cDigitsLut[d6 + 1]; + *buffer++ = cDigitsLut[d7]; + *buffer++ = cDigitsLut[d7 + 1]; + *buffer++ = cDigitsLut[d8]; + *buffer++ = cDigitsLut[d8 + 1]; + } + + return buffer; +} + +inline char* i64toa(int64_t value, char* buffer) { + uint64_t u = static_cast<uint64_t>(value); + if (value < 0) { + *buffer++ = '-'; + u = ~u + 1; + } + + return u64toa(u, buffer); +} + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_ITOA_ diff --git a/contrib/libs/rapidjson/include/rapidjson/internal/strfunc.h b/contrib/libs/rapidjson/include/rapidjson/internal/strfunc.h new file mode 100644 index 0000000000..226439a767 --- /dev/null +++ b/contrib/libs/rapidjson/include/rapidjson/internal/strfunc.h @@ -0,0 +1,69 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef RAPIDJSON_INTERNAL_STRFUNC_H_ +#define RAPIDJSON_INTERNAL_STRFUNC_H_ + +#include "../stream.h" +#include <cwchar> + +RAPIDJSON_NAMESPACE_BEGIN +namespace internal { + +//! Custom strlen() which works on different character types. +/*! \tparam Ch Character type (e.g. char, wchar_t, short) + \param s Null-terminated input string. + \return Number of characters in the string. + \note This has the same semantics as strlen(), the return value is not number of Unicode codepoints. +*/ +template <typename Ch> +inline SizeType StrLen(const Ch* s) { + RAPIDJSON_ASSERT(s != 0); + const Ch* p = s; + while (*p) ++p; + return SizeType(p - s); +} + +template <> +inline SizeType StrLen(const char* s) { + return SizeType(std::strlen(s)); +} + +template <> +inline SizeType StrLen(const wchar_t* s) { + return SizeType(std::wcslen(s)); +} + +//! Returns number of code points in a encoded string. +template<typename Encoding> +bool CountStringCodePoint(const typename Encoding::Ch* s, SizeType length, SizeType* outCount) { + RAPIDJSON_ASSERT(s != 0); + RAPIDJSON_ASSERT(outCount != 0); + GenericStringStream<Encoding> is(s); + const typename Encoding::Ch* end = s + length; + SizeType count = 0; + while (is.src_ < end) { + unsigned codepoint; + if (!Encoding::Decode(is, &codepoint)) + return false; + count++; + } + *outCount = count; + return true; +} + +} // namespace internal +RAPIDJSON_NAMESPACE_END + +#endif // RAPIDJSON_INTERNAL_STRFUNC_H_ diff --git a/contrib/libs/rapidjson/include/rapidjson/stringbuffer.h b/contrib/libs/rapidjson/include/rapidjson/stringbuffer.h new file mode 100644 index 0000000000..4e38b82c3d --- /dev/null +++ b/contrib/libs/rapidjson/include/rapidjson/stringbuffer.h @@ -0,0 +1,121 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. 
+// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef RAPIDJSON_STRINGBUFFER_H_ +#define RAPIDJSON_STRINGBUFFER_H_ + +#include "stream.h" +#include "internal/stack.h" + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS +#include <utility> // std::move +#endif + +#include "internal/stack.h" + +#if defined(__clang__) +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(c++98-compat) +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +//! Represents an in-memory output stream. +/*! + \tparam Encoding Encoding of the stream. + \tparam Allocator type for allocating memory buffer. + \note implements Stream concept +*/ +template <typename Encoding, typename Allocator = CrtAllocator> +class GenericStringBuffer { +public: + typedef typename Encoding::Ch Ch; + + GenericStringBuffer(Allocator* allocator = 0, size_t capacity = kDefaultCapacity) : stack_(allocator, capacity) {} + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + GenericStringBuffer(GenericStringBuffer&& rhs) : stack_(std::move(rhs.stack_)) {} + GenericStringBuffer& operator=(GenericStringBuffer&& rhs) { + if (&rhs != this) + stack_ = std::move(rhs.stack_); + return *this; + } +#endif + + void Put(Ch c) { *stack_.template Push<Ch>() = c; } + void PutUnsafe(Ch c) { *stack_.template PushUnsafe<Ch>() = c; } + void Flush() {} + + void Clear() { stack_.Clear(); } + void ShrinkToFit() { + // Push and pop a null terminator. This is safe. 
+ *stack_.template Push<Ch>() = '\0'; + stack_.ShrinkToFit(); + stack_.template Pop<Ch>(1); + } + + void Reserve(size_t count) { stack_.template Reserve<Ch>(count); } + Ch* Push(size_t count) { return stack_.template Push<Ch>(count); } + Ch* PushUnsafe(size_t count) { return stack_.template PushUnsafe<Ch>(count); } + void Pop(size_t count) { stack_.template Pop<Ch>(count); } + + const Ch* GetString() const { + // Push and pop a null terminator. This is safe. + *stack_.template Push<Ch>() = '\0'; + stack_.template Pop<Ch>(1); + + return stack_.template Bottom<Ch>(); + } + + //! Get the size of string in bytes in the string buffer. + size_t GetSize() const { return stack_.GetSize(); } + + //! Get the length of string in Ch in the string buffer. + size_t GetLength() const { return stack_.GetSize() / sizeof(Ch); } + + static const size_t kDefaultCapacity = 256; + mutable internal::Stack<Allocator> stack_; + +private: + // Prohibit copy constructor & assignment operator. + GenericStringBuffer(const GenericStringBuffer&); + GenericStringBuffer& operator=(const GenericStringBuffer&); +}; + +//! String buffer with UTF8 encoding +typedef GenericStringBuffer<UTF8<> > StringBuffer; + +template<typename Encoding, typename Allocator> +inline void PutReserve(GenericStringBuffer<Encoding, Allocator>& stream, size_t count) { + stream.Reserve(count); +} + +template<typename Encoding, typename Allocator> +inline void PutUnsafe(GenericStringBuffer<Encoding, Allocator>& stream, typename Encoding::Ch c) { + stream.PutUnsafe(c); +} + +//! Implement specialized version of PutN() with memset() for better performance. 
+template<> +inline void PutN(GenericStringBuffer<UTF8<> >& stream, char c, size_t n) { + std::memset(stream.stack_.Push<char>(n), c, n * sizeof(c)); +} + +RAPIDJSON_NAMESPACE_END + +#if defined(__clang__) +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_STRINGBUFFER_H_ diff --git a/contrib/libs/rapidjson/include/rapidjson/writer.h b/contrib/libs/rapidjson/include/rapidjson/writer.h new file mode 100644 index 0000000000..68e14d9fd2 --- /dev/null +++ b/contrib/libs/rapidjson/include/rapidjson/writer.h @@ -0,0 +1,640 @@ +// Tencent is pleased to support the open source community by making RapidJSON available. +// +// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved. +// +// Licensed under the MIT License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://opensource.org/licenses/MIT +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef RAPIDJSON_WRITER_H_ +#define RAPIDJSON_WRITER_H_ + +#include "stream.h" +#include "internal/meta.h" +#include "internal/stack.h" +#include "internal/strfunc.h" +#include "internal/dtoa.h" +#include "internal/itoa.h" +#include "stringbuffer.h" +#include <new> // placement new + +#if defined(RAPIDJSON_SIMD) && defined(_MSC_VER) +#include <intrin.h> +#pragma intrinsic(_BitScanForward) +#endif +#ifdef RAPIDJSON_SSE42 +#include <nmmintrin.h> +#elif defined(RAPIDJSON_SSE2) +#include <emmintrin.h> +#endif + +#ifdef _MSC_VER +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(4127) // conditional expression is constant +#endif + +#ifdef __clang__ +RAPIDJSON_DIAG_PUSH +RAPIDJSON_DIAG_OFF(padded) +RAPIDJSON_DIAG_OFF(unreachable-code) +RAPIDJSON_DIAG_OFF(c++98-compat) +#endif + +RAPIDJSON_NAMESPACE_BEGIN + +/////////////////////////////////////////////////////////////////////////////// +// WriteFlag + +/*! \def RAPIDJSON_WRITE_DEFAULT_FLAGS + \ingroup RAPIDJSON_CONFIG + \brief User-defined kWriteDefaultFlags definition. + + User can define this as any \c WriteFlag combinations. +*/ +#ifndef RAPIDJSON_WRITE_DEFAULT_FLAGS +#define RAPIDJSON_WRITE_DEFAULT_FLAGS kWriteNoFlags +#endif + +//! Combination of writeFlags +enum WriteFlag { + kWriteNoFlags = 0, //!< No flags are set. + kWriteValidateEncodingFlag = 1, //!< Validate encoding of JSON strings. + kWriteNanAndInfFlag = 2, //!< Allow writing of Infinity, -Infinity and NaN. + kWriteNoEscapeSlashFlag = 4, //!< Disable escaping of '/'. + kWriteDefaultFlags = RAPIDJSON_WRITE_DEFAULT_FLAGS //!< Default write flags. Can be customized by defining RAPIDJSON_WRITE_DEFAULT_FLAGS +}; + +//! JSON writer +/*! Writer implements the concept Handler. + It generates JSON text by events to an output os. + + User may programmatically calls the functions of a writer to generate JSON text. + + On the other side, a writer can also be passed to objects that generates events, + + for example Reader::Parse() and Document::Accept(). 
+ + \tparam OutputStream Type of output stream. + \tparam SourceEncoding Encoding of source string. + \tparam TargetEncoding Encoding of output stream. + \tparam StackAllocator Type of allocator for allocating memory of stack. + \note implements Handler concept +*/ +template<typename OutputStream, typename SourceEncoding = UTF8<>, typename TargetEncoding = UTF8<>, typename StackAllocator = CrtAllocator, unsigned writeFlags = kWriteDefaultFlags> +class Writer { +public: + typedef typename SourceEncoding::Ch Ch; + + static const int kDefaultMaxDecimalPlaces = 324; + + //! Constructor + /*! \param os Output stream. + \param stackAllocator User supplied allocator. If it is null, it will create a private one. + \param levelDepth Initial capacity of stack. + */ + explicit + Writer(OutputStream& os, StackAllocator* stackAllocator = 0, size_t levelDepth = kDefaultLevelDepth) : + os_(&os), level_stack_(stackAllocator, levelDepth * sizeof(Level)), maxDecimalPlaces_(kDefaultMaxDecimalPlaces), hasRoot_(false) {} + + explicit + Writer(StackAllocator* allocator = 0, size_t levelDepth = kDefaultLevelDepth) : + os_(0), level_stack_(allocator, levelDepth * sizeof(Level)), maxDecimalPlaces_(kDefaultMaxDecimalPlaces), hasRoot_(false) {} + +#if RAPIDJSON_HAS_CXX11_RVALUE_REFS + Writer(Writer&& rhs) : + os_(rhs.os_), level_stack_(std::move(rhs.level_stack_)), maxDecimalPlaces_(rhs.maxDecimalPlaces_), hasRoot_(rhs.hasRoot_) { + rhs.os_ = 0; + } +#endif + + //! Reset the writer with a new stream. + /*! + This function reset the writer with a new stream and default settings, + in order to make a Writer object reusable for output multiple JSONs. + + \param os New output stream. + \code + Writer<OutputStream> writer(os1); + writer.StartObject(); + // ... + writer.EndObject(); + + writer.Reset(os2); + writer.StartObject(); + // ... + writer.EndObject(); + \endcode + */ + void Reset(OutputStream& os) { + os_ = &os; + hasRoot_ = false; + level_stack_.Clear(); + } + + //! 
Checks whether the output is a complete JSON. + /*! + A complete JSON has a complete root object or array. + */ + bool IsComplete() const { + return hasRoot_ && level_stack_.Empty(); + } + + int GetMaxDecimalPlaces() const { + return maxDecimalPlaces_; + } + + //! Sets the maximum number of decimal places for double output. + /*! + This setting truncates the output with specified number of decimal places. + + For example, + + \code + writer.SetMaxDecimalPlaces(3); + writer.StartArray(); + writer.Double(0.12345); // "0.123" + writer.Double(0.0001); // "0.0" + writer.Double(1.234567890123456e30); // "1.234567890123456e30" (do not truncate significand for positive exponent) + writer.Double(1.23e-4); // "0.0" (do truncate significand for negative exponent) + writer.EndArray(); + \endcode + + The default setting does not truncate any decimal places. You can restore to this setting by calling + \code + writer.SetMaxDecimalPlaces(Writer::kDefaultMaxDecimalPlaces); + \endcode + */ + void SetMaxDecimalPlaces(int maxDecimalPlaces) { + maxDecimalPlaces_ = maxDecimalPlaces; + } + + /*!@name Implementation of Handler + \see Handler + */ + //@{ + + bool Null() { Prefix(kNullType); return EndValue(WriteNull()); } + bool Bool(bool b) { Prefix(b ? kTrueType : kFalseType); return EndValue(WriteBool(b)); } + bool Int(int i) { Prefix(kNumberType); return EndValue(WriteInt(i)); } + bool Uint(unsigned u) { Prefix(kNumberType); return EndValue(WriteUint(u)); } + bool Int64(int64_t i64) { Prefix(kNumberType); return EndValue(WriteInt64(i64)); } + bool Uint64(uint64_t u64) { Prefix(kNumberType); return EndValue(WriteUint64(u64)); } + + //! Writes the given \c double value to the stream + /*! + \param d The value to be written. + \return Whether it is succeed. 
+ */ + bool Double(double d) { Prefix(kNumberType); return EndValue(WriteDouble(d)); } + + bool RawNumber(const Ch* str, SizeType length, bool copy = false) { + RAPIDJSON_ASSERT(str != 0); + (void)copy; + Prefix(kNumberType); + return EndValue(WriteString(str, length)); + } + + bool String(const Ch* str, SizeType length, bool copy = false) { + RAPIDJSON_ASSERT(str != 0); + (void)copy; + Prefix(kStringType); + return EndValue(WriteString(str, length)); + } + +#if RAPIDJSON_HAS_STDSTRING + bool String(const std::basic_string<Ch>& str) { + return String(str.data(), SizeType(str.size())); + } +#endif + + bool StartObject() { + Prefix(kObjectType); + new (level_stack_.template Push<Level>()) Level(false); + return WriteStartObject(); + } + + bool Key(const Ch* str, SizeType length, bool copy = false) { return String(str, length, copy); } + + bool EndObject(SizeType memberCount = 0) { + (void)memberCount; + RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level)); // not inside an Object + RAPIDJSON_ASSERT(!level_stack_.template Top<Level>()->inArray); // currently inside an Array, not Object + RAPIDJSON_ASSERT(0 == level_stack_.template Top<Level>()->valueCount % 2); // Object has a Key without a Value + level_stack_.template Pop<Level>(1); + return EndValue(WriteEndObject()); + } + + bool StartArray() { + Prefix(kArrayType); + new (level_stack_.template Push<Level>()) Level(true); + return WriteStartArray(); + } + + bool EndArray(SizeType elementCount = 0) { + (void)elementCount; + RAPIDJSON_ASSERT(level_stack_.GetSize() >= sizeof(Level)); + RAPIDJSON_ASSERT(level_stack_.template Top<Level>()->inArray); + level_stack_.template Pop<Level>(1); + return EndValue(WriteEndArray()); + } + //@} + + /*! @name Convenience extensions */ + //@{ + + //! Simpler but slower overload. + bool String(const Ch* const& str) { return String(str, internal::StrLen(str)); } + bool Key(const Ch* const& str) { return Key(str, internal::StrLen(str)); } + + //@} + + //! Write a raw JSON value. 
+ /*! + For user to write a stringified JSON as a value. + + \param json A well-formed JSON value. It should not contain null character within [0, length - 1] range. + \param length Length of the json. + \param type Type of the root of json. + */ + bool RawValue(const Ch* json, size_t length, Type type) { + RAPIDJSON_ASSERT(json != 0); + Prefix(type); + return EndValue(WriteRawValue(json, length)); + } + + //! Flush the output stream. + /*! + Allows the user to flush the output stream immediately. + */ + void Flush() { + os_->Flush(); + } + +protected: + //! Information for each nested level + struct Level { + Level(bool inArray_) : valueCount(0), inArray(inArray_) {} + size_t valueCount; //!< number of values in this level + bool inArray; //!< true if in array, otherwise in object + }; + + static const size_t kDefaultLevelDepth = 32; + + bool WriteNull() { + PutReserve(*os_, 4); + PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 'l'); return true; + } + + bool WriteBool(bool b) { + if (b) { + PutReserve(*os_, 4); + PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'r'); PutUnsafe(*os_, 'u'); PutUnsafe(*os_, 'e'); + } + else { + PutReserve(*os_, 5); + PutUnsafe(*os_, 'f'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'l'); PutUnsafe(*os_, 's'); PutUnsafe(*os_, 'e'); + } + return true; + } + + bool WriteInt(int i) { + char buffer[11]; + const char* end = internal::i32toa(i, buffer); + PutReserve(*os_, static_cast<size_t>(end - buffer)); + for (const char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p)); + return true; + } + + bool WriteUint(unsigned u) { + char buffer[10]; + const char* end = internal::u32toa(u, buffer); + PutReserve(*os_, static_cast<size_t>(end - buffer)); + for (const char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p)); + return true; + } + + bool WriteInt64(int64_t i64) { + char buffer[21]; + const char* end = internal::i64toa(i64, buffer); + 
PutReserve(*os_, static_cast<size_t>(end - buffer)); + for (const char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p)); + return true; + } + + bool WriteUint64(uint64_t u64) { + char buffer[20]; + char* end = internal::u64toa(u64, buffer); + PutReserve(*os_, static_cast<size_t>(end - buffer)); + for (char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p)); + return true; + } + + bool WriteDouble(double d) { + if (internal::Double(d).IsNanOrInf()) { + if (!(writeFlags & kWriteNanAndInfFlag)) + return false; + if (internal::Double(d).IsNan()) { + PutReserve(*os_, 3); + PutUnsafe(*os_, 'N'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'N'); + return true; + } + if (internal::Double(d).Sign()) { + PutReserve(*os_, 9); + PutUnsafe(*os_, '-'); + } + else + PutReserve(*os_, 8); + PutUnsafe(*os_, 'I'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'f'); + PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'y'); + return true; + } + + char buffer[25]; + char* end = internal::dtoa(d, buffer, maxDecimalPlaces_); + PutReserve(*os_, static_cast<size_t>(end - buffer)); + for (char* p = buffer; p != end; ++p) + PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(*p)); + return true; + } + + bool WriteString(const Ch* str, SizeType length) { + static const typename OutputStream::Ch hexDigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; + static const char escape[256] = { +#define Z16 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + //0 1 2 3 4 5 6 7 8 9 A B C D E F + 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'b', 't', 'n', 'u', 'f', 'r', 'u', 'u', // 00 + 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', // 10 + 0, 0, '"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20 + Z16, Z16, // 30~4F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,'\\', 0, 0, 0, // 50 + Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16, Z16 // 60~FF 
+#undef Z16 + }; + + if (TargetEncoding::supportUnicode) + PutReserve(*os_, 2 + length * 6); // "\uxxxx..." + else + PutReserve(*os_, 2 + length * 12); // "\uxxxx\uyyyy..." + + PutUnsafe(*os_, '\"'); + GenericStringStream<SourceEncoding> is(str); + while (ScanWriteUnescapedString(is, length)) { + const Ch c = is.Peek(); + if (!TargetEncoding::supportUnicode && static_cast<unsigned>(c) >= 0x80) { + // Unicode escaping + unsigned codepoint; + if (RAPIDJSON_UNLIKELY(!SourceEncoding::Decode(is, &codepoint))) + return false; + PutUnsafe(*os_, '\\'); + PutUnsafe(*os_, 'u'); + if (codepoint <= 0xD7FF || (codepoint >= 0xE000 && codepoint <= 0xFFFF)) { + PutUnsafe(*os_, hexDigits[(codepoint >> 12) & 15]); + PutUnsafe(*os_, hexDigits[(codepoint >> 8) & 15]); + PutUnsafe(*os_, hexDigits[(codepoint >> 4) & 15]); + PutUnsafe(*os_, hexDigits[(codepoint ) & 15]); + } + else { + RAPIDJSON_ASSERT(codepoint >= 0x010000 && codepoint <= 0x10FFFF); + // Surrogate pair + unsigned s = codepoint - 0x010000; + unsigned lead = (s >> 10) + 0xD800; + unsigned trail = (s & 0x3FF) + 0xDC00; + PutUnsafe(*os_, hexDigits[(lead >> 12) & 15]); + PutUnsafe(*os_, hexDigits[(lead >> 8) & 15]); + PutUnsafe(*os_, hexDigits[(lead >> 4) & 15]); + PutUnsafe(*os_, hexDigits[(lead ) & 15]); + PutUnsafe(*os_, '\\'); + PutUnsafe(*os_, 'u'); + PutUnsafe(*os_, hexDigits[(trail >> 12) & 15]); + PutUnsafe(*os_, hexDigits[(trail >> 8) & 15]); + PutUnsafe(*os_, hexDigits[(trail >> 4) & 15]); + PutUnsafe(*os_, hexDigits[(trail ) & 15]); + } + } + else if ((sizeof(Ch) == 1 || static_cast<unsigned>(c) < 256) && RAPIDJSON_UNLIKELY(escape[static_cast<unsigned char>(c)])) { + is.Take(); + PutUnsafe(*os_, '\\'); + PutUnsafe(*os_, static_cast<typename OutputStream::Ch>(escape[static_cast<unsigned char>(c)])); + if (escape[static_cast<unsigned char>(c)] == 'u') { + PutUnsafe(*os_, '0'); + PutUnsafe(*os_, '0'); + PutUnsafe(*os_, hexDigits[static_cast<unsigned char>(c) >> 4]); + PutUnsafe(*os_, hexDigits[static_cast<unsigned 
char>(c) & 0xF]); + } + } + else if (RAPIDJSON_UNLIKELY(c == '/' && !(writeFlags & kWriteNoEscapeSlashFlag))) { + is.Take(); + PutUnsafe(*os_, '\\'); + PutUnsafe(*os_, '/'); + } + else if (RAPIDJSON_UNLIKELY(!(writeFlags & kWriteValidateEncodingFlag ? + Transcoder<SourceEncoding, TargetEncoding>::Validate(is, *os_) : + Transcoder<SourceEncoding, TargetEncoding>::TranscodeUnsafe(is, *os_)))) + return false; + } + PutUnsafe(*os_, '\"'); + return true; + } + + bool ScanWriteUnescapedString(GenericStringStream<SourceEncoding>& is, size_t length) { + return RAPIDJSON_LIKELY(is.Tell() < length); + } + + bool WriteStartObject() { os_->Put('{'); return true; } + bool WriteEndObject() { os_->Put('}'); return true; } + bool WriteStartArray() { os_->Put('['); return true; } + bool WriteEndArray() { os_->Put(']'); return true; } + + bool WriteRawValue(const Ch* json, size_t length) { + PutReserve(*os_, length); + for (size_t i = 0; i < length; i++) { + RAPIDJSON_ASSERT(json[i] != '\0'); + PutUnsafe(*os_, json[i]); + } + return true; + } + + void Prefix(Type type) { + (void)type; + if (RAPIDJSON_LIKELY(level_stack_.GetSize() != 0)) { // this value is not at root + Level* level = level_stack_.template Top<Level>(); + if (level->valueCount > 0) { + if (level->inArray) + os_->Put(','); // add comma if it is not the first element in array + else // in object + os_->Put((level->valueCount % 2 == 0) ? ',' : ':'); + } + if (!level->inArray && level->valueCount % 2 == 0) + RAPIDJSON_ASSERT(type == kStringType); // if it's in object, then even number should be a name + level->valueCount++; + } + else { + RAPIDJSON_ASSERT(!hasRoot_); // Should only has one and only one root. + hasRoot_ = true; + } + } + + // Flush the value if it is the top level one. 
+ bool EndValue(bool ret) { + if (RAPIDJSON_UNLIKELY(level_stack_.Empty())) // end of json text + Flush(); + return ret; + } + + OutputStream* os_; + internal::Stack<StackAllocator> level_stack_; + int maxDecimalPlaces_; + bool hasRoot_; + +private: + // Prohibit copy constructor & assignment operator. + Writer(const Writer&); + Writer& operator=(const Writer&); +}; + +// Full specialization for StringStream to prevent memory copying + +template<> +inline bool Writer<StringBuffer>::WriteInt(int i) { + char *buffer = os_->Push(11); + const char* end = internal::i32toa(i, buffer); + os_->Pop(static_cast<size_t>(11 - (end - buffer))); + return true; +} + +template<> +inline bool Writer<StringBuffer>::WriteUint(unsigned u) { + char *buffer = os_->Push(10); + const char* end = internal::u32toa(u, buffer); + os_->Pop(static_cast<size_t>(10 - (end - buffer))); + return true; +} + +template<> +inline bool Writer<StringBuffer>::WriteInt64(int64_t i64) { + char *buffer = os_->Push(21); + const char* end = internal::i64toa(i64, buffer); + os_->Pop(static_cast<size_t>(21 - (end - buffer))); + return true; +} + +template<> +inline bool Writer<StringBuffer>::WriteUint64(uint64_t u) { + char *buffer = os_->Push(20); + const char* end = internal::u64toa(u, buffer); + os_->Pop(static_cast<size_t>(20 - (end - buffer))); + return true; +} + +template<> +inline bool Writer<StringBuffer>::WriteDouble(double d) { + if (internal::Double(d).IsNanOrInf()) { + // Note: This code path can only be reached if (RAPIDJSON_WRITE_DEFAULT_FLAGS & kWriteNanAndInfFlag). 
+ if (!(kWriteDefaultFlags & kWriteNanAndInfFlag)) + return false; + if (internal::Double(d).IsNan()) { + PutReserve(*os_, 3); + PutUnsafe(*os_, 'N'); PutUnsafe(*os_, 'a'); PutUnsafe(*os_, 'N'); + return true; + } + if (internal::Double(d).Sign()) { + PutReserve(*os_, 9); + PutUnsafe(*os_, '-'); + } + else + PutReserve(*os_, 8); + PutUnsafe(*os_, 'I'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'f'); + PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 'n'); PutUnsafe(*os_, 'i'); PutUnsafe(*os_, 't'); PutUnsafe(*os_, 'y'); + return true; + } + + char *buffer = os_->Push(25); + char* end = internal::dtoa(d, buffer, maxDecimalPlaces_); + os_->Pop(static_cast<size_t>(25 - (end - buffer))); + return true; +} + +#if defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) +template<> +inline bool Writer<StringBuffer>::ScanWriteUnescapedString(StringStream& is, size_t length) { + if (length < 16) + return RAPIDJSON_LIKELY(is.Tell() < length); + + if (!RAPIDJSON_LIKELY(is.Tell() < length)) + return false; + + const char* p = is.src_; + const char* end = is.head_ + length; + const char* nextAligned = reinterpret_cast<const char*>((reinterpret_cast<size_t>(p) + 15) & static_cast<size_t>(~15)); + const char* endAligned = reinterpret_cast<const char*>(reinterpret_cast<size_t>(end) & static_cast<size_t>(~15)); + if (nextAligned > end) + return true; + + while (p != nextAligned) + if (*p < 0x20 || *p == '\"' || *p == '\\') { + is.src_ = p; + return RAPIDJSON_LIKELY(is.Tell() < length); + } + else + os_->PutUnsafe(*p++); + + // The rest of string using SIMD + static const char dquote[16] = { '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"', '\"' }; + static const char bslash[16] = { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }; + static const char space[16] = { 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F }; + const __m128i dq = 
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&dquote[0])); + const __m128i bs = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&bslash[0])); + const __m128i sp = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&space[0])); + + for (; p != endAligned; p += 16) { + const __m128i s = _mm_load_si128(reinterpret_cast<const __m128i *>(p)); + const __m128i t1 = _mm_cmpeq_epi8(s, dq); + const __m128i t2 = _mm_cmpeq_epi8(s, bs); + const __m128i t3 = _mm_cmpeq_epi8(_mm_max_epu8(s, sp), sp); // s < 0x20 <=> max(s, 0x1F) == 0x1F + const __m128i x = _mm_or_si128(_mm_or_si128(t1, t2), t3); + unsigned short r = static_cast<unsigned short>(_mm_movemask_epi8(x)); + if (RAPIDJSON_UNLIKELY(r != 0)) { // some of characters is escaped + SizeType len; +#ifdef _MSC_VER // Find the index of first escaped + unsigned long offset; + _BitScanForward(&offset, r); + len = offset; +#else + len = static_cast<SizeType>(__builtin_ffs(r) - 1); +#endif + char* q = reinterpret_cast<char*>(os_->PushUnsafe(len)); + for (size_t i = 0; i < len; i++) + q[i] = p[i]; + + p += len; + break; + } + _mm_storeu_si128(reinterpret_cast<__m128i *>(os_->PushUnsafe(16)), s); + } + + is.src_ = p; + return RAPIDJSON_LIKELY(is.Tell() < length); +} +#endif // defined(RAPIDJSON_SSE2) || defined(RAPIDJSON_SSE42) + +RAPIDJSON_NAMESPACE_END + +#ifdef _MSC_VER +RAPIDJSON_DIAG_POP +#endif + +#ifdef __clang__ +RAPIDJSON_DIAG_POP +#endif + +#endif // RAPIDJSON_RAPIDJSON_H_ |