diff options
| author | robot-piglet <[email protected]> | 2025-08-14 11:26:15 +0300 |
|---|---|---|
| committer | robot-piglet <[email protected]> | 2025-08-14 12:06:36 +0300 |
| commit | dc2bf727ea4698fa382f0f8623a8854c4900e212 (patch) | |
| tree | a621e92060fd7560066f33a323b4b8aca34f1e36 /contrib/libs/apache/arrow_next/cpp/src/arrow/filesystem/filesystem.h | |
| parent | 322ee7d149464c6f18d6a330d937227cb022b9f3 (diff) | |
Intermediate changes
commit_hash:746e9b78ab4c78ba4f30511f1fa9330c0d56a406
Diffstat (limited to 'contrib/libs/apache/arrow_next/cpp/src/arrow/filesystem/filesystem.h')
| -rw-r--r-- | contrib/libs/apache/arrow_next/cpp/src/arrow/filesystem/filesystem.h | 724 |
1 files changed, 724 insertions, 0 deletions
diff --git a/contrib/libs/apache/arrow_next/cpp/src/arrow/filesystem/filesystem.h b/contrib/libs/apache/arrow_next/cpp/src/arrow/filesystem/filesystem.h new file mode 100644 index 00000000000..f3b9aaafafc --- /dev/null +++ b/contrib/libs/apache/arrow_next/cpp/src/arrow/filesystem/filesystem.h @@ -0,0 +1,724 @@ +#pragma clang system_header +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <chrono> +#include <cstdint> +#include <functional> +#include <iosfwd> +#include <memory> +#include <string> +#include <utility> +#include <vector> + +#include "arrow/filesystem/type_fwd.h" +#include "arrow/io/interfaces.h" +#include "arrow/type_fwd.h" +#include "arrow/util/compare.h" +#include "arrow/util/macros.h" +#include "arrow/util/type_fwd.h" +#include "arrow/util/visibility.h" +#include "arrow/util/windows_fixup.h" + +namespace arrow20 { +namespace fs { + +using arrow20::util::Uri; + +// A system clock time point expressed as a 64-bit (or more) number of +// nanoseconds since the epoch. 
+using TimePoint = + std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>; + +ARROW_EXPORT std::string ToString(FileType); + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType); + +static const int64_t kNoSize = -1; +static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1)); + +/// \brief FileSystem entry info +struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> { + FileInfo() = default; + FileInfo(FileInfo&&) = default; + FileInfo& operator=(FileInfo&&) = default; + FileInfo(const FileInfo&) = default; + FileInfo& operator=(const FileInfo&) = default; + + explicit FileInfo(std::string path, FileType type = FileType::Unknown) + : path_(std::move(path)), type_(type) {} + + /// The file type + FileType type() const { return type_; } + void set_type(FileType type) { type_ = type; } + + /// The full file path in the filesystem + const std::string& path() const { return path_; } + void set_path(std::string path) { path_ = std::move(path); } + + /// The file base name (component after the last directory separator) + std::string base_name() const; + + // The directory base name (component before the file base name). + std::string dir_name() const; + + /// The size in bytes, if available + /// + /// Only regular files are guaranteed to have a size. 
+ int64_t size() const { return size_; } + void set_size(int64_t size) { size_ = size; } + + /// The file extension (excluding the dot) + std::string extension() const; + + /// The time of last modification, if available + TimePoint mtime() const { return mtime_; } + void set_mtime(TimePoint mtime) { mtime_ = mtime; } + + bool IsFile() const { return type_ == FileType::File; } + bool IsDirectory() const { return type_ == FileType::Directory; } + + bool Equals(const FileInfo& other) const { + return type() == other.type() && path() == other.path() && size() == other.size() && + mtime() == other.mtime(); + } + + std::string ToString() const; + + /// Function object implementing less-than comparison and hashing by + /// path, to support sorting infos, using them as keys, and other + /// interactions with the STL. + struct ByPath { + bool operator()(const FileInfo& l, const FileInfo& r) const { + return l.path() < r.path(); + } + + size_t operator()(const FileInfo& i) const { + return std::hash<std::string>{}(i.path()); + } + }; + + protected: + std::string path_; + FileType type_ = FileType::Unknown; + int64_t size_ = kNoSize; + TimePoint mtime_ = kNoTime; +}; + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&); + +/// \brief File selector for filesystem APIs +struct ARROW_EXPORT FileSelector { + /// The directory in which to select files. + /// If the path exists but doesn't point to a directory, this should be an error. + std::string base_dir; + /// The behavior if `base_dir` isn't found in the filesystem. If false, + /// an error is returned. If true, an empty selection is returned. + bool allow_not_found; + /// Whether to recurse into subdirectories. + bool recursive; + /// The maximum number of subdirectories to recurse into. 
+ int32_t max_recursion; + + FileSelector() : allow_not_found(false), recursive(false), max_recursion(INT32_MAX) {} +}; + +/// \brief FileSystem, path pair +struct ARROW_EXPORT FileLocator { + std::shared_ptr<FileSystem> filesystem; + std::string path; +}; + +using FileInfoVector = std::vector<FileInfo>; +using FileInfoGenerator = std::function<Future<FileInfoVector>()>; + +} // namespace fs + +template <> +struct IterationTraits<fs::FileInfoVector> { + static fs::FileInfoVector End() { return {}; } + static bool IsEnd(const fs::FileInfoVector& val) { return val.empty(); } +}; + +namespace fs { + +/// \brief Abstract file system API +class ARROW_EXPORT FileSystem + /// \cond false + : public std::enable_shared_from_this<FileSystem> +/// \endcond +{ // NOLINT + public: + virtual ~FileSystem(); + + virtual std::string type_name() const = 0; + + /// EXPERIMENTAL: The IOContext associated with this filesystem. + const io::IOContext& io_context() const { return io_context_; } + + /// Normalize path for the given filesystem + /// + /// The default implementation of this method is a no-op, but subclasses + /// may allow normalizing irregular path forms (such as Windows local paths). + virtual Result<std::string> NormalizePath(std::string path); + + /// \brief Ensure a URI (or path) is compatible with the given filesystem and return the + /// path + /// + /// \param uri_string A URI representing a resource in the given filesystem. + /// + /// This method will check to ensure the given filesystem is compatible with the + /// URI. This can be useful when the user provides both a URI and a filesystem or + /// when a user provides multiple URIs that should be compatible with the same + /// filesystem. + /// + /// uri_string can be an absolute path instead of a URI. In that case it will ensure + /// the filesystem (if supplied) is the local filesystem (or some custom filesystem that + /// is capable of reading local paths) and will normalize the path's file separators. 
+ /// + /// Note, this method only checks to ensure the URI scheme is valid. It will not detect + /// inconsistencies like a mismatching region or endpoint override. + /// + /// \return The path inside the filesystem that is indicated by the URI. + virtual Result<std::string> PathFromUri(const std::string& uri_string) const; + + /// \brief Make a URI from which FileSystemFromUri produces an equivalent filesystem + /// \param path The path component to use in the resulting URI + /// \return A URI string, or an error if an equivalent URI cannot be produced + virtual Result<std::string> MakeUri(std::string path) const; + + virtual bool Equals(const FileSystem& other) const = 0; + + virtual bool Equals(const std::shared_ptr<FileSystem>& other) const { + return Equals(*other); + } + + /// Get info for the given target. + /// + /// Any symlink is automatically dereferenced, recursively. + /// A nonexistent or unreachable file returns an Ok status and + /// has a FileType of value NotFound. An error status indicates + /// a truly exceptional condition (low-level I/O error, etc.). + virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0; + /// Same, for many targets at once. + virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>& paths); + /// Same, according to a selector. + /// + /// The selector's base directory will not be part of the results, even if + /// it exists. + /// If it doesn't exist, see `FileSelector::allow_not_found`. + virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0; + + /// Async version of GetFileInfo + virtual Future<FileInfoVector> GetFileInfoAsync(const std::vector<std::string>& paths); + + /// Streaming async version of GetFileInfo + /// + /// The returned generator is not async-reentrant, i.e. you need to wait for + /// the returned future to complete before calling the generator again. 
+ virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select); + + /// Create a directory and subdirectories. + /// + /// This function succeeds if the directory already exists. + virtual Status CreateDir(const std::string& path, bool recursive) = 0; + Status CreateDir(const std::string& path) { return CreateDir(path, true); } + + /// Delete a directory and its contents, recursively. + virtual Status DeleteDir(const std::string& path) = 0; + + /// Delete a directory's contents, recursively. + /// + /// Like DeleteDir, but doesn't delete the directory itself. + /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents. + virtual Status DeleteDirContents(const std::string& path, bool missing_dir_ok) = 0; + Status DeleteDirContents(const std::string& path) { + return DeleteDirContents(path, false); + } + + /// Async version of DeleteDirContents. + virtual Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok); + + /// Async version of DeleteDirContents. + /// + /// This overload allows missing directories. + Future<> DeleteDirContentsAsync(const std::string& path); + + /// EXPERIMENTAL: Delete the root directory's contents, recursively. + /// + /// Implementations may decide to raise an error if this operation is + /// too dangerous. + // NOTE: may decide to remove this if it's deemed not useful + virtual Status DeleteRootDirContents() = 0; + + /// Delete a file. + virtual Status DeleteFile(const std::string& path) = 0; + /// Delete many files. + /// + /// The default implementation issues individual delete operations in sequence. + virtual Status DeleteFiles(const std::vector<std::string>& paths); + + /// Move / rename a file or directory. + /// + /// If the destination exists: + /// - if it is a non-empty directory, an error is returned + /// - otherwise, if it has the same type as the source, it is replaced + /// - otherwise, behavior is unspecified (implementation-dependent). 
+ virtual Status Move(const std::string& src, const std::string& dest) = 0; + + /// Copy a file. + /// + /// If the destination exists and is a directory, an error is returned. + /// Otherwise, it is replaced. + virtual Status CopyFile(const std::string& src, const std::string& dest) = 0; + + /// Open an input stream for sequential reading. + virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream( + const std::string& path) = 0; + + /// Open an input stream for sequential reading. + /// + /// This override assumes the given FileInfo validly represents the file's + /// characteristics, and may optimize access depending on them (for example + /// avoid querying the file size or its existence). + virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info); + + /// Open an input file for random access reading. + virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const std::string& path) = 0; + + /// Open an input file for random access reading. + /// + /// This override assumes the given FileInfo validly represents the file's + /// characteristics, and may optimize access depending on them (for example + /// avoid querying the file size or its existence). + virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const FileInfo& info); + + /// Async version of OpenInputStream + virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const std::string& path); + + /// Async version of OpenInputStream + virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const FileInfo& info); + + /// Async version of OpenInputFile + virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const std::string& path); + + /// Async version of OpenInputFile + virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const FileInfo& info); + + /// Open an output stream for sequential writing. 
+ /// + /// If the target already exists, existing data is truncated. + virtual Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) = 0; + Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(const std::string& path); + + /// Open an output stream for appending. + /// + /// If the target doesn't exist, a new empty file is created. + /// + /// Note: some filesystem implementations do not support efficient appending + /// to an existing file, in which case this method will return NotImplemented. + /// Consider writing to multiple files (using e.g. the dataset layer) instead. + virtual Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) = 0; + Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const std::string& path); + + protected: + explicit FileSystem(io::IOContext io_context = io::default_io_context()) + : io_context_(std::move(io_context)) {} + + io::IOContext io_context_; + // Whether metadata operations (such as GetFileInfo or OpenInputStream) + // are cheap enough that the default async variants don't bother with + // a thread pool. + bool default_async_is_sync_ = true; +}; + +struct FileSystemFactory { + std::function<Result<std::shared_ptr<FileSystem>>( + const Uri& uri, const io::IOContext& io_context, std::string* out_path)> + function; + std::string_view file; + int line; + + bool operator==(const FileSystemFactory& other) const { + // In the case where libarrow is linked statically both to the executable and to a + // dynamically loaded filesystem implementation library, the library contains a + // duplicate definition of the registry and duplicate definitions of any + // FileSystemRegistrars which are statically linked to libarrow. 
When retrieving + // factories from the filesystem implementation library, we use the file and line + // of the registrar's definition to determine equivalence of the duplicate factories. + return file == other.file && line == other.line; + } +}; + +/// \brief A FileSystem implementation that delegates to another +/// implementation after prepending a fixed base path. +/// +/// This is useful to expose a logical view of a subtree of a filesystem, +/// for example a directory in a LocalFileSystem. +/// This works on abstract paths, i.e. paths using forward slashes +/// and a single root "/". Windows paths are not guaranteed to work. +/// This makes no security guarantee. For example, symlinks may allow one to +/// "escape" the subtree and access other parts of the underlying filesystem. +class ARROW_EXPORT SubTreeFileSystem : public FileSystem { + public: + // This constructor may abort if base_path is invalid. + explicit SubTreeFileSystem(const std::string& base_path, + std::shared_ptr<FileSystem> base_fs); + ~SubTreeFileSystem() override; + + std::string type_name() const override { return "subtree"; } + std::string base_path() const { return base_path_; } + std::shared_ptr<FileSystem> base_fs() const { return base_fs_; } + + Result<std::string> NormalizePath(std::string path) override; + Result<std::string> PathFromUri(const std::string& uri_string) const override; + + bool Equals(const FileSystem& other) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result<FileInfo> GetFileInfo(const std::string& path) override; + Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; + + FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& 
path) override; + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result<std::shared_ptr<io::InputStream>> OpenInputStream( + const std::string& path) override; + Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const std::string& path) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const FileInfo& info) override; + + Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const std::string& path) override; + Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync( + const FileInfo& info) override; + Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const std::string& path) override; + Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync( + const FileInfo& info) override; + + Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) override; + Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) override; + + protected: + SubTreeFileSystem() = default; + + const std::string base_path_; + std::shared_ptr<FileSystem> base_fs_; + + Result<std::string> PrependBase(const std::string& s) const; + Result<std::string> PrependBaseNonEmpty(const std::string& s) const; + Result<std::string> StripBase(const std::string& s) const; + Status FixInfo(FileInfo* info) const; + + static Result<std::string> NormalizeBasePath( + std::string base_path, const std::shared_ptr<FileSystem>& base_fs); +}; + +/// \brief A FileSystem implementation that 
delegates to another +/// implementation but inserts latencies at various points. +class ARROW_EXPORT SlowFileSystem : public FileSystem { + public: + SlowFileSystem(std::shared_ptr<FileSystem> base_fs, + std::shared_ptr<io::LatencyGenerator> latencies); + SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency); + SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency, + int32_t seed); + + std::string type_name() const override { return "slow"; } + bool Equals(const FileSystem& other) const override; + Result<std::string> PathFromUri(const std::string& uri_string) const override; + + /// \cond FALSE + using FileSystem::CreateDir; + using FileSystem::DeleteDirContents; + using FileSystem::GetFileInfo; + using FileSystem::OpenAppendStream; + using FileSystem::OpenOutputStream; + /// \endcond + + Result<FileInfo> GetFileInfo(const std::string& path) override; + Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive) override; + + Status DeleteDir(const std::string& path) override; + Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override; + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result<std::shared_ptr<io::InputStream>> OpenInputStream( + const std::string& path) override; + Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const std::string& path) override; + Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( + const FileInfo& info) override; + Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) override; + 
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( + const std::string& path, + const std::shared_ptr<const KeyValueMetadata>& metadata) override; + + protected: + std::shared_ptr<FileSystem> base_fs_; + std::shared_ptr<io::LatencyGenerator> latencies_; +}; + +/// \brief Ensure all registered filesystem implementations are finalized. +/// +/// Individual finalizers may wait for concurrent calls to finish so as to avoid +/// race conditions. After this function has been called, all filesystem APIs +/// will fail with an error. +/// +/// The user is responsible for synchronization of calls to this function. +void EnsureFinalized(); + +/// \defgroup filesystem-factories Functions for creating FileSystem instances +/// +/// @{ + +/// \brief Create a new FileSystem by URI +/// +/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3", +/// "gs" and "gcs". +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// \param[in] uri a URI-based path, ex: file:///some/local/path +/// \param[out] out_path (optional) Path inside the filesystem. +/// \return out_fs FileSystem instance. +ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri, + std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI with a custom IO context +/// +/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3", +/// "gs" and "gcs". +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// \param[in] uri a URI-based path, ex: file:///some/local/path +/// \param[in] io_context an IOContext which will be associated with the filesystem +/// \param[out] out_path (optional) Path inside the filesystem. +/// \return out_fs FileSystem instance. 
+ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri, + const io::IOContext& io_context, + std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// Same as FileSystemFromUri, but in addition also recognize non-URIs +/// and treat them as local filesystem paths. Only absolute local filesystem +/// paths are allowed. +ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath( + const std::string& uri, std::string* out_path = NULLPTR); + +/// \brief Create a new FileSystem by URI with a custom IO context +/// +/// Support for other schemes can be added using RegisterFileSystemFactory. +/// +/// Same as FileSystemFromUri, but in addition also recognize non-URIs +/// and treat them as local filesystem paths. Only absolute local filesystem +/// paths are allowed. +ARROW_EXPORT +Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath( + const std::string& uri, const io::IOContext& io_context, + std::string* out_path = NULLPTR); + +/// @} + +/// \defgroup filesystem-factory-registration Helpers for FileSystem registration +/// +/// @{ + +/// \brief Register a FileSystem factory +/// +/// Support for custom URI schemes can be added by registering a factory +/// for the corresponding FileSystem. +/// +/// \param[in] scheme a Uri scheme which the factory will handle. +/// If a factory has already been registered for a scheme, +/// the new factory will be ignored. +/// \param[in] factory a function which can produce a FileSystem for Uris which match +/// scheme. +/// \param[in] finalizer a function which must be called to finalize the factory before +/// the process exits, or nullptr if no finalization is necessary. +/// \return raises KeyError if a name collision occurs. 
+ARROW_EXPORT Status RegisterFileSystemFactory(std::string scheme, + FileSystemFactory factory, + std::function<void()> finalizer = {}); + +/// \brief Register FileSystem factories from a shared library +/// +/// FileSystem implementations may be housed in separate shared libraries and only +/// registered when the shared library is explicitly loaded. FileSystemRegistrar is +/// provided to simplify definition of such libraries: each instance at namespace scope +/// in the library will register a factory for a scheme. Any library which uses +/// FileSystemRegistrars and which must be dynamically loaded should be loaded using +/// LoadFileSystemFactories(), which will additionally merge registries if necessary +/// (static linkage to arrow can produce isolated registries). +ARROW_EXPORT Status LoadFileSystemFactories(const char* libpath); + +struct ARROW_EXPORT FileSystemRegistrar { + /// \brief Register a FileSystem factory at load time + /// + /// Support for custom URI schemes can be added by registering a factory for the + /// corresponding FileSystem. An instance of this helper can be defined at namespace + /// scope to cause the factory to be registered at load time. + /// + /// Global constructors will finish execution before main() starts if the registrar is + /// linked into the same binary as main(), or before dlopen()/LoadLibrary() returns if + /// the library in which the registrar is defined is dynamically loaded. 
+ /// + /// \code + /// FileSystemRegistrar kSlowFileSystemModule{ + /// "slowfile", + /// [](const Uri& uri, const io::IOContext& io_context, std::string* out_path) + /// ->Result<std::shared_ptr<FileSystem>> { + /// auto local_uri = "file" + uri.ToString().substr(uri.scheme().size()); + /// ARROW_ASSIGN_OR_RAISE(auto base_fs, + /// FileSystemFromUri(local_uri, io_context, out_path)); + /// double average_latency = 1; + /// int32_t seed = 0xDEADBEEF; + /// ARROW_ASSIGN_OR_RAISE(auto params, uri.query_items()); + /// for (const auto& [key, value] : params) { + /// if (key == "average_latency") { + /// average_latency = std::stod(value); + /// } + /// if (key == "seed") { + /// seed = std::stoi(value, nullptr, /*base=*/16); + /// } + /// } + /// return std::make_shared<SlowFileSystem>(base_fs, average_latency, seed); + /// }}; + /// \endcode + /// + /// \param[in] scheme a Uri scheme which the factory will handle. + /// If a factory has already been registered for a scheme, the + /// new factory will be ignored. + /// \param[in] factory a function which can produce a FileSystem for Uris which match + /// scheme. + /// \param[in] finalizer a function which must be called to finalize the factory before + /// the process exits, or nullptr if no finalization is necessary. 
+ FileSystemRegistrar(std::string scheme, FileSystemFactory factory, + std::function<void()> finalizer = {}); +}; + +#define ARROW_REGISTER_FILESYSTEM(scheme, factory_function, finalizer) \ + ::arrow20::fs::FileSystemRegistrar { \ + scheme, ::arrow20::fs::FileSystemFactory{factory_function, __FILE__, __LINE__}, \ + finalizer \ + } + +/// @} + +namespace internal { +ARROW_EXPORT void* GetFileSystemRegistry(); +} // namespace internal + +/// \brief Copy files, including from one FileSystem to another +/// +/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile +/// will be used, otherwise the file will be opened as a stream in both FileSystems and +/// chunks copied from the source to the destination. No directories will be created. +ARROW_EXPORT +Status CopyFiles(const std::vector<FileLocator>& sources, + const std::vector<FileLocator>& destinations, + const io::IOContext& io_context = io::default_io_context(), + int64_t chunk_size = 1024 * 1024, bool use_threads = true); + +/// \brief Copy selected files, including from one FileSystem to another +/// +/// Directories will be created under the destination base directory as needed. +ARROW_EXPORT +Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs, + const FileSelector& source_sel, + const std::shared_ptr<FileSystem>& destination_fs, + const std::string& destination_base_dir, + const io::IOContext& io_context = io::default_io_context(), + int64_t chunk_size = 1024 * 1024, bool use_threads = true); + +struct FileSystemGlobalOptions { + /// Path to a single PEM file holding all TLS CA certificates + /// + /// If empty, the underlying TLS library's defaults will be used. + std::string tls_ca_file_path; + + /// Path to a directory holding TLS CA certificates in individual PEM files + /// named along the OpenSSL "hashed" format. + /// + /// If empty, the underlying TLS library's defaults will be used. 
+ std::string tls_ca_dir_path; +}; + +/// EXPERIMENTAL: optional global initialization routine +/// +/// This is for environments (such as manylinux) where the path +/// to TLS CA certificates needs to be configured at runtime. +ARROW_EXPORT +Status Initialize(const FileSystemGlobalOptions& options); + +} // namespace fs +} // namespace arrow20 |
