// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <chrono>
#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/filesystem/type_fwd.h"
#include "arrow/io/type_fwd.h"
#include "arrow/type_fwd.h"
#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/util/windows_fixup.h"

namespace arrow {
namespace fs {

// A system clock time point expressed as a 64-bit (or more) number of
// nanoseconds since the epoch.
using TimePoint = std::chrono::time_point; ARROW_EXPORT std::string ToString(FileType); ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType); static const int64_t kNoSize = -1; static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1)); /// \brief FileSystem entry info struct ARROW_EXPORT FileInfo : public util::EqualityComparable { FileInfo() = default; FileInfo(FileInfo&&) = default; FileInfo& operator=(FileInfo&&) = default; FileInfo(const FileInfo&) = default; FileInfo& operator=(const FileInfo&) = default; explicit FileInfo(std::string path, FileType type = FileType::Unknown) : path_(std::move(path)), type_(type) {} /// The file type FileType type() const { return type_; } void set_type(FileType type) { type_ = type; } /// The full file path in the filesystem const std::string& path() const { return path_; } void set_path(std::string path) { path_ = std::move(path); } /// The file base name (component after the last directory separator) std::string base_name() const; // The directory base name (component before the file base name). std::string dir_name() const; /// The size in bytes, if available /// /// Only regular files are guaranteed to have a size. int64_t size() const { return size_; } void set_size(int64_t size) { size_ = size; } /// The file extension (excluding the dot) std::string extension() const; /// The time of last modification, if available TimePoint mtime() const { return mtime_; } void set_mtime(TimePoint mtime) { mtime_ = mtime; } bool IsFile() const { return type_ == FileType::File; } bool IsDirectory() const { return type_ == FileType::Directory; } bool Equals(const FileInfo& other) const { return type() == other.type() && path() == other.path() && size() == other.size() && mtime() == other.mtime(); } std::string ToString() const; /// Function object implementing less-than comparison and hashing by /// path, to support sorting infos, using them as keys, and other /// interactions with the STL. 
struct ByPath { bool operator()(const FileInfo& l, const FileInfo& r) const { return l.path() < r.path(); } size_t operator()(const FileInfo& i) const { return std::hash{}(i.path()); } }; protected: std::string path_; FileType type_ = FileType::Unknown; int64_t size_ = kNoSize; TimePoint mtime_ = kNoTime; }; ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&); /// \brief File selector for filesystem APIs struct ARROW_EXPORT FileSelector { /// The directory in which to select files. /// If the path exists but doesn't point to a directory, this should be an error. std::string base_dir; /// The behavior if `base_dir` isn't found in the filesystem. If false, /// an error is returned. If true, an empty selection is returned. bool allow_not_found; /// Whether to recurse into subdirectories. bool recursive; /// The maximum number of subdirectories to recurse into. int32_t max_recursion; FileSelector() : allow_not_found(false), recursive(false), max_recursion(INT32_MAX) {} }; /// \brief FileSystem, path pair struct ARROW_EXPORT FileLocator { std::shared_ptr filesystem; std::string path; }; /// \brief Abstract file system API class ARROW_EXPORT FileSystem : public std::enable_shared_from_this { public: virtual ~FileSystem(); virtual std::string type_name() const = 0; /// Normalize path for the given filesystem /// /// The default implementation of this method is a no-op, but subclasses /// may allow normalizing irregular path forms (such as Windows local paths). virtual Result NormalizePath(std::string path); virtual bool Equals(const FileSystem& other) const = 0; virtual bool Equals(const std::shared_ptr& other) const { return Equals(*other); } /// Get info for the given target. /// /// Any symlink is automatically dereferenced, recursively. /// A nonexistent or unreachable file returns an Ok status and /// has a FileType of value NotFound. An error status indicates /// a truly exceptional condition (low-level I/O error, etc.). 
virtual Result GetFileInfo(const std::string& path) = 0; /// Same, for many targets at once. virtual Result> GetFileInfo( const std::vector& paths); /// Same, according to a selector. /// /// The selector's base directory will not be part of the results, even if /// it exists. /// If it doesn't exist, see `FileSelector::allow_not_found`. virtual Result> GetFileInfo(const FileSelector& select) = 0; /// Create a directory and subdirectories. /// /// This function succeeds if the directory already exists. virtual Status CreateDir(const std::string& path, bool recursive = true) = 0; /// Delete a directory and its contents, recursively. virtual Status DeleteDir(const std::string& path) = 0; /// Delete a directory's contents, recursively. /// /// Like DeleteDir, but doesn't delete the directory itself. /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents. virtual Status DeleteDirContents(const std::string& path) = 0; /// EXPERIMENTAL: Delete the root directory's contents, recursively. /// /// Implementations may decide to raise an error if this operation is /// too dangerous. // NOTE: may decide to remove this if it's deemed not useful virtual Status DeleteRootDirContents() = 0; /// Delete a file. virtual Status DeleteFile(const std::string& path) = 0; /// Delete many files. /// /// The default implementation issues individual delete operations in sequence. virtual Status DeleteFiles(const std::vector& paths); /// Move / rename a file or directory. /// /// If the destination exists: /// - if it is a non-empty directory, an error is returned /// - otherwise, if it has the same type as the source, it is replaced /// - otherwise, behavior is unspecified (implementation-dependent). virtual Status Move(const std::string& src, const std::string& dest) = 0; /// Copy a file. /// /// If the destination exists and is a directory, an error is returned. /// Otherwise, it is replaced. 
virtual Status CopyFile(const std::string& src, const std::string& dest) = 0; /// Open an input stream for sequential reading. virtual Result> OpenInputStream( const std::string& path) = 0; /// Open an input stream for sequential reading. /// /// This override assumes the given FileInfo validly represents the file's /// characteristics, and may optimize access depending on them (for example /// avoid querying the file size or its existence). virtual Result> OpenInputStream(const FileInfo& info); /// Open an input file for random access reading. virtual Result> OpenInputFile( const std::string& path) = 0; /// Open an input file for random access reading. /// /// This override assumes the given FileInfo validly represents the file's /// characteristics, and may optimize access depending on them (for example /// avoid querying the file size or its existence). virtual Result> OpenInputFile( const FileInfo& info); /// Open an output stream for sequential writing. /// /// If the target already exists, existing data is truncated. virtual Result> OpenOutputStream( const std::string& path) = 0; /// Open an output stream for appending. /// /// If the target doesn't exist, a new empty file is created. virtual Result> OpenAppendStream( const std::string& path) = 0; }; /// \brief A FileSystem implementation that delegates to another /// implementation after prepending a fixed base path. /// /// This is useful to expose a logical view of a subtree of a filesystem, /// for example a directory in a LocalFileSystem. /// This works on abstract paths, i.e. paths using forward slashes and /// and a single root "/". Windows paths are not guaranteed to work. /// This makes no security guarantee. For example, symlinks may allow to /// "escape" the subtree and access other parts of the underlying filesystem. class ARROW_EXPORT SubTreeFileSystem : public FileSystem { public: // This constructor may abort if base_path is invalid. 
explicit SubTreeFileSystem(const std::string& base_path, std::shared_ptr base_fs); ~SubTreeFileSystem() override; std::string type_name() const override { return "subtree"; } std::string base_path() const { return base_path_; } std::shared_ptr base_fs() const { return base_fs_; } Result NormalizePath(std::string path) override; bool Equals(const FileSystem& other) const override; /// \cond FALSE using FileSystem::GetFileInfo; /// \endcond Result GetFileInfo(const std::string& path) override; Result> GetFileInfo(const FileSelector& select) override; Status CreateDir(const std::string& path, bool recursive = true) override; Status DeleteDir(const std::string& path) override; Status DeleteDirContents(const std::string& path) override; Status DeleteRootDirContents() override; Status DeleteFile(const std::string& path) override; Status Move(const std::string& src, const std::string& dest) override; Status CopyFile(const std::string& src, const std::string& dest) override; Result> OpenInputStream( const std::string& path) override; Result> OpenInputStream(const FileInfo& info) override; Result> OpenInputFile( const std::string& path) override; Result> OpenInputFile( const FileInfo& info) override; Result> OpenOutputStream( const std::string& path) override; Result> OpenAppendStream( const std::string& path) override; protected: SubTreeFileSystem() {} const std::string base_path_; std::shared_ptr base_fs_; std::string PrependBase(const std::string& s) const; Status PrependBaseNonEmpty(std::string* s) const; Result StripBase(const std::string& s) const; Status FixInfo(FileInfo* info) const; static Result NormalizeBasePath( std::string base_path, const std::shared_ptr& base_fs); }; /// \brief A FileSystem implementation that delegates to another /// implementation but inserts latencies at various points. 
class ARROW_EXPORT SlowFileSystem : public FileSystem {
 public:
  // NOTE(review): `latencies` is presumably the source of the inserted delays;
  // its type was reconstructed as io::LatencyGenerator -- confirm against the
  // implementation.
  SlowFileSystem(std::shared_ptr<FileSystem> base_fs,
                 std::shared_ptr<io::LatencyGenerator> latencies);
  // NOTE(review): the unit of `average_latency` is not stated in this header --
  // TODO confirm.
  SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency);
  SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency,
                 int32_t seed);

  std::string type_name() const override { return "slow"; }
  bool Equals(const FileSystem& other) const override;

  // Keep the base-class GetFileInfo overloads visible next to the overrides.
  using FileSystem::GetFileInfo;
  Result<FileInfo> GetFileInfo(const std::string& path) override;
  Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;

  Status CreateDir(const std::string& path, bool recursive = true) override;

  Status DeleteDir(const std::string& path) override;
  Status DeleteDirContents(const std::string& path) override;
  Status DeleteRootDirContents() override;

  Status DeleteFile(const std::string& path) override;

  Status Move(const std::string& src, const std::string& dest) override;

  Status CopyFile(const std::string& src, const std::string& dest) override;

  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
      const std::string& path) override;
  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const std::string& path) override;
  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const FileInfo& info) override;
  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
      const std::string& path) override;
  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
      const std::string& path) override;

 protected:
  std::shared_ptr<FileSystem> base_fs_;
  std::shared_ptr<io::LatencyGenerator> latencies_;
};

/// \defgroup filesystem-factories Functions for creating FileSystem instances
///
/// @{

/// \brief Create a new FileSystem by URI
///
/// Recognized schemes are "file", "mock", "hdfs" and "s3fs".
///
/// \param[in] uri a URI-based path, ex: file:///some/local/path
/// \param[out] out_path (optional) Path inside the filesystem.
/// \return out_fs FileSystem instance.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
                                                      std::string* out_path = NULLPTR);

/// \brief Create a new FileSystem by URI
///
/// Same as FileSystemFromUri, but in addition also recognize non-URIs
/// and treat them as local filesystem paths.  Only absolute local filesystem
/// paths are allowed.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
    const std::string& uri, std::string* out_path = NULLPTR);

/// @}

/// \brief Copy files, including from one FileSystem to another
///
/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile
/// will be used, otherwise the file will be opened as a stream in both FileSystems and
/// chunks copied from the source to the destination. No directories will be created.
ARROW_EXPORT
Status CopyFiles(const std::vector<FileLocator>& sources,
                 const std::vector<FileLocator>& destinations,
                 int64_t chunk_size = 1024 * 1024, bool use_threads = true);

/// \brief Copy selected files, including from one FileSystem to another
///
/// Directories will be created under the destination base directory as needed.
ARROW_EXPORT
Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,
                 const FileSelector& source_sel,
                 const std::shared_ptr<FileSystem>& destination_fs,
                 const std::string& destination_base_dir,
                 int64_t chunk_size = 1024 * 1024, bool use_threads = true);

/// \brief Process-wide options consumed by Initialize()
struct FileSystemGlobalOptions {
  /// Path to a single PEM file holding all TLS CA certificates
  ///
  /// If empty, the underlying TLS library's defaults will be used.
  std::string tls_ca_file_path;

  /// Path to a directory holding TLS CA certificates in individual PEM files
  /// named along the OpenSSL "hashed" format.
  ///
  /// If empty, the underlying TLS library's defaults will be used.
  std::string tls_ca_dir_path;
};

/// Experimental: optional global initialization routine
///
/// This is for environments (such as manylinux) where the path
/// to TLS CA certificates needs to be configured at runtime.
ARROW_EXPORT
Status Initialize(const FileSystemGlobalOptions& options);

}  // namespace fs
}  // namespace arrow