// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include #include #include #include "arrow/io/caching.h" #include "parquet/metadata.h" // IWYU pragma: keep #include "parquet/platform.h" #include "parquet/properties.h" namespace parquet { class ColumnReader; class FileMetaData; class PageReader; class RandomAccessSource; class RowGroupMetaData; class PARQUET_EXPORT RowGroupReader { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more // easily create test fixtures // An implementation of the Contents class is defined in the .cc file struct Contents { virtual ~Contents() {} virtual std::unique_ptr GetColumnPageReader(int i) = 0; virtual const RowGroupMetaData* metadata() const = 0; virtual const ReaderProperties* properties() const = 0; }; explicit RowGroupReader(std::unique_ptr contents); // Returns the rowgroup metadata const RowGroupMetaData* metadata() const; // Construct a ColumnReader for the indicated row group-relative // column. Ownership is shared with the RowGroupReader. std::shared_ptr Column(int i); std::unique_ptr GetColumnPageReader(int i); private: // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; }; class PARQUET_EXPORT ParquetFileReader { public: // Declare a virtual class 'Contents' to aid dependency injection and more // easily create test fixtures // An implementation of the Contents class is defined in the .cc file struct PARQUET_EXPORT Contents { static std::unique_ptr Open( std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); virtual ~Contents() = default; // Perform any cleanup associated with the file contents virtual void Close() = 0; virtual std::shared_ptr GetRowGroup(int i) = 0; virtual std::shared_ptr metadata() const = 0; }; ParquetFileReader(); ~ParquetFileReader(); // Create a reader from some implementation of parquet-cpp's generic file // input interface // // If you cannot provide exclusive access to your file resource, create a // subclass of RandomAccessSource that wraps the shared resource ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version") static std::unique_ptr Open( std::unique_ptr source, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); // Create a file reader instance from an Arrow file object. Thread-safety is // the responsibility of the file implementation static std::unique_ptr Open( std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); // API Convenience to open a serialized Parquet file on disk, using Arrow IO // interfaces. static std::unique_ptr OpenFile( const std::string& path, bool memory_map = true, const ReaderProperties& props = default_reader_properties(), std::shared_ptr metadata = NULLPTR); void Open(std::unique_ptr contents); void Close(); // The RowGroupReader is owned by the FileReader std::shared_ptr RowGroup(int i); // Returns the file metadata. Only one instance is ever created std::shared_ptr metadata() const; /// Pre-buffer the specified column indices in all row groups. /// /// Readers can optionally call this to cache the necessary slices /// of the file in-memory before deserialization. Arrow readers can /// automatically do this via an option. This is intended to /// increase performance when reading from high-latency filesystems /// (e.g. Amazon S3). /// /// After calling this, creating readers for row groups/column /// indices that were not buffered may fail. Creating multiple /// readers for the a subset of the buffered regions is /// acceptable. This may be called again to buffer a different set /// of row groups/columns. /// /// If memory usage is a concern, note that data will remain /// buffered in memory until either \a PreBuffer() is called again, /// or the reader itself is destructed. Reading - and buffering - /// only one row group at a time may be useful. void PreBuffer(const std::vector& row_groups, const std::vector& column_indices, const ::arrow::io::AsyncContext& ctx, const ::arrow::io::CacheOptions& options); private: // Holds a pointer to an instance of Contents implementation std::unique_ptr contents_; }; // Read only Parquet file metadata std::shared_ptr PARQUET_EXPORT ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); /// \brief Scan all values in file. Useful for performance testing /// \param[in] columns the column numbers to scan. If empty scans all /// \param[in] column_batch_size number of values to read at a time when scanning column /// \param[in] reader a ParquetFileReader instance /// \return number of semantic rows in file PARQUET_EXPORT int64_t ScanFileContents(std::vector columns, const int32_t column_batch_size, ParquetFileReader* reader); } // namespace parquet