package com.amazonaws.athena.connector.lambda.data; /*- * #%L * Amazon Athena Query Federation SDK * %% * Copyright (C) 2019 Amazon Web Services * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.amazonaws.athena.connector.lambda.domain.predicate.ConstraintEvaluator; import com.google.common.base.MoreObjects; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.VectorLoader; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.beans.Transient; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Objects; import static com.amazonaws.athena.connector.lambda.data.BlockUtils.fieldToString; import static java.util.Objects.requireNonNull; /** * This class is used to provide a convenient interface for working (reading/writing) Apache Arrow Batches. As such * this class is mostly a holder for an Apache Arrow Schema and the associated VectorSchema (used for read/write). * The class also includes helper functions for easily loading/unloading data in the form of Arrow Batches. 
* * @note While using this class as a holder to encapsulate nuances of Apache Arrow can simplify your programming model * and make it easier to get started, using setValue(...), setComplexValue(...), and any of the related helpers to * write data to the Apache Arrow structures is less performant than using Apache Arrow's native interfaces. If your usecase * and source data can be read in a columnar fashion you can achieve significantly (50% - 200%) better performance by * avoiding setValue(...) and setComplexValue(...). In our testing conversion to Apache Arrow was not a significant * bottleneck and instead represented extra latency which could be hidden through parallelism and pipelining. This is why * we opted to offer these convenience methods. *
* Remember to always close your Block(s) when you are done with them. If you are using a BlockAllocator it is still
* recommended that you close() Blocks explicitly wherever possible vs. depending on BlockAllocator.close() to free
* resources. Closing Blocks earlier will reduce peak memory demands and reduce the chance that you exhaust your Apache
* Arrow memory pool.
*/
public class Block
extends SchemaAware
implements AutoCloseable
{
private static final Logger logger = LoggerFactory.getLogger(Block.class);
//Used to identify which BlockAllocator owns the underlying memory resources used in this Block for debugging purposes.
//Not included in equality or hashcode.
private final String allocatorId;
//The Apache Arrow schema describing the fields (columns) held by this Block.
private final Schema schema;
//The VectorSchemaRoot which can be used to read/write values to/from the underlying Apache Arrow buffers that
//form the Arrow Batch of rows.
private final VectorSchemaRoot vectorSchema;
//Used to constrain writes to the block; by default we use an emptyEvaluator that allows all writes.
//Note that we will _NOT_ close this ConstraintEvaluator because we may not own it and the emptyEvaluator
//has no resources that could leak.
private ConstraintEvaluator constraintEvaluator = ConstraintEvaluator.emptyEvaluator();
/**
 * Used by a BlockAllocator to construct a Block by setting the key values that a Block 'holds'. Most of the
 * meaningful construction actually takes place within the BlockAllocator that calls this constructor.
 *
 * @param allocatorId Identifier of the BlockAllocator that owns the Block's memory resources.
 * @param schema The schema of the data that can be read/written to the provided VectorSchema.
 * @param vectorSchema Used to read/write values from the Apache Arrow memory buffers owned by this object.
 * @throws NullPointerException If any argument is null.
 */
protected Block(String allocatorId, Schema schema, VectorSchemaRoot vectorSchema)
{
    this.allocatorId = requireNonNull(allocatorId, "allocatorId is null");
    this.schema = requireNonNull(schema, "schema is null");
    this.vectorSchema = requireNonNull(vectorSchema, "vectorSchema is null");
}
/**
 * Used to constrain writes to the Block.
 *
 * @param constraintEvaluator The ConstraintEvaluator used to decide if a value may be written to the Block.
 * @note Passing null disables constraints by installing an always-allow (empty) evaluator.
 */
public void constrain(ConstraintEvaluator constraintEvaluator)
{
    if (constraintEvaluator == null) {
        this.constraintEvaluator = ConstraintEvaluator.emptyEvaluator();
    }
    else {
        this.constraintEvaluator = constraintEvaluator;
    }
}
/**
 * Exposes the ConstraintEvaluator currently applied to writes on this Block.
 *
 * @return The ConstraintEvaluator in use by this Block.
 */
public ConstraintEvaluator getConstraintEvaluator()
{
    return constraintEvaluator;
}
/**
 * Identifies which BlockAllocator owns this Block's underlying memory resources; intended for debugging.
 *
 * @return The id of the BlockAllocator that owns this Block's resources.
 */
public String getAllocatorId()
{
    return allocatorId;
}
/**
 * Provides the Apache Arrow Schema that describes the data held by this Block.
 *
 * @return The Schema of this Block.
 */
public Schema getSchema()
{
    return schema;
}
/**
 * Writes the provided value to the specified field on the specified row. This method does _not_ update the
 * row count on the underlying Apache Arrow VectorSchema. You must call setRowCount(...) to ensure the values
 * you have written are considered 'valid rows' and thus available when you attempt to serialize this Block. This
 * method relies on BlockUtils' field conversion/coercion logic to convert the provided value into a type that
 * matches Apache Arrow's supported serialization format. For more details on coercion please see @BlockUtils
 *
 * @param fieldName The name of the field you wish to write to.
 * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
 * @param value The value you wish to write.
 * @return True if the value was written to the Block, False if the value was not written due to failing a constraint.
 * @note This method will throw an NPE if you call it with a non-existent field. You can use offerValue(...)
 * to ignore non-existent fields. This can be useful when you are writing results and want to avoid checking
 * if a field has been requested. One such example is when a query projects only a subset of columns and your
 * underlying data store is not columnar.
 */
public boolean setValue(String fieldName, int row, Object value)
{
    if (!constraintEvaluator.apply(fieldName, value)) {
        return false;
    }
    BlockUtils.setValue(getFieldVector(fieldName), row, value);
    return true;
}
/**
 * Attempts to write the provided value to the specified field on the specified row, ignoring the write if the
 * field is not part of this Block's Schema. This method does _not_ update the row count on the underlying
 * Apache Arrow VectorSchema. You must call setRowCount(...) to ensure the values you have written are
 * considered 'valid rows' and thus available when you attempt to serialize this Block. This method relies on
 * BlockUtils' field conversion/coercion logic to convert the provided value into a type that matches Apache
 * Arrow's supported serialization format. For more details on coercion please see @BlockUtils
 *
 * @param fieldName The name of the field you wish to write to.
 * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
 * @param value The value you wish to write.
 * @return True if the value passed the constraint check (even if the field is missing from the Block and the
 * write was skipped), False if the value was not written due to failing a constraint.
 * @note This method will take no action if the provided fieldName is not a valid field in this Block's Schema.
 * In such cases the method will still return true when the constraint check passes.
 */
public boolean offerValue(String fieldName, int row, Object value)
{
    if (!constraintEvaluator.apply(fieldName, value)) {
        return false;
    }
    FieldVector fieldVector = getFieldVector(fieldName);
    if (fieldVector != null) {
        BlockUtils.setValue(fieldVector, row, value);
    }
    return true;
}
/**
 * Writes the provided complex value (e.g. List or Struct) to the given field name and row, using the supplied
 * FieldResolver to extract nested child values. This method does _not_ update the row count on the underlying
 * Apache Arrow VectorSchema; you must call setRowCount(...) after writing. Unlike setValue(...), this method
 * does _not_ consult the Block's ConstraintEvaluator, so constraints are never applied to complex values here.
 *
 * @param fieldName The name of the field you wish to write to.
 * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
 * @param fieldResolver Used to extract child values from the provided value when writing nested types.
 * @param value The value you wish to write.
 * @return Always true, since this method performs no constraint evaluation.
 * @note This method is expected to throw an NPE if you call it with a non-existent field, because
 * getFieldVector(...) returns null for unknown fields and the null vector is passed straight to
 * BlockUtils.setComplexValue(...). Use offerComplexValue(...) to silently ignore non-existent fields instead.
 */
public boolean setComplexValue(String fieldName, int row, FieldResolver fieldResolver, Object value)
{
FieldVector vector = getFieldVector(fieldName);
BlockUtils.setComplexValue(vector, row, fieldResolver, value);
return true;
}
/**
 * Attempts to write the provided complex value (e.g. List or Struct) to the given field name and row, using the
 * supplied FieldResolver to extract nested child values. If the Block's schema does not contain such a field the
 * write is silently skipped. This method does _not_ update the row count on the underlying Apache Arrow
 * VectorSchema; you must call setRowCount(...) after writing. Note that, unlike offerValue(...), this method
 * does _not_ consult the Block's ConstraintEvaluator, so constraints are never applied to complex values here.
 *
 * @param fieldName The name of the field you wish to write to.
 * @param row The row number to write to. Note that Apache Arrow Blocks begin with row 0 just like a typical array.
 * @param fieldResolver Used to extract child values from the provided value when writing nested types.
 * @param value The value you wish to write.
 * @return Always true, even when the field is missing from the Block's Schema and the value was dropped.
 */
public boolean offerComplexValue(String fieldName, int row, FieldResolver fieldResolver, Object value)
{
    FieldVector fieldVector = getFieldVector(fieldName);
    if (fieldVector == null) {
        return true;
    }
    BlockUtils.setComplexValue(fieldVector, row, fieldResolver, value);
    return true;
}
/**
 * Provides access to the underlying Apache Arrow VectorSchemaRoot when direct access to Apache Arrow is required.
 *
 * @return The Apache Arrow VectorSchemaRoot backing this Block.
 */
protected VectorSchemaRoot getVectorSchema()
{
    return vectorSchema;
}
/**
 * Sets the valid row count on the underlying Apache Arrow VectorSchemaRoot.
 *
 * @param rowCount The row count to set.
 * @note If you do not set this value then the Block may not serialize correctly (too few rows) or rows may
 * not be readable.
 */
public void setRowCount(int rowCount)
{
    vectorSchema.setRowCount(rowCount);
}
/**
 * Returns the current row count as set by calling setRowCount(...).
 *
 * @return The current valid row count for the Apache Arrow VectorSchemaRoot.
 */
public int getRowCount()
{
    return vectorSchema.getRowCount();
}
/**
 * Provides access to the Apache Arrow FieldReader for the given field name.
 *
 * @param fieldName The name of the field to retrieve.
 * @return The FieldReader that can be used to read values from the Block for the specified field.
 * @note This method throws an NPE if the requested field name is not a valid field name in the block's Schema.
 * Additionally, for accessing a nested field you must request the parent field and then call reader(String fieldName)
 * on the parent FieldReader. You can find some examples of how to use Apache Arrow for complex/nested types in
 * the UnitTest for this class or BlockUtils.java.
 */
public FieldReader getFieldReader(String fieldName)
{
    FieldVector fieldVector = vectorSchema.getVector(fieldName);
    return fieldVector.getReader();
}
/**
 * Provides access to the Apache Arrow FieldVector which can be used to write values for the given field name.
 *
 * @param fieldName The name of the field to retrieve.
 * @return The FieldVector that can be used to write values to the Block for the specified field, or NULL if the
 * field is not in this Block's Schema.
 * @note Additionally, for accessing a nested field you must request the parent field and then call the appropriate
 * method (based on type) to get the child field's FieldVector. You can find some examples of how to use Apache Arrow
 * for complex/nested types in the UnitTest for this class or BlockUtils.java.
 */
public FieldVector getFieldVector(String fieldName)
{
    return vectorSchema.getVector(fieldName);
}
/**
* Provides access to the list of all top-level FieldReaders in this Block.
*
* @return List