
Example 1 with ColumnVector

Use of org.apache.spark.sql.vectorized.ColumnVector in project iceberg by apache.

From the class ColumnarBatchReader, the method read:

@Override
public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) {
    Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead);
    ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];
    if (reuse == null) {
        // No batch to reuse, so release the Arrow buffers held from the previous read.
        closeVectors();
    }
    for (int i = 0; i < readers.length; i += 1) {
        vectorHolders[i] = readers[i].read(vectorHolders[i], numRowsToRead);
        int numRowsInVector = vectorHolders[i].numValues();
        Preconditions.checkState(numRowsInVector == numRowsToRead, "Number of rows in the vector %s didn't match expected %s", numRowsInVector, numRowsToRead);
        arrowColumnVectors[i] = IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector);
    }
    ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
    batch.setNumRows(numRowsToRead);
    return batch;
}
Also used : ColumnarBatch(org.apache.spark.sql.vectorized.ColumnarBatch) ColumnVector(org.apache.spark.sql.vectorized.ColumnVector)
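For context, a minimal sketch of how a caller might drive this reader, assuming a ColumnarBatchReader reader plus hypothetical hasMoreRows/process helpers (illustrative names, not part of the Iceberg API). Passing the previous batch back in lets the reader keep its vector holders, while passing null makes it release the old Arrow buffers first:

ColumnarBatch batch = null;
while (hasMoreRows()) {
    // Reuse the previous batch's vector holders; a null argument would
    // close the previously allocated Arrow vectors instead.
    batch = reader.read(batch, 4096);
    Iterator<InternalRow> rows = batch.rowIterator();
    while (rows.hasNext()) {
        process(rows.next());
    }
}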

Example 2 with ColumnVector

Use of org.apache.spark.sql.vectorized.ColumnVector in project iceberg by apache.

From the class TestHelpers, the method assertEqualsBatch:

public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch, boolean checkArrowValidityVector) {
    for (int rowId = 0; rowId < batch.numRows(); rowId++) {
        List<Types.NestedField> fields = struct.fields();
        InternalRow row = batch.getRow(rowId);
        Record rec = expected.next();
        for (int i = 0; i < fields.size(); i += 1) {
            Type fieldType = fields.get(i).type();
            Object expectedValue = rec.get(i);
            Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
            assertEqualsUnsafe(fieldType, expectedValue, actualValue);
            if (checkArrowValidityVector) {
                ColumnVector columnVector = batch.column(i);
                ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
                // XOR is true exactly when one side is null and the other is not,
                // i.e. when the expected and actual nullability disagree.
                Assert.assertFalse("Nullability doesn't match for " + columnVector.dataType(), expectedValue == null ^ arrowVector.isNull(rowId));
            }
        }
    }
}
Also used : ValueVector(org.apache.arrow.vector.ValueVector) BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) StructType(org.apache.spark.sql.types.StructType) Type(org.apache.iceberg.types.Type) ArrayType(org.apache.spark.sql.types.ArrayType) MapType(org.apache.spark.sql.types.MapType) Record(org.apache.avro.generic.GenericData.Record) InternalRow(org.apache.spark.sql.catalyst.InternalRow) ColumnVector(org.apache.spark.sql.vectorized.ColumnVector) IcebergArrowColumnVector(org.apache.iceberg.spark.data.vectorized.IcebergArrowColumnVector)
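The XOR in the assertion above is compact but easy to misread. An equivalent, more explicit form of the same nullability check, written as a sketch using the variables from the method above:

boolean expectedNull = expectedValue == null;
boolean actualNull = arrowVector.isNull(rowId);
// The row value and the Arrow validity vector must agree on nullability.
Assert.assertEquals("Nullability doesn't match for " + columnVector.dataType(),
        expectedNull, actualNull);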

Example 3 with ColumnVector

Use of org.apache.spark.sql.vectorized.ColumnVector in project spark-bigquery-connector by GoogleCloudDataproc.

From the class ArrowReaderIterator, the method toArrowRows:

private Iterator<InternalRow> toArrowRows(VectorSchemaRoot root, List<String> namesInOrder) {
    ColumnVector[] columns =
        namesInOrder.stream()
            .map(root::getVector)
            .map(vector -> new ArrowSchemaConverter(vector, userProvidedFieldMap.get(vector.getName())))
            .toArray(ColumnVector[]::new);
    ColumnarBatch batch = new ColumnarBatch(columns);
    batch.setNumRows(root.getRowCount());
    return batch.rowIterator();
}
Also used : Arrays(java.util.Arrays) InternalRow(org.apache.spark.sql.catalyst.InternalRow) LoggerFactory(org.slf4j.LoggerFactory) Function(java.util.function.Function) ImmutableList(com.google.common.collect.ImmutableList) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) ArrowStreamReader(org.apache.arrow.vector.ipc.ArrowStreamReader) BufferAllocator(org.apache.arrow.memory.BufferAllocator) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) ArrowReader(org.apache.arrow.vector.ipc.ArrowReader) ColumnVector(org.apache.spark.sql.vectorized.ColumnVector) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) SequenceInputStream(java.io.SequenceInputStream) CommonsCompressionFactory(org.apache.arrow.compression.CommonsCompressionFactory) VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) IOException(java.io.IOException) ArrowUtil(com.google.cloud.bigquery.connector.common.ArrowUtil) Collectors(java.util.stream.Collectors) ByteString(com.google.protobuf.ByteString) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) ColumnarBatch(org.apache.spark.sql.vectorized.ColumnarBatch) Optional(java.util.Optional)
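A sketch of how such a conversion is typically driven, assuming bytes holds an Arrow IPC stream, allocator is an Arrow BufferAllocator, and process is a hypothetical row consumer (illustrative names, not from the connector). ArrowStreamReader repopulates a single VectorSchemaRoot on each loadNextBatch() call:

try (ArrowStreamReader reader =
        new ArrowStreamReader(new ByteArrayInputStream(bytes), allocator)) {
    while (reader.loadNextBatch()) {
        // The root is reused across batches; convert it before loading the next one.
        VectorSchemaRoot root = reader.getVectorSchemaRoot();
        toArrowRows(root, namesInOrder).forEachRemaining(row -> process(row));
    }
} catch (IOException e) {
    throw new UncheckedIOException(e);
}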

Example 4 with ColumnVector

Use of org.apache.spark.sql.vectorized.ColumnVector in project spark-bigquery-connector by GoogleCloudDataproc.

From the class ArrowColumnBatchPartitionReaderContext, the method next:

public boolean next() throws IOException {
    tracer.nextBatchNeeded();
    if (closed) {
        return false;
    }
    tracer.rowsParseStarted();
    closed = !reader.loadNextBatch();
    if (closed) {
        return false;
    }
    VectorSchemaRoot root = reader.root();
    if (currentBatch == null) {
        // The ColumnarBatch wrapper should only need to be created once:
        // loadNextBatch() repopulates the same VectorSchemaRoot, so the
        // underlying vectors stay the same across batches.
        ColumnVector[] columns =
            namesInOrder.stream()
                .map(root::getVector)
                .map(vector -> new ArrowSchemaConverter(vector, userProvidedFieldMap.get(vector.getName())))
                .toArray(ColumnVector[]::new);
        currentBatch = new ColumnarBatch(columns);
    }
    currentBatch.setNumRows(root.getRowCount());
    tracer.rowsParseFinished(currentBatch.numRows());
    return true;
}
Also used : VectorLoader(org.apache.arrow.vector.VectorLoader) MoreExecutors(com.google.common.util.concurrent.MoreExecutors) Arrays(java.util.Arrays) Schema(org.apache.arrow.vector.types.pojo.Schema) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) ReadRowsResponse(com.google.cloud.bigquery.storage.v1.ReadRowsResponse) ArrowSchemaConverter(com.google.cloud.spark.bigquery.ArrowSchemaConverter) ArrayList(java.util.ArrayList) IteratorMultiplexer(com.google.cloud.bigquery.connector.common.IteratorMultiplexer) ParallelArrowReader(com.google.cloud.bigquery.connector.common.ParallelArrowReader) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) AutoCloseables(org.apache.arrow.util.AutoCloseables) ArrowStreamReader(org.apache.arrow.vector.ipc.ArrowStreamReader) ExecutorService(java.util.concurrent.ExecutorService) BufferAllocator(org.apache.arrow.memory.BufferAllocator) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) NonInterruptibleBlockingBytesChannel(com.google.cloud.bigquery.connector.common.NonInterruptibleBlockingBytesChannel) ArrowReader(org.apache.arrow.vector.ipc.ArrowReader) ColumnVector(org.apache.spark.sql.vectorized.ColumnVector) Iterator(java.util.Iterator) ReadRowsResponseInputStreamEnumeration(com.google.cloud.bigquery.connector.common.ReadRowsResponseInputStreamEnumeration) SynchronousQueue(java.util.concurrent.SynchronousQueue) SequenceInputStream(java.io.SequenceInputStream) CommonsCompressionFactory(org.apache.arrow.compression.CommonsCompressionFactory) VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) IOException(java.io.IOException) ArrowUtil(com.google.cloud.bigquery.connector.common.ArrowUtil) Collectors(java.util.stream.Collectors) ByteString(com.google.protobuf.ByteString) TimeUnit(java.util.concurrent.TimeUnit) BigQueryStorageReadRowsTracer(com.google.cloud.bigquery.connector.common.BigQueryStorageReadRowsTracer) List(java.util.List) ColumnarBatch(org.apache.spark.sql.vectorized.ColumnarBatch) Optional(java.util.Optional) ReadRowsHelper(com.google.cloud.bigquery.connector.common.ReadRowsHelper) InputStream(java.io.InputStream)
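Design note: since the ColumnarBatch wrapper above is created once and only its row count is updated per batch, each loadNextBatch() call overwrites the same Arrow buffers in place; a consumer must therefore finish reading the current batch before calling next() again.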

Example 5 with ColumnVector

Use of org.apache.spark.sql.vectorized.ColumnVector in project TileDB-Spark by TileDB-Inc.

From the class TileDBDataReaderPartitionScan, the method get:

@Override
public ColumnarBatch get() {
    metricsUpdater.startTimer(queryGetTimerName);
    try {
        int nRows = (int) currentNumRecords;
        if (resultBatch == null) {
            ColumnVector[] colVecs = new ColumnVector[valueVectors.size()];
            for (int i = 0; i < valueVectors.size(); i++) {
                String name = fieldNames.get(i);
                TypeInfo typeInfo = getTypeInfo(name);
                boolean isDateType = typeInfo.multiplier != 1 || typeInfo.moreThanDay;
                if (typeInfo.isNullable) {
                    // If the attribute is nullable, the validity buffer of the main
                    // value vector must be set bit by bit. TileDB reports validity
                    // as a bytemap, hence the byte-to-bit conversion below.
                    ArrowBuf arrowBufValidity = valueVectors.get(i).getValidityBuffer();
                    ArrowBuf validityByteBuffer = validityValueVectors.get(i).getDataBuffer();
                    for (int j = 0; j < nRows; j++) {
                        if (validityByteBuffer.getByte(j) == (byte) 0) {
                            BitVectorHelper.setValidityBit(arrowBufValidity, j, 0);
                        }
                    }
                }
                if (isDateType) {
                    if (typeInfo.isVarLen) {
                        throw new TileDBError("Var length attributes/dimensions of type TILEDB_DATETIME_* are not currently supported: " + name);
                    }
                    // The datatype is a date/time type, so the values need filtering to
                    // fit the narrower set of datetime types Spark provides compared to TileDB.
                    filterDataBufferForDateTypes(valueVectors.get(i).getDataBuffer(), currentNumRecords, typeInfo);
                }
                colVecs[i] = new ArrowColumnVector(valueVectors.get(i));
            }
            resultBatch = new ColumnarBatch(colVecs);
        }
        resultBatch.setNumRows(nRows);
        // Note that calculateNativeArrayByteSizes() might not be
        this.metricsUpdater.updateTaskMetrics(nRows, calculateResultByteSize());
    } catch (TileDBError err) {
        // Wrap the TileDB error, keeping it as the cause so the stack trace survives.
        throw new RuntimeException(err);
    }
    metricsUpdater.finish(queryGetTimerName);
    return resultBatch;
}
Also used : ArrowBuf(io.netty.buffer.ArrowBuf) ColumnarBatch(org.apache.spark.sql.vectorized.ColumnarBatch) ArrowColumnVector(org.apache.spark.sql.vectorized.ArrowColumnVector) OnHeapColumnVector(org.apache.spark.sql.execution.vectorized.OnHeapColumnVector) ColumnVector(org.apache.spark.sql.vectorized.ColumnVector)
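The bytemap-to-bitmap conversion above is the subtle part. A standalone sketch of the same idea, with byteMap and bitmap as illustrative names for the ArrowBufs involved, both sized for nRows:

// TileDB reports validity as one byte per value, while Arrow stores one
// bit per value, so each byte is folded into the matching validity bit.
for (int row = 0; row < nRows; row++) {
    if (byteMap.getByte(row) == (byte) 0) {
        BitVectorHelper.setValidityBit(bitmap, row, 0);  // mark row as null
    } else {
        BitVectorHelper.setValidityBit(bitmap, row, 1);  // mark row as valid
    }
}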

Aggregations

ColumnVector (org.apache.spark.sql.vectorized.ColumnVector): 7
ColumnarBatch (org.apache.spark.sql.vectorized.ColumnarBatch): 6
StructType (org.apache.spark.sql.types.StructType): 3
ArrowUtil (com.google.cloud.bigquery.connector.common.ArrowUtil): 2
ImmutableList (com.google.common.collect.ImmutableList): 2
ByteString (com.google.protobuf.ByteString): 2
IOException (java.io.IOException): 2
SequenceInputStream (java.io.SequenceInputStream): 2
Arrays (java.util.Arrays): 2
Iterator (java.util.Iterator): 2
List (java.util.List): 2
Map (java.util.Map): 2
Optional (java.util.Optional): 2
Collectors (java.util.stream.Collectors): 2
CommonsCompressionFactory (org.apache.arrow.compression.CommonsCompressionFactory): 2
BufferAllocator (org.apache.arrow.memory.BufferAllocator): 2
VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot): 2
ArrowReader (org.apache.arrow.vector.ipc.ArrowReader): 2
ArrowStreamReader (org.apache.arrow.vector.ipc.ArrowStreamReader): 2
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 2