Search in sources :

Example 6 with VectorSchemaRoot

use of org.apache.arrow.vector.VectorSchemaRoot in project beam by apache.

the class BigQueryIOStorageReadTest method createResponseArrow.

/**
 * Builds a {@code ReadRowsResponse} carrying one serialized Arrow record batch plus progress stats.
 *
 * <p>The first field vector of {@code arrowSchema} is filled from {@code name} (as a
 * {@code VarCharVector}) and the second from {@code number} (as a {@code BigIntVector}); the
 * number of rows written is {@code name.size()}.
 *
 * @param arrowSchema             schema used to create the temporary vector schema root
 * @param name                    values written to the first column
 * @param number                  values written to the second column, read by the same index as {@code name}
 * @param progressAtResponseStart progress value reported for the start of the response
 * @param progressAtResponseEnd   progress value reported for the end of the response
 * @return the assembled response
 */
private ReadRowsResponse createResponseArrow(org.apache.arrow.vector.types.pojo.Schema arrowSchema, List<String> name, List<Long> number, double progressAtResponseStart, double progressAtResponseEnd) {
    ArrowRecordBatch serializedRecord;
    try (VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator)) {
        root.allocateNew();
        root.setRowCount(name.size());
        VarCharVector nameColumn = (VarCharVector) root.getFieldVectors().get(0);
        BigIntVector numberColumn = (BigIntVector) root.getFieldVectors().get(1);
        int row = 0;
        while (row < name.size()) {
            numberColumn.set(row, number.get(row));
            nameColumn.set(row, new Text(name.get(row)));
            row++;
        }
        VectorUnloader unloader = new VectorUnloader(root);
        try (org.apache.arrow.vector.ipc.message.ArrowRecordBatch batch = unloader.getRecordBatch()) {
            try (ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
                // Serialize the Arrow batch into the in-memory buffer, then wrap the bytes for the response.
                WriteChannel sink = new WriteChannel(Channels.newChannel(buffer));
                MessageSerializer.serialize(sink, batch);
                serializedRecord = ArrowRecordBatch.newBuilder()
                        .setRowCount(batch.getLength())
                        .setSerializedRecordBatch(ByteString.copyFrom(buffer.toByteArray()))
                        .build();
            } catch (IOException e) {
                throw new RuntimeException("Error writing to byte array output stream", e);
            }
        }
    }
    return ReadRowsResponse.newBuilder()
            .setArrowRecordBatch(serializedRecord)
            .setRowCount(name.size())
            .setStats(StreamStats.newBuilder()
                    .setProgress(Progress.newBuilder()
                            .setAtResponseStart(progressAtResponseStart)
                            .setAtResponseEnd(progressAtResponseEnd)))
            .build();
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) VarCharVector(org.apache.arrow.vector.VarCharVector) Text(org.apache.arrow.vector.util.Text) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException) BigIntVector(org.apache.arrow.vector.BigIntVector) VectorUnloader(org.apache.arrow.vector.VectorUnloader) StatusRuntimeException(io.grpc.StatusRuntimeException) ArrowRecordBatch(com.google.cloud.bigquery.storage.v1.ArrowRecordBatch) WriteChannel(org.apache.arrow.vector.ipc.WriteChannel)

Example 7 with VectorSchemaRoot

use of org.apache.arrow.vector.VectorSchemaRoot in project textdb by TextDB.

the class PythonUDFOpExec method writeArrowStream.

/**
 * For every batch, the operator converts list of {@code Tuple}s into Arrow stream data in almost the exact same
 * way as it would when using Arrow file, except now it sends stream to the server with
 * {@link FlightClient#startPut(FlightDescriptor, VectorSchemaRoot, FlightClient.PutListener, CallOption...)} and
 * {@link FlightClient.ClientStreamListener#putNext()}. The server uses {@code do_put()} to receive data stream
 * and convert it into a {@code pyarrow.Table} and store it in the server.
 * {@code startPut} is a non-blocking call, but this method in general is a blocking call, it waits until all the
 * data are sent.
 *
 * <p>The put listener is closed and the schema root is cleared on both the success and the failure path,
 * so a failure in the middle of the stream does not leave them open.
 *
 * @param client      The FlightClient that manages this.
 * @param values      The input queue that holds tuples, its contents will be consumed in this method.
 * @param arrowSchema Input Arrow table schema. This should already have been defined (converted).
 * @param channel     The predefined path that specifies where to store the data in Flight Server.
 * @param chunkSize   The chunk size of the arrow stream. This is different than the batch size of the operator,
 *                    although they may seem similar. This doesn't actually affect serialization speed that much,
 *                    so in general it can be the same as {@code batchSize}.
 * @throws RuntimeException presumably raised by {@code closeAndThrow} when sending fails (that helper is
 *                          defined outside this block; confirm its contract there).
 */
private void writeArrowStream(FlightClient client, Queue<Tuple> values, org.apache.arrow.vector.types.pojo.Schema arrowSchema, Channel channel, int chunkSize) throws RuntimeException {
    SyncPutListener flightListener = new SyncPutListener();
    VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(arrowSchema, PythonUDFOpExec.memoryAllocator);
    FlightClient.ClientStreamListener streamWriter = client.startPut(FlightDescriptor.path(Collections.singletonList(channel.name)), schemaRoot, flightListener);
    try {
        while (!values.isEmpty()) {
            // Fill one chunk: at most chunkSize tuples, or fewer if the queue runs dry first.
            schemaRoot.allocateNew();
            while (schemaRoot.getRowCount() < chunkSize && !values.isEmpty()) {
                ArrowUtils.appendTexeraTuple(values.remove(), schemaRoot);
            }
            streamWriter.putNext();
        }
        streamWriter.completed();
        // Block until the server acknowledges the whole stream.
        flightListener.getResult();
    } catch (Exception e) {
        e.printStackTrace();
        closeAndThrow(client, e);
    } finally {
        // Runs on success and on failure (after closeAndThrow), so neither the listener
        // nor the vectors allocated for the last chunk are left behind.
        flightListener.close();
        schemaRoot.clear();
    }
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException)

Example 8 with VectorSchemaRoot

use of org.apache.arrow.vector.VectorSchemaRoot in project hive by apache.

the class LlapArrowBatchRecordReader method next.

/**
 * Loads the next Arrow batch from the stream reader and hands its vector schema root to {@code value}.
 *
 * @param key   unused by this method
 * @param value wrapper that receives the reader's vector schema root when a batch was loaded
 * @return {@code true} when a batch was loaded; {@code false} when the reader reports no more input
 *         or when an {@code IOException} was passed to {@code failOnInterruption} without it rethrowing
 */
@Override
public boolean next(NullWritable key, ArrowWrapperWritable value) throws IOException {
    try {
        // Remember which thread performs the blocking read so that it can be interrupted.
        setReaderThread(Thread.currentThread());
        if (!arrowStreamReader.loadNextBatch()) {
            processReaderEvent();
            return false;
        }
        VectorSchemaRoot loadedRoot = arrowStreamReader.getVectorSchemaRoot();
        // A batch without any column vector is not a valid state.
        Preconditions.checkState(!loadedRoot.getFieldVectors().isEmpty());
        // An empty batch is still reported as success: a later read may carry data, and only
        // loadNextBatch returning false signals the end of the stream.
        value.setVectorSchemaRoot(arrowStreamReader.getVectorSchemaRoot());
        return true;
    } catch (IOException io) {
        failOnInterruption(io);
        return false;
    }
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) IOException(java.io.IOException)

Example 9 with VectorSchemaRoot

use of org.apache.arrow.vector.VectorSchemaRoot in project hive by apache.

the class Deserializer method deserialize.

/**
 * Converts the Arrow batch wrapped by {@code writable} into an array of row arrays.
 *
 * <p>Each Arrow field vector is first read into the matching projected column of the reusable
 * {@code vectorizedRowBatch}; the rows are then extracted one by one into the reusable
 * {@code rows} buffer, and the batch is reset before returning.
 *
 * @param writable expected to be an {@code ArrowWrapperWritable} (it is cast unconditionally)
 * @return the {@code rows} buffer; it is only reallocated when it is missing or shorter than the
 *         current row count, so it may be longer than the number of rows just extracted
 */
public Object deserialize(Writable writable) {
    final VectorSchemaRoot root = ((ArrowWrapperWritable) writable).getVectorSchemaRoot();
    final List<FieldVector> arrowColumns = root.getFieldVectors();
    final int columnCount = arrowColumns.size();
    // The first column's value count is taken as the row count of the whole batch.
    final int rowCount = arrowColumns.get(0).getValueCount();
    vectorizedRowBatch.ensureSize(rowCount);

    final boolean bufferTooSmall = rows == null || rows.length < rowCount;
    if (bufferTooSmall) {
        rows = new Object[rowCount][columnCount];
    }

    // Copy every Arrow column into the projected Hive column vector.
    for (int col = 0; col < columnCount; col++) {
        final int projectedCol = vectorizedRowBatch.projectedColumns[col];
        read(
            arrowColumns.get(col),
            vectorizedRowBatch.cols[projectedCol],
            serDe.rowTypeInfo.getAllStructFieldTypeInfos().get(col));
    }

    // Materialize each row of the batch into the row buffer.
    int row = 0;
    while (row < rowCount) {
        vectorExtractRow.extractRow(vectorizedRowBatch, row, rows[row]);
        row++;
    }
    vectorizedRowBatch.reset();
    return rows;
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) FieldVector(org.apache.arrow.vector.FieldVector) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) ArrowColumnarBatchSerDe.toStructListTypeInfo(org.apache.hadoop.hive.ql.io.arrow.ArrowColumnarBatchSerDe.toStructListTypeInfo) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) UnionTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo) DecimalColumnVector(org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) ListColumnVector(org.apache.hadoop.hive.ql.exec.vector.ListColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) MapColumnVector(org.apache.hadoop.hive.ql.exec.vector.MapColumnVector) VectorizedBatchUtil.createColumnVector(org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil.createColumnVector) TimestampColumnVector(org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector) UnionColumnVector(org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector) IntervalDayTimeColumnVector(org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)

Example 10 with VectorSchemaRoot

use of org.apache.arrow.vector.VectorSchemaRoot in project hive by apache.

the class Serializer method serializeBatch.

/**
 * Writes the projected columns of {@code vectorizedRowBatch} into the child vectors of
 * {@code rootVector} and wraps the result in an {@code ArrowWrapperWritable}.
 *
 * <p>Serves two kinds of input:
 * <ol>
 *   <li>a batch assembled by this serde row by row ({@code isNative == false}), whose row count
 *       is {@code batchSize};</li>
 *   <li>a batch handed over from upstream ({@code isNative == true}), whose row count is
 *       {@code vectorizedRowBatch.size}.</li>
 * </ol>
 *
 * @param vectorizedRowBatch the batch to serialize; it is reset here only when {@code isNative} is false
 * @param isNative           whether the batch was provided from upstream rather than built by this serde
 * @return a writable wrapping a new {@code VectorSchemaRoot} over {@code rootVector}
 */
public ArrowWrapperWritable serializeBatch(VectorizedRowBatch vectorizedRowBatch, boolean isNative) {
    rootVector.setValueCount(0);

    for (int col = 0; col < vectorizedRowBatch.projectionSize; col++) {
        final int projectedColumn = vectorizedRowBatch.projectedColumns[col];
        final ColumnVector hiveVector = vectorizedRowBatch.cols[projectedColumn];
        final TypeInfo fieldTypeInfo = fieldTypeInfos.get(col);
        final String fieldName = fieldNames.get(col);
        final FieldType fieldType = toFieldType(fieldTypeInfo);

        // An already-present child keeps its buffers; that is fine because every row is
        // written with setValue or setNull.
        final boolean fieldExists = rootVector.getChild(fieldName) != null;
        final FieldVector arrowVector = rootVector.addOrGet(fieldName, fieldType, FieldVector.class);

        final int rowCount = isNative ? vectorizedRowBatch.size : batchSize;
        if (!fieldExists) {
            arrowVector.setInitialCapacity(rowCount);
            arrowVector.allocateNew();
        } else {
            arrowVector.setValueCount(rowCount);
        }
        write(arrowVector, hiveVector, fieldTypeInfo, rowCount, vectorizedRowBatch, isNative);
    }

    if (isNative) {
        rootVector.setValueCount(vectorizedRowBatch.size);
    } else {
        // Only batches constructed by this serde are mutated.
        vectorizedRowBatch.reset();
        rootVector.setValueCount(batchSize);
    }
    batchSize = 0;

    return new ArrowWrapperWritable(new VectorSchemaRoot(rootVector), allocator, rootVector);
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) FieldVector(org.apache.arrow.vector.FieldVector) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) ArrowColumnarBatchSerDe.toStructListTypeInfo(org.apache.hadoop.hive.ql.io.arrow.ArrowColumnarBatchSerDe.toStructListTypeInfo) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) UnionTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo) CharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) DecimalColumnVector(org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) Decimal64ColumnVector(org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector) DateColumnVector(org.apache.hadoop.hive.ql.exec.vector.DateColumnVector) ListColumnVector(org.apache.hadoop.hive.ql.exec.vector.ListColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) StructColumnVector(org.apache.hadoop.hive.ql.exec.vector.StructColumnVector) MapColumnVector(org.apache.hadoop.hive.ql.exec.vector.MapColumnVector) VectorizedBatchUtil.createColumnVector(org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil.createColumnVector) TimestampColumnVector(org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector) MultiValuedColumnVector(org.apache.hadoop.hive.ql.exec.vector.MultiValuedColumnVector) UnionColumnVector(org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector) IntervalDayTimeColumnVector(org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) 
FieldType(org.apache.arrow.vector.types.pojo.FieldType)

Aggregations

VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot)15 RowData (org.apache.flink.table.data.RowData)4 ByteArrayOutputStream (java.io.ByteArrayOutputStream)3 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 FieldVector (org.apache.arrow.vector.FieldVector)3 VarCharVector (org.apache.arrow.vector.VarCharVector)3 GenericRowData (org.apache.flink.table.data.GenericRowData)3 Test (org.junit.Test)3 Float8Vector (org.apache.arrow.vector.Float8Vector)2 IntVector (org.apache.arrow.vector.IntVector)2 ArrowStreamWriter (org.apache.arrow.vector.ipc.ArrowStreamWriter)2 BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector)2 ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector)2 DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector)2 DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)2 IntervalDayTimeColumnVector (org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector)2 ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector)2 LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)2 MapColumnVector (org.apache.hadoop.hive.ql.exec.vector.MapColumnVector)2