Use of org.apache.arrow.vector.VectorSchemaRoot in project beam by apache.
The class BigQueryIOStorageReadTest, method createResponseArrow.
private ReadRowsResponse createResponseArrow(
    org.apache.arrow.vector.types.pojo.Schema arrowSchema, List<String> name,
    List<Long> number, double progressAtResponseStart, double progressAtResponseEnd) {
  // Note: ArrowRecordBatch here is the BigQuery Storage proto; the Arrow IPC class of
  // the same name is fully qualified below to avoid the clash.
  ArrowRecordBatch serializedRecord;
  try (VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(arrowSchema, allocator)) {
    schemaRoot.allocateNew();
    schemaRoot.setRowCount(name.size());
    VarCharVector strVector = (VarCharVector) schemaRoot.getFieldVectors().get(0);
    BigIntVector bigIntVector = (BigIntVector) schemaRoot.getFieldVectors().get(1);
    for (int i = 0; i < name.size(); i++) {
      bigIntVector.set(i, number.get(i));
      strVector.set(i, new Text(name.get(i)));
    }
    // Unload the populated vectors into an Arrow IPC record batch and serialize it.
    VectorUnloader unLoader = new VectorUnloader(schemaRoot);
    try (org.apache.arrow.vector.ipc.message.ArrowRecordBatch records = unLoader.getRecordBatch()) {
      try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
        MessageSerializer.serialize(new WriteChannel(Channels.newChannel(os)), records);
        serializedRecord = ArrowRecordBatch.newBuilder()
            .setRowCount(records.getLength())
            .setSerializedRecordBatch(ByteString.copyFrom(os.toByteArray()))
            .build();
      } catch (IOException e) {
        throw new RuntimeException("Error writing to byte array output stream", e);
      }
    }
  }
  return ReadRowsResponse.newBuilder()
      .setArrowRecordBatch(serializedRecord)
      .setRowCount(name.size())
      .setStats(StreamStats.newBuilder().setProgress(Progress.newBuilder()
          .setAtResponseStart(progressAtResponseStart)
          .setAtResponseEnd(progressAtResponseEnd)))
      .build();
}
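For context, a minimal sketch (not part of the Beam test itself) of the inverse operation: decoding the serialized batch in the response back into Arrow vectors with MessageSerializer.deserializeRecordBatch, org.apache.arrow.vector.ipc.ReadChannel, and org.apache.arrow.vector.VectorLoader. The variables response, arrowSchema, and allocator are assumed to be in scope as in the test above.

try (VectorSchemaRoot root = VectorSchemaRoot.create(arrowSchema, allocator)) {
  byte[] bytes = response.getArrowRecordBatch().getSerializedRecordBatch().toByteArray();
  try (org.apache.arrow.vector.ipc.message.ArrowRecordBatch batch =
      MessageSerializer.deserializeRecordBatch(
          new ReadChannel(Channels.newChannel(new ByteArrayInputStream(bytes))), allocator)) {
    new VectorLoader(root).load(batch); // repopulate the vectors from the IPC message
    System.out.println(root.contentToTSVString());
  } catch (IOException e) {
    throw new RuntimeException("Error deserializing record batch", e);
  }
}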
Use of org.apache.arrow.vector.VectorSchemaRoot in project textdb by TextDB.
The class PythonUDFOpExec, method writeArrowStream.
/**
 * For every batch, the operator converts a list of {@code Tuple}s into Arrow stream data in
 * almost exactly the same way as it would when writing an Arrow file, except that it now sends
 * the stream to the server with
 * {@link FlightClient#startPut(FlightDescriptor, VectorSchemaRoot, FlightClient.PutListener, CallOption...)}
 * and {@link FlightClient.ClientStreamListener#putNext()}. The server uses {@code do_put()} to
 * receive the data stream, converts it into a {@code pyarrow.Table}, and stores it on the server.
 * {@code startPut} is a non-blocking call, but this method as a whole blocks until all the
 * data have been sent.
 *
 * @param client      The FlightClient that manages this.
 * @param values      The input queue that holds tuples; its contents are consumed by this method.
 * @param arrowSchema The Arrow schema of the input table. This should already have been defined (converted).
 * @param channel     The predefined path that specifies where to store the data on the Flight server.
 * @param chunkSize   The chunk size of the Arrow stream. This is different from the batch size of
 *                    the operator, although they may seem similar. It does not affect serialization
 *                    speed much, so in general it can be the same as {@code batchSize}.
 */
private void writeArrowStream(FlightClient client, Queue<Tuple> values,
    org.apache.arrow.vector.types.pojo.Schema arrowSchema, Channel channel, int chunkSize)
    throws RuntimeException {
  SyncPutListener flightListener = new SyncPutListener();
  VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(arrowSchema, PythonUDFOpExec.memoryAllocator);
  FlightClient.ClientStreamListener streamWriter = client.startPut(
      FlightDescriptor.path(Collections.singletonList(channel.name)), schemaRoot, flightListener);
  try {
    // Drain the queue in chunks of up to chunkSize rows; each putNext() call ships the
    // current contents of schemaRoot to the server.
    while (!values.isEmpty()) {
      schemaRoot.allocateNew();
      while (schemaRoot.getRowCount() < chunkSize && !values.isEmpty()) {
        ArrowUtils.appendTexeraTuple(values.remove(), schemaRoot);
      }
      streamWriter.putNext();
      schemaRoot.clear();
    }
    streamWriter.completed();
    // Block until the server acknowledges the put.
    flightListener.getResult();
    flightListener.close();
    schemaRoot.clear();
  } catch (Exception e) {
    e.printStackTrace();
    closeAndThrow(client, e);
  }
}
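As a complement, a minimal sketch (not from the textdb codebase) of the consuming side: pulling the stored table back out with a Flight GET via FlightClient.getStream. Using channel.name as the Ticket payload mirrors the descriptor path above and is an assumption here; java.nio.charset.StandardCharsets is used for the encoding.

try (FlightStream stream = client.getStream(
    new Ticket(channel.name.getBytes(StandardCharsets.UTF_8)))) {
  VectorSchemaRoot root = stream.getRoot(); // the same root is reused across chunks
  while (stream.next()) {
    // Each next() call refills `root` with one chunk of rows.
    System.out.println("received " + root.getRowCount() + " rows");
  }
} catch (Exception e) {
  throw new RuntimeException(e);
}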
Use of org.apache.arrow.vector.VectorSchemaRoot in project hive by apache.
The class LlapArrowBatchRecordReader, method next.
@Override
public boolean next(NullWritable key, ArrowWrapperWritable value) throws IOException {
  try {
    // Need a way to know which thread to interrupt, since this call can block.
    setReaderThread(Thread.currentThread());
    boolean hasInput = arrowStreamReader.loadNextBatch();
    if (hasInput) {
      VectorSchemaRoot vectorSchemaRoot = arrowStreamReader.getVectorSchemaRoot();
      // There must be at least one column vector.
      Preconditions.checkState(vectorSchemaRoot.getFieldVectors().size() > 0);
      // We should continue even if the FieldVectors are empty; the next read might have
      // the data. We stop only when loadNextBatch returns false.
      value.setVectorSchemaRoot(vectorSchemaRoot);
      return true;
    } else {
      processReaderEvent();
      return false;
    }
  } catch (IOException io) {
    failOnInterruption(io);
    return false;
  }
}
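For orientation, a hedged sketch of how a caller might drive this reader. The variable reader is assumed to be an initialized LlapArrowBatchRecordReader; createKey(), createValue(), and close() come from the standard Hadoop RecordReader contract.

NullWritable key = reader.createKey();
ArrowWrapperWritable value = reader.createValue();
while (reader.next(key, value)) {
  // One Arrow batch per successful next() call.
  VectorSchemaRoot root = value.getVectorSchemaRoot();
  System.out.println("batch rows: " + root.getRowCount());
}
reader.close();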
Use of org.apache.arrow.vector.VectorSchemaRoot in project hive by apache.
The class Deserializer, method deserialize.
public Object deserialize(Writable writable) {
  final ArrowWrapperWritable arrowWrapperWritable = (ArrowWrapperWritable) writable;
  final VectorSchemaRoot vectorSchemaRoot = arrowWrapperWritable.getVectorSchemaRoot();
  final List<FieldVector> fieldVectors = vectorSchemaRoot.getFieldVectors();
  final int fieldCount = fieldVectors.size();
  final int rowCount = fieldVectors.get(0).getValueCount();
  vectorizedRowBatch.ensureSize(rowCount);
  // Lazily (re)allocate the row buffer when the batch grows.
  if (rows == null || rows.length < rowCount) {
    rows = new Object[rowCount][];
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
      rows[rowIndex] = new Object[fieldCount];
    }
  }
  // Copy each Arrow FieldVector into the corresponding Hive ColumnVector.
  for (int fieldIndex = 0; fieldIndex < fieldCount; fieldIndex++) {
    final FieldVector fieldVector = fieldVectors.get(fieldIndex);
    final int projectedCol = vectorizedRowBatch.projectedColumns[fieldIndex];
    final ColumnVector columnVector = vectorizedRowBatch.cols[projectedCol];
    final TypeInfo typeInfo = serDe.rowTypeInfo.getAllStructFieldTypeInfos().get(fieldIndex);
    read(fieldVector, columnVector, typeInfo);
  }
  // Materialize each row as an Object[] from the vectorized batch.
  for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
    vectorExtractRow.extractRow(vectorizedRowBatch, rowIndex, rows[rowIndex]);
  }
  vectorizedRowBatch.reset();
  return rows;
}
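The per-column copy in read(fieldVector, columnVector, typeInfo) dispatches on the type. As an illustration only, here is a minimal sketch of what that copy could look like for a single BIGINT column; the real Hive implementation also handles nulls for complex types, decimals, strings, and selected-row mapping.

// Illustrative only; assumes the column is BIGINT.
BigIntVector arrow = (BigIntVector) fieldVector;
LongColumnVector hive = (LongColumnVector) columnVector;
for (int i = 0; i < arrow.getValueCount(); i++) {
  if (arrow.isNull(i)) {
    hive.noNulls = false;
    hive.isNull[i] = true;
  } else {
    hive.vector[i] = arrow.get(i);
  }
}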
Use of org.apache.arrow.vector.VectorSchemaRoot in project hive by apache.
The class Serializer, method serializeBatch.
// Used for both:
// 1. a VectorizedRowBatch constructed by batching rows
// 2. a VectorizedRowBatch provided from upstream (isNative)
public ArrowWrapperWritable serializeBatch(VectorizedRowBatch vectorizedRowBatch, boolean isNative) {
  rootVector.setValueCount(0);
  for (int fieldIndex = 0; fieldIndex < vectorizedRowBatch.projectionSize; fieldIndex++) {
    final int projectedColumn = vectorizedRowBatch.projectedColumns[fieldIndex];
    final ColumnVector hiveVector = vectorizedRowBatch.cols[projectedColumn];
    final TypeInfo fieldTypeInfo = fieldTypeInfos.get(fieldIndex);
    final String fieldName = fieldNames.get(fieldIndex);
    final FieldType fieldType = toFieldType(fieldTypeInfo);
    // Reuse existing FieldVector buffers, since we always call setValue or setNull for each row.
    final boolean fieldExists = rootVector.getChild(fieldName) != null;
    final FieldVector arrowVector = rootVector.addOrGet(fieldName, fieldType, FieldVector.class);
    if (fieldExists) {
      arrowVector.setValueCount(isNative ? vectorizedRowBatch.size : batchSize);
    } else {
      arrowVector.setInitialCapacity(isNative ? vectorizedRowBatch.size : batchSize);
      arrowVector.allocateNew();
    }
    write(arrowVector, hiveVector, fieldTypeInfo, isNative ? vectorizedRowBatch.size : batchSize,
        vectorizedRowBatch, isNative);
  }
  if (!isNative) {
    // Only mutate batches that are constructed by this serde.
    vectorizedRowBatch.reset();
    rootVector.setValueCount(batchSize);
  } else {
    rootVector.setValueCount(vectorizedRowBatch.size);
  }
  batchSize = 0;
  VectorSchemaRoot vectorSchemaRoot = new VectorSchemaRoot(rootVector);
  return new ArrowWrapperWritable(vectorSchemaRoot, allocator, rootVector);
}
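A minimal follow-on sketch (not part of the Hive serde) of one way the returned root could be written out as an Arrow IPC stream with org.apache.arrow.vector.ipc.ArrowStreamWriter. The variables serializer, vectorizedRowBatch, and out (an OutputStream) are assumed here.

ArrowWrapperWritable writable = serializer.serializeBatch(vectorizedRowBatch, true);
VectorSchemaRoot root = writable.getVectorSchemaRoot();
try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null /* no dictionaries */, out)) {
  writer.start();      // writes the schema message
  writer.writeBatch(); // writes the current contents of `root`
  writer.end();        // writes the end-of-stream marker
} catch (IOException e) {
  throw new RuntimeException(e);
}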