Use of org.apache.arrow.vector.VectorSchemaRoot in the Apache Flink project: class ArrowUtils, method collectAsPandasDataFrame.
/**
 * Converts the given Flink table into an iterator of serialized Arrow record batches,
 * suitable for assembling a Pandas DataFrame on the Python side.
 *
 * <p>For non-append-only tables the change log contains retraction messages; those are
 * filtered out so only the final rows are serialized.
 *
 * @param table the table to collect
 * @param maxArrowBatchSize the maximum number of rows per Arrow batch
 * @return an iterator over serialized Arrow batches; Arrow resources are released after
 *     the last batch has been produced
 */
public static CustomIterator<byte[]> collectAsPandasDataFrame(Table table, int maxArrowBatchSize) throws Exception {
    checkArrowUsable();
    BufferAllocator allocator = getRootAllocator().newChildAllocator("collectAsPandasDataFrame", 0, Long.MAX_VALUE);
    RowType rowType = (RowType) table.getResolvedSchema().toSourceRowDataType().getLogicalType();
    DataType defaultRowDataType = TypeConversions.fromLogicalToDataType(rowType);
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ArrowStreamWriter arrowStreamWriter = new ArrowStreamWriter(root, null, baos);
    arrowStreamWriter.start();
    Iterator<Row> results = table.execute().collect();
    // Keep only the rows that survive retractions for non-append-only tables.
    Iterator<Row> appendOnlyResults = isAppendOnlyTable(table) ? results : filterOutRetractRows(results);
    ArrowWriter<RowData> arrowWriter = createRowDataArrowWriter(root, rowType);
    // Created once instead of per row: the converter depends only on the row data type.
    DataFormatConverters.DataFormatConverter converter =
            DataFormatConverters.getConverterForDataType(defaultRowDataType);
    Iterator<RowData> convertedResults = new Iterator<RowData>() {
        @Override
        public boolean hasNext() {
            return appendOnlyResults.hasNext();
        }

        @Override
        public RowData next() {
            return (RowData) converter.toInternal(appendOnlyResults.next());
        }
    };
    return new CustomIterator<byte[]>() {
        @Override
        public boolean hasNext() {
            return convertedResults.hasNext();
        }

        @Override
        public byte[] next() {
            try {
                // Fill one Arrow batch with up to maxArrowBatchSize rows.
                int rowCount = 0;
                while (convertedResults.hasNext() && rowCount < maxArrowBatchSize) {
                    rowCount++;
                    arrowWriter.write(convertedResults.next());
                }
                arrowWriter.finish();
                arrowStreamWriter.writeBatch();
                return baos.toByteArray();
            } catch (Throwable t) {
                String msg = "Failed to serialize the data of the table";
                LOG.error(msg, t);
                throw new RuntimeException(msg, t);
            } finally {
                arrowWriter.reset();
                baos.reset();
                if (!hasNext()) {
                    // Last batch emitted: terminate the Arrow stream and release the
                    // direct memory held by the schema root and the child allocator.
                    try {
                        arrowStreamWriter.end();
                    } catch (Throwable t) {
                        LOG.warn("Failed to end the Arrow stream writer", t);
                    }
                    root.close();
                    allocator.close();
                }
            }
        }
    };
}
Use of org.apache.arrow.vector.VectorSchemaRoot in the Apache Flink project: class ArrowSerializer, method load.
/**
 * Loads the next Arrow record batch from the stream and returns its row count.
 *
 * @return the number of rows in the batch that was just loaded
 * @throws IOException if reading the next batch from the stream fails
 */
public int load() throws IOException {
    // Pull the next record batch into the stream reader's schema root.
    arrowStreamReader.loadNextBatch();
    final VectorSchemaRoot schemaRoot = arrowStreamReader.getVectorSchemaRoot();
    // Lazily construct the reader the first time a batch becomes available.
    if (arrowReader == null) {
        arrowReader = createArrowReader(schemaRoot);
    }
    return schemaRoot.getRowCount();
}
Use of org.apache.arrow.vector.VectorSchemaRoot in the Apache Flink project: class ArrowReaderWriterTest, method createArrowWriter.
/**
 * Creates an Arrow writer pair for the test row type, targeting the given stream.
 * The schema root is intentionally not closed here: it backs both returned writers.
 */
@Override
public Tuple2<ArrowWriter<RowData>, ArrowStreamWriter> createArrowWriter(OutputStream outputStream) throws IOException {
    final VectorSchemaRoot schemaRoot =
            VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    final ArrowWriter<RowData> rowDataWriter = ArrowUtils.createRowDataArrowWriter(schemaRoot, rowType);
    final ArrowStreamWriter streamWriter = new ArrowStreamWriter(schemaRoot, null, outputStream);
    // Emit the stream header before any batches are written.
    streamWriter.start();
    return Tuple2.of(rowDataWriter, streamWriter);
}
Use of org.apache.arrow.vector.VectorSchemaRoot in the Apache Flink project: class ArrowUtilsTest, method testCreateArrowReader.
/**
 * Verifies that {@link ArrowUtils#createArrowReader} produces one column vector of the
 * expected concrete class (the {@code f4} component of each test field) per column.
 */
@Test
public void testCreateArrowReader() {
    // try-with-resources: VectorSchemaRoot holds direct memory and must be closed
    // even when an assertion fails; the original leaked it.
    try (VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator)) {
        ArrowReader reader = ArrowUtils.createArrowReader(root, rowType);
        ColumnVector[] columnVectors = reader.getColumnVectors();
        for (int i = 0; i < columnVectors.length; i++) {
            assertEquals(testFields.get(i).f4, columnVectors[i].getClass());
        }
    }
}
Use of org.apache.arrow.vector.VectorSchemaRoot in the Apache Beam project: class ArrowConversion, method rowsFromSerializedRecordBatch.
/**
 * Deserializes a single Arrow record batch from {@code inputStream} and returns an
 * iterator over its rows as Beam {@code Row}s.
 *
 * <p>On success, ownership of the backing {@link VectorSchemaRoot} passes to the
 * returned iterator. On failure the root is closed here so its direct memory is not
 * leaked (the original leaked it when deserialization threw).
 *
 * @param arrowSchema the Arrow schema describing the batch
 * @param inputStream the stream containing one serialized record batch
 * @param allocator the allocator used for the deserialized vectors
 * @throws IOException if reading or deserializing the record batch fails
 */
@SuppressWarnings("nullness")
public static RecordBatchRowIterator rowsFromSerializedRecordBatch(org.apache.arrow.vector.types.pojo.Schema arrowSchema, InputStream inputStream, RootAllocator allocator) throws IOException {
    VectorSchemaRoot vectorRoot = VectorSchemaRoot.create(arrowSchema, allocator);
    try {
        VectorLoader vectorLoader = new VectorLoader(vectorRoot);
        vectorRoot.clear();
        try (ReadChannel read = new ReadChannel(Channels.newChannel(inputStream))) {
            try (ArrowRecordBatch arrowMessage = MessageSerializer.deserializeRecordBatch(read, allocator)) {
                vectorLoader.load(arrowMessage);
            }
        }
        return rowsFromRecordBatch(ArrowSchemaTranslator.toBeamSchema(arrowSchema), vectorRoot);
    } catch (RuntimeException | IOException e) {
        // Release the root's direct memory if anything fails before ownership
        // is handed to the returned iterator.
        vectorRoot.close();
        throw e;
    }
}
Aggregations