
Example 1 with VectorSchemaRoot

Use of org.apache.arrow.vector.VectorSchemaRoot in project flink by apache.

In the class ArrowUtils, method collectAsPandasDataFrame:

/**
 * Convert Flink table to Pandas DataFrame.
 */
public static CustomIterator<byte[]> collectAsPandasDataFrame(Table table, int maxArrowBatchSize) throws Exception {
    checkArrowUsable();
    BufferAllocator allocator = getRootAllocator().newChildAllocator("collectAsPandasDataFrame", 0, Long.MAX_VALUE);
    RowType rowType = (RowType) table.getResolvedSchema().toSourceRowDataType().getLogicalType();
    DataType defaultRowDataType = TypeConversions.fromLogicalToDataType(rowType);
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ArrowStreamWriter arrowStreamWriter = new ArrowStreamWriter(root, null, baos);
    arrowStreamWriter.start();
    Iterator<Row> results = table.execute().collect();
    Iterator<Row> appendOnlyResults;
    if (isAppendOnlyTable(table)) {
        appendOnlyResults = results;
    } else {
        appendOnlyResults = filterOutRetractRows(results);
    }
    ArrowWriter<RowData> arrowWriter = createRowDataArrowWriter(root, rowType);
    Iterator<RowData> convertedResults = new Iterator<RowData>() {

        @Override
        public boolean hasNext() {
            return appendOnlyResults.hasNext();
        }

        @Override
        public RowData next() {
            DataFormatConverters.DataFormatConverter converter = DataFormatConverters.getConverterForDataType(defaultRowDataType);
            return (RowData) converter.toInternal(appendOnlyResults.next());
        }
    };
    return new CustomIterator<byte[]>() {

        @Override
        public boolean hasNext() {
            return convertedResults.hasNext();
        }

        @Override
        public byte[] next() {
            try {
                int i = 0;
                while (convertedResults.hasNext() && i < maxArrowBatchSize) {
                    i++;
                    arrowWriter.write(convertedResults.next());
                }
                arrowWriter.finish();
                arrowStreamWriter.writeBatch();
                return baos.toByteArray();
            } catch (Throwable t) {
                String msg = "Failed to serialize the data of the table";
                LOG.error(msg, t);
                throw new RuntimeException(msg, t);
            } finally {
                arrowWriter.reset();
                baos.reset();
                if (!hasNext()) {
                    root.close();
                    allocator.close();
                }
            }
        }
    };
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), RowType (org.apache.flink.table.types.logical.RowType), ByteArrayOutputStream (java.io.ByteArrayOutputStream), ArrowStreamWriter (org.apache.arrow.vector.ipc.ArrowStreamWriter), BufferAllocator (org.apache.arrow.memory.BufferAllocator), RowData (org.apache.flink.table.data.RowData), DataFormatConverters (org.apache.flink.table.data.util.DataFormatConverters), Iterator (java.util.Iterator), DataType (org.apache.flink.table.types.DataType), Row (org.apache.flink.types.Row)
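
The byte[] chunks produced by this iterator form one continuous Arrow stream: start() writes the schema into the buffer before the first chunk is returned, and each later chunk carries exactly one record batch because baos is reset after every call. Below is a minimal sketch, not part of the Flink source, of how those chunks could be stitched back together and read on the Java side with Arrow's ArrowStreamReader; the class and method names are illustrative only.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Iterator;

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamReader;

public final class PandasDataFrameBatchReader {

    /** Concatenates all serialized chunks and replays them as a single Arrow stream. */
    public static void readBack(Iterator<byte[]> serializedChunks) throws IOException {
        ByteArrayOutputStream stream = new ByteArrayOutputStream();
        while (serializedChunks.hasNext()) {
            // Each chunk is appended as-is; together they form schema + record batches.
            stream.write(serializedChunks.next());
        }
        try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE);
                ArrowStreamReader reader = new ArrowStreamReader(
                        new ByteArrayInputStream(stream.toByteArray()), allocator)) {
            VectorSchemaRoot root = reader.getVectorSchemaRoot();
            while (reader.loadNextBatch()) {
                System.out.println("Read a batch with " + root.getRowCount() + " rows");
            }
        }
    }
}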

Example 2 with VectorSchemaRoot

Use of org.apache.arrow.vector.VectorSchemaRoot in project flink by apache.

In the class ArrowSerializer, method load:

public int load() throws IOException {
    arrowStreamReader.loadNextBatch();
    VectorSchemaRoot root = arrowStreamReader.getVectorSchemaRoot();
    if (arrowReader == null) {
        arrowReader = createArrowReader(root);
    }
    return root.getRowCount();
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot)
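
load() relies on Arrow's streaming reader contract: loadNextBatch() pulls the next record batch into the reader's reusable VectorSchemaRoot, and getRowCount() then reports how many rows that batch holds. A minimal self-contained sketch of the same pattern follows; the class name ArrowBatchLoader and the end-of-stream handling are illustrative and not part of Flink's ArrowSerializer.

import java.io.IOException;
import java.io.InputStream;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamReader;

final class ArrowBatchLoader {

    private final ArrowStreamReader arrowStreamReader;

    ArrowBatchLoader(InputStream in, BufferAllocator allocator) {
        this.arrowStreamReader = new ArrowStreamReader(in, allocator);
    }

    /** Advances to the next batch and reports how many rows it holds, or 0 at end of stream. */
    int load() throws IOException {
        if (!arrowStreamReader.loadNextBatch()) {
            return 0; // nothing left in the stream
        }
        VectorSchemaRoot root = arrowStreamReader.getVectorSchemaRoot();
        return root.getRowCount();
    }
}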

Example 3 with VectorSchemaRoot

Use of org.apache.arrow.vector.VectorSchemaRoot in project flink by apache.

In the class ArrowReaderWriterTest, method createArrowWriter:

@Override
public Tuple2<ArrowWriter<RowData>, ArrowStreamWriter> createArrowWriter(OutputStream outputStream) throws IOException {
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ArrowWriter<RowData> arrowWriter = ArrowUtils.createRowDataArrowWriter(root, rowType);
    ArrowStreamWriter arrowStreamWriter = new ArrowStreamWriter(root, null, outputStream);
    arrowStreamWriter.start();
    return Tuple2.of(arrowWriter, arrowStreamWriter);
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), GenericRowData (org.apache.flink.table.data.GenericRowData), RowData (org.apache.flink.table.data.RowData), BinaryRowData (org.apache.flink.table.data.binary.BinaryRowData), ArrowStreamWriter (org.apache.arrow.vector.ipc.ArrowStreamWriter)
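
A hedged sketch of how the returned pair would typically be driven: rows are buffered through the ArrowWriter, finish() seals the vectors, and writeBatch() flushes one Arrow record batch to the output stream. The helper name writeOneBatch and its parameters are illustrative, not part of the test.

import java.io.IOException;
import java.util.List;

import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.arrow.ArrowWriter;

final class ArrowWriterUsage {

    static void writeOneBatch(
            Tuple2<ArrowWriter<RowData>, ArrowStreamWriter> writers, List<RowData> rows)
            throws IOException {
        for (RowData row : rows) {
            writers.f0.write(row); // buffer each row into the Arrow vectors
        }
        writers.f0.finish();       // set the value count on every vector
        writers.f1.writeBatch();   // emit one record batch to the underlying stream
        writers.f1.end();          // write the end-of-stream marker
    }
}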

Example 4 with VectorSchemaRoot

Use of org.apache.arrow.vector.VectorSchemaRoot in project flink by apache.

In the class ArrowUtilsTest, method testCreateArrowReader:

@Test
public void testCreateArrowReader() {
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ArrowReader reader = ArrowUtils.createArrowReader(root, rowType);
    ColumnVector[] columnVectors = reader.getColumnVectors();
    for (int i = 0; i < columnVectors.length; i++) {
        assertEquals(testFields.get(i).f4, columnVectors[i].getClass());
    }
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), ArrowDecimalColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowDecimalColumnVector), ArrowBigIntColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowBigIntColumnVector), ArrowVarBinaryColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowVarBinaryColumnVector), ArrowVarCharColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowVarCharColumnVector), ArrowTimestampColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowTimestampColumnVector), ColumnVector (org.apache.flink.table.data.columnar.vector.ColumnVector), ArrowIntColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowIntColumnVector), ArrowRowColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowRowColumnVector), ArrowSmallIntColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowSmallIntColumnVector), ArrowFloatColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowFloatColumnVector), ArrowDoubleColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowDoubleColumnVector), ArrowTinyIntColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowTinyIntColumnVector), ArrowArrayColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowArrayColumnVector), ArrowDateColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowDateColumnVector), ArrowBooleanColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowBooleanColumnVector), ArrowTimeColumnVector (org.apache.flink.table.runtime.arrow.vectors.ArrowTimeColumnVector), Test (org.junit.Test)
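
Once a record batch has been loaded into the VectorSchemaRoot, the reader created here exposes the data row by row. The sketch below shows the assumed consumption pattern, with an illustrative helper name readAllRows; it assumes ArrowReader.read(int) returns a columnar RowData view of a single row.

import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.arrow.ArrowReader;

final class ArrowReaderUsage {

    static void readAllRows(ArrowReader reader, VectorSchemaRoot root) {
        // The row count comes from the root after a batch has been loaded into it.
        for (int i = 0; i < root.getRowCount(); i++) {
            RowData row = reader.read(i); // a view over row i of the Arrow vectors
            System.out.println("field 0 is null? " + row.isNullAt(0));
        }
    }
}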

Example 5 with VectorSchemaRoot

Use of org.apache.arrow.vector.VectorSchemaRoot in project beam by apache.

In the class ArrowConversion, method rowsFromSerializedRecordBatch:

@SuppressWarnings("nullness")
public static RecordBatchRowIterator rowsFromSerializedRecordBatch(org.apache.arrow.vector.types.pojo.Schema arrowSchema, InputStream inputStream, RootAllocator allocator) throws IOException {
    VectorSchemaRoot vectorRoot = VectorSchemaRoot.create(arrowSchema, allocator);
    VectorLoader vectorLoader = new VectorLoader(vectorRoot);
    vectorRoot.clear();
    try (ReadChannel read = new ReadChannel(Channels.newChannel(inputStream))) {
        try (ArrowRecordBatch arrowMessage = MessageSerializer.deserializeRecordBatch(read, allocator)) {
            vectorLoader.load(arrowMessage);
        }
    }
    return rowsFromRecordBatch(ArrowSchemaTranslator.toBeamSchema(arrowSchema), vectorRoot);
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), VectorLoader (org.apache.arrow.vector.VectorLoader), ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch), ReadChannel (org.apache.arrow.vector.ipc.ReadChannel)
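
This method expects the input stream to contain a single serialized Arrow record batch with no stream-level schema header: the schema is supplied separately as a parameter, and the batch itself is read with MessageSerializer.deserializeRecordBatch. A minimal sketch of the producing side follows, using VectorUnloader and MessageSerializer from Arrow's public API; the class and method names are illustrative and not taken from the Beam source.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;

import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.VectorUnloader;
import org.apache.arrow.vector.ipc.WriteChannel;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
import org.apache.arrow.vector.ipc.message.MessageSerializer;

final class RecordBatchSerialization {

    /** Serializes the current contents of a populated VectorSchemaRoot as one record batch. */
    static byte[] serializeRecordBatch(VectorSchemaRoot populatedRoot) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (ArrowRecordBatch batch = new VectorUnloader(populatedRoot).getRecordBatch();
                WriteChannel channel = new WriteChannel(Channels.newChannel(out))) {
            // Writes the record batch message header and body, but no schema message.
            MessageSerializer.serialize(channel, batch);
        }
        return out.toByteArray();
    }
}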

Aggregations

VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot): 15 usages
RowData (org.apache.flink.table.data.RowData): 4 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 3 usages
IOException (java.io.IOException): 3 usages
ArrayList (java.util.ArrayList): 3 usages
FieldVector (org.apache.arrow.vector.FieldVector): 3 usages
VarCharVector (org.apache.arrow.vector.VarCharVector): 3 usages
GenericRowData (org.apache.flink.table.data.GenericRowData): 3 usages
Test (org.junit.Test): 3 usages
Float8Vector (org.apache.arrow.vector.Float8Vector): 2 usages
IntVector (org.apache.arrow.vector.IntVector): 2 usages
ArrowStreamWriter (org.apache.arrow.vector.ipc.ArrowStreamWriter): 2 usages
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 2 usages
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector): 2 usages
DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector): 2 usages
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector): 2 usages
IntervalDayTimeColumnVector (org.apache.hadoop.hive.ql.exec.vector.IntervalDayTimeColumnVector): 2 usages
ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector): 2 usages
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 2 usages
MapColumnVector (org.apache.hadoop.hive.ql.exec.vector.MapColumnVector): 2 usages