Search in sources :

Example 1 with ArrowStreamWriter

use of org.apache.arrow.vector.ipc.ArrowStreamWriter in project flink by apache.

the class ArrowUtils method collectAsPandasDataFrame.

/**
 * Convert Flink table to Pandas DataFrame.
 *
 * <p>The table is executed and its rows are collected; unless the table is append-only,
 * retraction rows are filtered out. The rows are then serialized lazily: every call to
 * {@code next()} on the returned iterator writes at most {@code maxArrowBatchSize} rows as one
 * Arrow batch and returns the bytes accumulated in the shared buffer since the previous call.
 * The first returned array therefore also carries whatever {@code ArrowStreamWriter#start()}
 * wrote to the buffer (presumably the stream/schema header).
 *
 * <p>The Arrow root and its allocator are closed as soon as the last batch has been produced,
 * or when serializing a batch fails, so that a failing conversion does not leave them open.
 *
 * @param table the table to execute and collect
 * @param maxArrowBatchSize upper bound for the number of rows written into a single batch
 * @return an iterator over the serialized Arrow batches
 * @throws Exception if setting up or executing the collection fails
 */
public static CustomIterator<byte[]> collectAsPandasDataFrame(Table table, int maxArrowBatchSize) throws Exception {
    checkArrowUsable();
    // Resolve the types and the converter before allocating any Arrow memory, so that a failure
    // here cannot leave an open allocator behind.
    RowType rowType = (RowType) table.getResolvedSchema().toSourceRowDataType().getLogicalType();
    DataType defaultRowDataType = TypeConversions.fromLogicalToDataType(rowType);
    // The converter depends only on the row type, so it is looked up once instead of per row.
    DataFormatConverters.DataFormatConverter converter = DataFormatConverters.getConverterForDataType(defaultRowDataType);
    BufferAllocator allocator = getRootAllocator().newChildAllocator("collectAsPandasDataFrame", 0, Long.MAX_VALUE);
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ArrowStreamWriter arrowStreamWriter = new ArrowStreamWriter(root, null, baos);
    arrowStreamWriter.start();
    Iterator<Row> results = table.execute().collect();
    Iterator<Row> appendOnlyResults = isAppendOnlyTable(table) ? results : filterOutRetractRows(results);
    ArrowWriter<RowData> arrowWriter = createRowDataArrowWriter(root, rowType);
    // Lazily converts every external Row into the internal RowData representation.
    Iterator<RowData> convertedResults = new Iterator<RowData>() {

        @Override
        public boolean hasNext() {
            return appendOnlyResults.hasNext();
        }

        @Override
        public RowData next() {
            return (RowData) converter.toInternal(appendOnlyResults.next());
        }
    };
    return new CustomIterator<byte[]>() {

        // Set once root and allocator have been closed, so they are never closed twice.
        private boolean released;

        @Override
        public boolean hasNext() {
            return convertedResults.hasNext();
        }

        @Override
        public byte[] next() {
            boolean succeeded = false;
            try {
                int rowsInBatch = 0;
                while (convertedResults.hasNext() && rowsInBatch < maxArrowBatchSize) {
                    rowsInBatch++;
                    arrowWriter.write(convertedResults.next());
                }
                arrowWriter.finish();
                arrowStreamWriter.writeBatch();
                byte[] batch = baos.toByteArray();
                succeeded = true;
                return batch;
            } catch (Throwable t) {
                String msg = "Failed to serialize the data of the table";
                LOG.error(msg, t);
                throw new RuntimeException(msg, t);
            } finally {
                // Clear the writer and the buffer so the next batch starts from scratch.
                arrowWriter.reset();
                baos.reset();
                // Release the Arrow resources after the last batch, and also after a failure:
                // the caller is not expected to continue with a broken stream.
                if (!succeeded || !hasNext()) {
                    release();
                }
            }
        }

        /** Closes the Arrow root and its allocator exactly once. */
        private void release() {
            if (!released) {
                released = true;
                root.close();
                allocator.close();
            }
        }
    };
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) RowType(org.apache.flink.table.types.logical.RowType) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ArrowStreamWriter(org.apache.arrow.vector.ipc.ArrowStreamWriter) BufferAllocator(org.apache.arrow.memory.BufferAllocator) RowData(org.apache.flink.table.data.RowData) DataFormatConverters(org.apache.flink.table.data.util.DataFormatConverters) Iterator(java.util.Iterator) DataType(org.apache.flink.table.types.DataType) Row(org.apache.flink.types.Row)

Example 2 with ArrowStreamWriter

use of org.apache.arrow.vector.ipc.ArrowStreamWriter in project flink by apache.

the class ArrowReaderWriterTest method createArrowWriter.

/**
 * Builds the writer pair used by the test: a row-level {@code ArrowWriter} and the
 * {@code ArrowStreamWriter} that emits its batches to {@code outputStream}. Both share one
 * freshly created {@code VectorSchemaRoot}, and the stream writer is already started when
 * this method returns.
 *
 * @param outputStream destination of the Arrow stream
 * @return the row writer together with the started stream writer
 * @throws IOException if starting the stream writer fails
 */
@Override
public Tuple2<ArrowWriter<RowData>, ArrowStreamWriter> createArrowWriter(OutputStream outputStream) throws IOException {
    final VectorSchemaRoot schemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    final ArrowWriter<RowData> rowWriter = ArrowUtils.createRowDataArrowWriter(schemaRoot, rowType);
    final ArrowStreamWriter streamWriter = new ArrowStreamWriter(schemaRoot, null, outputStream);
    streamWriter.start();
    return Tuple2.of(rowWriter, streamWriter);
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) GenericRowData(org.apache.flink.table.data.GenericRowData) RowData(org.apache.flink.table.data.RowData) BinaryRowData(org.apache.flink.table.data.binary.BinaryRowData) ArrowStreamWriter(org.apache.arrow.vector.ipc.ArrowStreamWriter)

Example 3 with ArrowStreamWriter

use of org.apache.arrow.vector.ipc.ArrowStreamWriter in project hive by apache.

the class LlapArrowRecordWriter method write.

/**
 * Writes the Arrow batch carried by {@code value} to the output.
 *
 * <p>On the first call the stream writer is created from the schema root of {@code value},
 * and the allocator and root vector are remembered. On later calls only the row count of the
 * reused schema root is refreshed before the batch is written. {@code key} is not used.
 *
 * @param key ignored
 * @param value must be an {@code ArrowWrapperWritable}
 * @throws IOException if writing the batch fails
 */
@Override
public void write(K key, V value) throws IOException {
    final ArrowWrapperWritable wrapper = (ArrowWrapperWritable) value;
    if (arrowStreamWriter != null) {
        // We need to set the row count for the current vector
        // since root is reused by the stream writer.
        vectorSchemaRoot.setRowCount(rootVector.getValueCount());
    } else {
        // First batch: set up the stream writer and keep the shared Arrow objects.
        vectorSchemaRoot = wrapper.getVectorSchemaRoot();
        arrowStreamWriter = new ArrowStreamWriter(vectorSchemaRoot, null, out);
        allocator = wrapper.getAllocator();
        this.out.setAllocator(allocator);
        rootVector = wrapper.getRootVector();
    }
    arrowStreamWriter.writeBatch();
}
Also used : ArrowWrapperWritable(org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable) ArrowStreamWriter(org.apache.arrow.vector.ipc.ArrowStreamWriter)

Example 4 with ArrowStreamWriter

use of org.apache.arrow.vector.ipc.ArrowStreamWriter in project flink by apache.

the class ArrowSerializer method resetWriter.

/**
 * Replaces the current stream writer with a new one over the same root and output stream,
 * and starts it.
 *
 * @throws IOException if starting the new stream writer fails
 */
public void resetWriter() throws IOException {
    this.arrowStreamWriter = new ArrowStreamWriter(this.rootWriter, null, this.baos);
    this.arrowStreamWriter.start();
}
Also used : ArrowStreamWriter(org.apache.arrow.vector.ipc.ArrowStreamWriter)

Example 5 with ArrowStreamWriter

use of org.apache.arrow.vector.ipc.ArrowStreamWriter in project flink by apache.

the class ArrowSerializer method open.

/**
 * Initializes the serializer: remembers both streams, creates a child allocator, and sets up
 * the Arrow reader on {@code bais} and the Arrow writers on {@code baos}. The stream writer is
 * already started when this method returns.
 *
 * @param bais stream the Arrow data is read from
 * @param baos stream the Arrow data is written to
 * @throws Exception if any part of the setup fails
 */
public void open(InputStream bais, OutputStream baos) throws Exception {
    this.bais = bais;
    this.baos = baos;
    this.allocator = ArrowUtils.getRootAllocator().newChildAllocator("allocator", 0, Long.MAX_VALUE);

    // Reading side.
    this.arrowStreamReader = new ArrowStreamReader(this.bais, this.allocator);

    // Writing side: the root is created before the row-level writer is built.
    this.rootWriter = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(this.inputType), this.allocator);
    this.arrowWriter = createArrowWriter();
    this.arrowStreamWriter = new ArrowStreamWriter(this.rootWriter, null, this.baos);
    this.arrowStreamWriter.start();
}
Also used : ArrowStreamReader(org.apache.arrow.vector.ipc.ArrowStreamReader) ArrowStreamWriter(org.apache.arrow.vector.ipc.ArrowStreamWriter)

Aggregations

ArrowStreamWriter (org.apache.arrow.vector.ipc.ArrowStreamWriter)8 RowData (org.apache.flink.table.data.RowData)5 ByteArrayOutputStream (java.io.ByteArrayOutputStream)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot)3 ArrayList (java.util.ArrayList)2 List (java.util.List)2 GenericRowData (org.apache.flink.table.data.GenericRowData)2 Test (org.junit.Test)2 IOException (java.io.IOException)1 Iterator (java.util.Iterator)1 BufferAllocator (org.apache.arrow.memory.BufferAllocator)1 ArrowStreamReader (org.apache.arrow.vector.ipc.ArrowStreamReader)1 RuntimeContext (org.apache.flink.api.common.functions.RuntimeContext)1 BinaryRowData (org.apache.flink.table.data.binary.BinaryRowData)1 DataFormatConverters (org.apache.flink.table.data.util.DataFormatConverters)1 DataType (org.apache.flink.table.types.DataType)1 RowType (org.apache.flink.table.types.logical.RowType)1 Row (org.apache.flink.types.Row)1 ArrowWrapperWritable (org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable)1