Example 1 with ArrowRecordBatch

Use of org.apache.arrow.vector.ipc.message.ArrowRecordBatch in project beam by apache.

From the class ArrowConversion, the method rowsFromSerializedRecordBatch:

@SuppressWarnings("nullness")
public static RecordBatchRowIterator rowsFromSerializedRecordBatch(org.apache.arrow.vector.types.pojo.Schema arrowSchema, InputStream inputStream, RootAllocator allocator) throws IOException {
    // Create an empty VectorSchemaRoot for the given Arrow schema.
    VectorSchemaRoot vectorRoot = VectorSchemaRoot.create(arrowSchema, allocator);
    VectorLoader vectorLoader = new VectorLoader(vectorRoot);
    vectorRoot.clear();
    // Deserialize a single record batch from the stream and load its buffers into the root.
    try (ReadChannel read = new ReadChannel(Channels.newChannel(inputStream))) {
        try (ArrowRecordBatch arrowMessage = MessageSerializer.deserializeRecordBatch(read, allocator)) {
            vectorLoader.load(arrowMessage);
        }
    }
    // Expose the loaded vectors as Beam Rows under the translated Beam schema.
    return rowsFromRecordBatch(ArrowSchemaTranslator.toBeamSchema(arrowSchema), vectorRoot);
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), VectorLoader (org.apache.arrow.vector.VectorLoader), ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch), ReadChannel (org.apache.arrow.vector.ipc.ReadChannel)
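
For context, a caller can produce the serialized bytes with Arrow's own IPC utilities. The sketch below is a minimal, hypothetical round trip, not taken from the Beam sources: it assumes an already-populated VectorSchemaRoot, serializes one record batch with VectorUnloader and WriteChannel, and assumes RecordBatchRowIterator is closeable as the Beam class declares.

// Hypothetical round trip: serialize one batch, then read it back
// through rowsFromSerializedRecordBatch.
static void exampleRoundTrip(VectorSchemaRoot populatedRoot, RootAllocator allocator) throws Exception {
    // Snapshot the root's buffers as a record batch and write it in IPC form.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (ArrowRecordBatch batch = new VectorUnloader(populatedRoot).getRecordBatch()) {
        MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), batch);
    }
    // Feed the serialized bytes back through the Beam helper and iterate the rows.
    try (ArrowConversion.RecordBatchRowIterator rows = ArrowConversion.rowsFromSerializedRecordBatch(
            populatedRoot.getSchema(), new ByteArrayInputStream(out.toByteArray()), allocator)) {
        rows.forEachRemaining(row -> System.out.println(row));
    }
}

Uses, in addition: ByteArrayInputStream (java.io), ByteArrayOutputStream (java.io), VectorUnloader (org.apache.arrow.vector), WriteChannel (org.apache.arrow.vector.ipc)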

Example 2 with ArrowRecordBatch

Use of org.apache.arrow.vector.ipc.message.ArrowRecordBatch in project flink by apache.

From the class ArrowSourceFunction, the method run:

@Override
public void run(SourceContext<RowData> ctx) throws Exception {
    VectorLoader vectorLoader = new VectorLoader(root);
    while (running && !indexesToEmit.isEmpty()) {
        // Each tuple is (batch index, next row offset within that batch).
        Tuple2<Integer, Integer> indexToEmit = indexesToEmit.peek();
        ArrowRecordBatch arrowRecordBatch = loadBatch(indexToEmit.f0);
        vectorLoader.load(arrowRecordBatch);
        // Once loaded into the root, the batch's own buffers can be released.
        arrowRecordBatch.close();
        ArrowReader arrowReader = createArrowReader(root);
        int rowCount = root.getRowCount();
        int nextRowId = indexToEmit.f1;
        while (nextRowId < rowCount) {
            RowData element = arrowReader.read(nextRowId);
            // Emit and advance the row offset under the checkpoint lock,
            // so a checkpoint never captures a half-emitted row.
            synchronized (ctx.getCheckpointLock()) {
                ctx.collect(element);
                indexToEmit.setField(++nextRowId, 1);
            }
        }
        synchronized (ctx.getCheckpointLock()) {
            indexesToEmit.pop();
        }
    }
}
Also used: VectorLoader (org.apache.arrow.vector.VectorLoader), RowData (org.apache.flink.table.data.RowData), ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch), ArrowReader (org.apache.flink.table.runtime.arrow.ArrowReader)
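
loadBatch is not shown above; it returns one deserialized ArrowRecordBatch per batch index. A minimal sketch of what such a helper could look like, assuming the batches are held as serialized byte arrays in a field (serializedBatches and allocator are illustrative names, not Flink's actual fields); it reuses ReadChannel and MessageSerializer exactly as in Example 1:

// Illustrative only: deserialize the i-th pre-serialized Arrow record batch.
private ArrowRecordBatch loadBatch(int batchIndex) throws IOException {
    byte[] bytes = serializedBatches.get(batchIndex);
    try (ReadChannel channel = new ReadChannel(Channels.newChannel(new ByteArrayInputStream(bytes)))) {
        // The caller owns the returned batch and must close() it after loading,
        // as run() does above.
        return MessageSerializer.deserializeRecordBatch(channel, allocator);
    }
}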

Example 3 with ArrowRecordBatch

Use of org.apache.arrow.vector.ipc.message.ArrowRecordBatch in project carbondata by apache.

From the class ArrowCarbonReaderTest, the method testArrowReader:

@Test
public void testArrowReader() {
    String path = "./carbondata";
    try {
        FileUtils.deleteDirectory(new File(path));
        Field[] fields = new Field[13];
        fields[0] = new Field("stringField", DataTypes.STRING);
        fields[1] = new Field("shortField", DataTypes.SHORT);
        fields[2] = new Field("intField", DataTypes.INT);
        fields[3] = new Field("longField", DataTypes.LONG);
        fields[4] = new Field("doubleField", DataTypes.DOUBLE);
        fields[5] = new Field("boolField", DataTypes.BOOLEAN);
        fields[6] = new Field("dateField", DataTypes.DATE);
        fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
        fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
        fields[9] = new Field("varcharField", DataTypes.VARCHAR);
        fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
        fields[11] = new Field("floatField", DataTypes.FLOAT);
        fields[12] = new Field("binaryField", DataTypes.BINARY);
        Map<String, String> map = new HashMap<>();
        // '#' separates the elements of the array column in the CSV rows written below
        map.put("complex_delimiter_level_1", "#");
        CarbonWriter writer = CarbonWriter.builder().outputPath(path).withLoadOptions(map).withCsvInput(new Schema(fields)).writtenBy("CarbonReaderTest").build();
        byte[] value = "Binary".getBytes();
        for (int i = 0; i < 10; i++) {
            Object[] row2 = new Object[] { "robot" + (i % 10), i % 10000, i, (Long.MAX_VALUE - i), ((double) i / 2), (true), "2019-03-02", "2019-02-12 03:03:34", 12.345, "varchar", "Hello#World#From#Carbon", 1.23, value };
            writer.write(row2);
        }
        writer.close();
        // Read data
        ArrowCarbonReader reader = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        Schema carbonSchema = CarbonSchemaReader.readSchema(path);
        byte[] data = reader.readArrowBatch(carbonSchema);
        BufferAllocator bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        ArrowRecordBatch arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(data, bufferAllocator);
        VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        VectorLoader vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        // check for 10 rows
        assertEquals(vectorSchemaRoot.getRowCount(), 10);
        List<FieldVector> fieldVectors = vectorSchemaRoot.getFieldVectors();
        // validate short column (note: the reader's Arrow field order follows the
        // carbon schema read back from disk, so indexes differ from the write order above)
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors.get(6)).get(i), i);
        }
        // validate float column (floating-point asserts need an explicit delta)
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors.get(12)).get(i), (float) 1.23, 0.0001f);
        }
        // validate date column (dates are materialized as UTF-8 strings)
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertArrayEquals(((VarCharVector) fieldVectors.get(1)).get(i), "2019-03-02".getBytes(StandardCharsets.UTF_8));
        }
        // validate timestamp column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertArrayEquals(((VarCharVector) fieldVectors.get(2)).get(i), "2019-02-12 03:03:34".getBytes(StandardCharsets.UTF_8));
        }
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
        reader.close();
        // Read data with address (unsafe memory)
        ArrowCarbonReader reader1 = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        long address = reader1.readArrowBatchAddress(carbonSchema);
        int length = CarbonUnsafe.getUnsafe().getInt(address);
        byte[] data1 = new byte[length];
        CarbonUnsafe.getUnsafe().copyMemory(null, address + 4, data1, CarbonUnsafe.BYTE_ARRAY_OFFSET, length);
        bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(data1, bufferAllocator);
        vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        // check for 10 rows
        assertEquals(vectorSchemaRoot.getRowCount(), 10);
        List<FieldVector> fieldVectors1 = vectorSchemaRoot.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors1.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors1.get(12)).get(i), (float) 1.23, 0.0001f);
        }
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
        // free the unsafe memory
        reader1.freeArrowBatchMemory(address);
        reader1.close();
        // Read as arrow vector
        ArrowCarbonReader reader2 = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        VectorSchemaRoot vectorSchemaRoot2 = reader2.readArrowVectors(carbonSchema);
        // check for 10 rows
        assertEquals(vectorSchemaRoot2.getRowCount(), 10);
        List<FieldVector> fieldVectors2 = vectorSchemaRoot2.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot2.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors2.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot2.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors2.get(12)).get(i), (float) 1.23, 0.0001f);
        }
        vectorSchemaRoot2.close();
        reader2.close();
        // Read arrowSchema
        byte[] schema = CarbonSchemaReader.getArrowSchemaAsBytes(path);
        bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(schema, bufferAllocator);
        vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        assertEquals(vectorSchemaRoot.getSchema().getFields().size(), 13);
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
    } catch (Throwable e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    } finally {
        try {
            FileUtils.deleteDirectory(new File(path));
        } catch (IOException e) {
            e.printStackTrace();
            Assert.fail(e.getMessage());
        }
    }
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), VectorLoader (org.apache.arrow.vector.VectorLoader), HashMap (java.util.HashMap), FieldVector (org.apache.arrow.vector.FieldVector), IOException (java.io.IOException), BufferAllocator (org.apache.arrow.memory.BufferAllocator), Field (org.apache.carbondata.core.metadata.datatype.Field), ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch), File (java.io.File), Test (org.junit.Test)
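
Across all three projects the pattern is the same: an ArrowRecordBatch is a transferable snapshot of a VectorSchemaRoot's buffers, produced by a VectorUnloader and consumed by a VectorLoader. A minimal self-contained sketch of that round trip in plain Arrow Java (no Beam, Flink, or CarbonData dependencies) might look like this:

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.VectorUnloader;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;

public class RecordBatchCopy {
    public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator();
             IntVector source = new IntVector("ints", allocator)) {
            // Populate a one-column source with three values.
            source.allocateNew(3);
            for (int i = 0; i < 3; i++) {
                source.setSafe(i, i * 10);
            }
            source.setValueCount(3);
            try (VectorSchemaRoot sourceRoot = VectorSchemaRoot.of(source);
                 VectorSchemaRoot targetRoot = VectorSchemaRoot.create(sourceRoot.getSchema(), allocator);
                 // Unload snapshots the source buffers into a record batch ...
                 ArrowRecordBatch batch = new VectorUnloader(sourceRoot).getRecordBatch()) {
                // ... and load transfers them into the target root.
                new VectorLoader(targetRoot).load(batch);
                System.out.println("copied rows: " + targetRoot.getRowCount());
            }
        }
    }
}

Load retains the underlying buffers rather than copying values row by row, which is why each example above can close the batch as soon as load returns.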

Aggregations

VectorLoader (org.apache.arrow.vector.VectorLoader): 3
ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch): 3
VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot): 2
File (java.io.File): 1
IOException (java.io.IOException): 1
HashMap (java.util.HashMap): 1
BufferAllocator (org.apache.arrow.memory.BufferAllocator): 1
FieldVector (org.apache.arrow.vector.FieldVector): 1
ReadChannel (org.apache.arrow.vector.ipc.ReadChannel): 1
Field (org.apache.carbondata.core.metadata.datatype.Field): 1
RowData (org.apache.flink.table.data.RowData): 1
ArrowReader (org.apache.flink.table.runtime.arrow.ArrowReader): 1
Test (org.junit.Test): 1