
Example 1 with BufferAllocator

Use of org.apache.arrow.memory.BufferAllocator in project flink by apache.

From the class ArrowUtils, method collectAsPandasDataFrame:

/**
 * Convert a Flink table to a Pandas DataFrame.
 */
public static CustomIterator<byte[]> collectAsPandasDataFrame(Table table, int maxArrowBatchSize) throws Exception {
    checkArrowUsable();
    BufferAllocator allocator = getRootAllocator().newChildAllocator("collectAsPandasDataFrame", 0, Long.MAX_VALUE);
    RowType rowType = (RowType) table.getResolvedSchema().toSourceRowDataType().getLogicalType();
    DataType defaultRowDataType = TypeConversions.fromLogicalToDataType(rowType);
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ArrowStreamWriter arrowStreamWriter = new ArrowStreamWriter(root, null, baos);
    arrowStreamWriter.start();
    Iterator<Row> results = table.execute().collect();
    Iterator<Row> appendOnlyResults;
    // A retract stream interleaves insert and retract messages; keep only the
    // rows that survive retraction so the resulting DataFrame is append-only.
    if (isAppendOnlyTable(table)) {
        appendOnlyResults = results;
    } else {
        appendOnlyResults = filterOutRetractRows(results);
    }
    ArrowWriter<RowData> arrowWriter = createRowDataArrowWriter(root, rowType);
    Iterator<RowData> convertedResults = new Iterator<RowData>() {

        @Override
        public boolean hasNext() {
            return appendOnlyResults.hasNext();
        }

        @Override
        public RowData next() {
            DataFormatConverters.DataFormatConverter converter = DataFormatConverters.getConverterForDataType(defaultRowDataType);
            return (RowData) converter.toInternal(appendOnlyResults.next());
        }
    };
    return new CustomIterator<byte[]>() {

        @Override
        public boolean hasNext() {
            return convertedResults.hasNext();
        }

        @Override
        public byte[] next() {
            try {
                // Fill one Arrow batch with at most maxArrowBatchSize rows.
                int i = 0;
                while (convertedResults.hasNext() && i < maxArrowBatchSize) {
                    i++;
                    arrowWriter.write(convertedResults.next());
                }
                arrowWriter.finish();
                arrowStreamWriter.writeBatch();
                return baos.toByteArray();
            } catch (Throwable t) {
                String msg = "Failed to serialize the data of the table";
                LOG.error(msg, t);
                throw new RuntimeException(msg, t);
            } finally {
                arrowWriter.reset();
                baos.reset();
                if (!hasNext()) {
                    root.close();
                    allocator.close();
                }
            }
        }
    };
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), RowType (org.apache.flink.table.types.logical.RowType), ByteArrayOutputStream (java.io.ByteArrayOutputStream), ArrowStreamWriter (org.apache.arrow.vector.ipc.ArrowStreamWriter), BufferAllocator (org.apache.arrow.memory.BufferAllocator), RowData (org.apache.flink.table.data.RowData), DataFormatConverters (org.apache.flink.table.data.util.DataFormatConverters), Iterator (java.util.Iterator), DataType (org.apache.flink.table.types.DataType), Row (org.apache.flink.types.Row)
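
Each example on this page leans on Arrow's hierarchical allocation: a named child allocator is carved out of a shared root, hands out buffers, and is closed once every buffer is released, returning its budget to the parent. Below is a minimal sketch of that lifecycle, assuming a recent Arrow Java (1.x, where ArrowBuf lives in org.apache.arrow.memory) with an allocator backend such as arrow-memory-netty on the classpath; the child name and limits are illustrative.

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;

public class AllocatorLifecycle {
    public static void main(String[] args) {
        // The root allocator owns the process-wide memory budget.
        try (BufferAllocator root = new RootAllocator(Long.MAX_VALUE);
             // Child allocator: zero initial reservation, unbounded limit,
             // named so leak reports and allocator dumps are readable.
             BufferAllocator child = root.newChildAllocator("example-child", 0, Long.MAX_VALUE);
             // Buffers must be released before their allocator closes;
             // otherwise close() throws IllegalStateException describing the leak.
             ArrowBuf buf = child.buffer(1024)) {
            buf.setInt(0, 42);
            System.out.println("allocated=" + child.getAllocatedMemory() + " value=" + buf.getInt(0));
        }
    }
}

Try-with-resources releases the buffer first, then the child, then the root, the same discipline the hasNext()/finally bookkeeping in the Flink iterator above follows by hand.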

Example 2 with BufferAllocator

Use of org.apache.arrow.memory.BufferAllocator in project hive by apache.

From the class TestJdbcWithMiniLlapVectorArrowBatch, method runQueryUsingLlapArrowBatchReader:

private MultiSet<List<Object>> runQueryUsingLlapArrowBatchReader(String query, Map<String, String> extraHiveConfs) throws Exception {
    String url = miniHS2.getJdbcURL();
    if (extraHiveConfs != null) {
        url = url + "?" + extraHiveConfs.entrySet().stream().map(e -> e.getKey() + "=" + e.getValue()).collect(Collectors.joining(";"));
    }
    String user = System.getProperty("user.name");
    String pwd = user;
    String handleId = UUID.randomUUID().toString();
    // Get splits
    JobConf job = new JobConf(conf);
    job.set(LlapBaseInputFormat.URL_KEY, url);
    job.set(LlapBaseInputFormat.USER_KEY, user);
    job.set(LlapBaseInputFormat.PWD_KEY, pwd);
    job.set(LlapBaseInputFormat.QUERY_KEY, query);
    job.set(LlapBaseInputFormat.HANDLE_ID, handleId);
    job.set(LlapBaseInputFormat.USE_NEW_SPLIT_FORMAT, "false");
    // One named child allocator per query; the LLAP reader allocates its Arrow batches from it.
    BufferAllocator allocator = RootAllocatorFactory.INSTANCE.getOrCreateRootAllocator(Long.MAX_VALUE).newChildAllocator(UUID.randomUUID().toString(), 0, Long.MAX_VALUE);
    LlapBaseInputFormat llapBaseInputFormat = new LlapBaseInputFormat(true, allocator);
    InputSplit[] splits = llapBaseInputFormat.getSplits(job, 1);
    assertTrue(splits.length > 0);
    MultiSet<List<Object>> queryResult = new HashMultiSet<>();
    for (InputSplit split : splits) {
        System.out.println("Processing split " + Arrays.toString(split.getLocations()));
        RecordReader<NullWritable, ArrowWrapperWritable> reader = llapBaseInputFormat.getRecordReader(split, job, null);
        ArrowWrapperWritable wrapperWritable = new ArrowWrapperWritable();
        while (reader.next(NullWritable.get(), wrapperWritable)) {
            queryResult.addAll(collectResultFromArrowVector(wrapperWritable));
        }
        reader.close();
    }
    LlapBaseInputFormat.close(handleId);
    return queryResult;
}
Also used: Arrays (java.util.Arrays), NullWritable (org.apache.hadoop.io.NullWritable), BeforeClass (org.junit.BeforeClass), ArrowWrapperWritable (org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable), ConfVars (org.apache.hadoop.hive.conf.HiveConf.ConfVars), LLAP_EXTERNAL_CLIENT_USE_HYBRID_CALENDAR (org.apache.hadoop.hive.conf.HiveConf.ConfVars.LLAP_EXTERNAL_CLIENT_USE_HYBRID_CALENDAR), RootAllocatorFactory (org.apache.hadoop.hive.ql.io.arrow.RootAllocatorFactory), ArrayList (java.util.ArrayList), Row (org.apache.hadoop.hive.llap.Row), SQLException (java.sql.SQLException), Lists (com.google.common.collect.Lists), Map (java.util.Map), InputFormat (org.apache.hadoop.mapred.InputFormat), BufferAllocator (org.apache.arrow.memory.BufferAllocator), FieldVector (org.apache.arrow.vector.FieldVector), CalendarUtils (org.apache.hadoop.hive.common.type.CalendarUtils), ImmutableMap (com.google.common.collect.ImmutableMap), HashMultiSet (org.apache.commons.collections4.multiset.HashMultiSet), HiveConf (org.apache.hadoop.hive.conf.HiveConf), Assert.assertTrue (org.junit.Assert.assertTrue), Test (org.junit.Test), UUID (java.util.UUID), LlapArrowRowInputFormat (org.apache.hadoop.hive.llap.LlapArrowRowInputFormat), HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT (org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT), Collectors (java.util.stream.Collectors), JobConf (org.apache.hadoop.mapred.JobConf), List (java.util.List), Ignore (org.junit.Ignore), InputSplit (org.apache.hadoop.mapred.InputSplit), Statement (java.sql.Statement), RecordReader (org.apache.hadoop.mapred.RecordReader), HIVE_AVRO_PROLEPTIC_GREGORIAN_DEFAULT (org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_AVRO_PROLEPTIC_GREGORIAN_DEFAULT), MultiSet (org.apache.commons.collections4.MultiSet), Assert.assertEquals (org.junit.Assert.assertEquals), LlapBaseInputFormat (org.apache.hadoop.hive.llap.LlapBaseInputFormat)
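
The helper collectResultFromArrowVector is not shown in the snippet above. A hedged sketch of what such a row collector might look like, walking the VectorSchemaRoot carried by the ArrowWrapperWritable; the class name, method name, and row-major return shape are assumptions for illustration, not Hive's implementation:

import java.util.ArrayList;
import java.util.List;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable;

final class ArrowRowCollector {
    // Hypothetical helper: flatten one Arrow batch into row-major lists,
    // letting each vector box its own value type via getObject(i).
    static List<List<Object>> collectRows(ArrowWrapperWritable writable) {
        VectorSchemaRoot root = writable.getVectorSchemaRoot();
        List<List<Object>> rows = new ArrayList<>(root.getRowCount());
        for (int i = 0; i < root.getRowCount(); i++) {
            List<Object> row = new ArrayList<>();
            for (FieldVector vector : root.getFieldVectors()) {
                row.add(vector.isNull(i) ? null : vector.getObject(i));
            }
            rows.add(row);
        }
        return rows;
    }
}

getObject keeps the sketch type-agnostic; a production reader would typically switch on the concrete vector classes instead of boxing every value.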

Example 3 with BufferAllocator

Use of org.apache.arrow.memory.BufferAllocator in project carbondata by apache.

From the class ArrowCarbonReaderTest, method testArrowReader:

@Test
public void testArrowReader() {
    String path = "./carbondata";
    try {
        FileUtils.deleteDirectory(new File(path));
        Field[] fields = new Field[13];
        fields[0] = new Field("stringField", DataTypes.STRING);
        fields[1] = new Field("shortField", DataTypes.SHORT);
        fields[2] = new Field("intField", DataTypes.INT);
        fields[3] = new Field("longField", DataTypes.LONG);
        fields[4] = new Field("doubleField", DataTypes.DOUBLE);
        fields[5] = new Field("boolField", DataTypes.BOOLEAN);
        fields[6] = new Field("dateField", DataTypes.DATE);
        fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
        fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
        fields[9] = new Field("varcharField", DataTypes.VARCHAR);
        fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
        fields[11] = new Field("floatField", DataTypes.FLOAT);
        fields[12] = new Field("binaryField", DataTypes.BINARY);
        Map<String, String> map = new HashMap<>();
        map.put("complex_delimiter_level_1", "#");
        CarbonWriter writer = CarbonWriter.builder().outputPath(path).withLoadOptions(map).withCsvInput(new Schema(fields)).writtenBy("CarbonReaderTest").build();
        byte[] value = "Binary".getBytes();
        for (int i = 0; i < 10; i++) {
            Object[] row2 = new Object[] { "robot" + (i % 10), i % 10000, i, (Long.MAX_VALUE - i), ((double) i / 2), (true), "2019-03-02", "2019-02-12 03:03:34", 12.345, "varchar", "Hello#World#From#Carbon", 1.23, value };
            writer.write(row2);
        }
        writer.close();
        // Read data
        ArrowCarbonReader reader = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        Schema carbonSchema = CarbonSchemaReader.readSchema(path);
        byte[] data = reader.readArrowBatch(carbonSchema);
        BufferAllocator bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        ArrowRecordBatch arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(data, bufferAllocator);
        VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        VectorLoader vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        // check for 10 rows
        assertEquals(vectorSchemaRoot.getRowCount(), 10);
        List<FieldVector> fieldVectors = vectorSchemaRoot.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors.get(12)).get(i), (float) 1.23);
        }
        // validate date column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertArrayEquals(((VarCharVector) fieldVectors.get(1)).get(i), "2019-03-02".getBytes((StandardCharsets.UTF_8)));
        }
        // validate timestamp column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertArrayEquals(((VarCharVector) fieldVectors.get(2)).get(i), "2019-02-12 03:03:34".getBytes((StandardCharsets.UTF_8)));
        }
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
        reader.close();
        // Read data with address (unsafe memory)
        ArrowCarbonReader reader1 = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        long address = reader1.readArrowBatchAddress(carbonSchema);
        // The 4 bytes at the address hold the serialized batch length; the payload follows.
        int length = CarbonUnsafe.getUnsafe().getInt(address);
        byte[] data1 = new byte[length];
        CarbonUnsafe.getUnsafe().copyMemory(null, address + 4, data1, CarbonUnsafe.BYTE_ARRAY_OFFSET, length);
        bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(data1, bufferAllocator);
        vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        // check for 10 rows
        assertEquals(vectorSchemaRoot.getRowCount(), 10);
        List<FieldVector> fieldVectors1 = vectorSchemaRoot.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors1.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors1.get(12)).get(i), (float) 1.23);
        }
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
        // free the unsafe memory
        reader1.freeArrowBatchMemory(address);
        reader1.close();
        // Read as arrow vector
        ArrowCarbonReader reader2 = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        VectorSchemaRoot vectorSchemaRoot2 = reader2.readArrowVectors(carbonSchema);
        // check for 10 rows
        assertEquals(vectorSchemaRoot2.getRowCount(), 10);
        List<FieldVector> fieldVectors2 = vectorSchemaRoot2.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot2.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors2.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot2.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors2.get(12)).get(i), (float) 1.23);
        }
        vectorSchemaRoot2.close();
        reader2.close();
        // Read arrowSchema
        byte[] schema = CarbonSchemaReader.getArrowSchemaAsBytes(path);
        bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(schema, bufferAllocator);
        vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        assertEquals(vectorSchemaRoot.getSchema().getFields().size(), 13);
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
    } catch (Throwable e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    } finally {
        try {
            FileUtils.deleteDirectory(new File(path));
        } catch (IOException e) {
            e.printStackTrace();
            Assert.fail(e.getMessage());
        }
    }
}
Also used: VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot), VectorLoader (org.apache.arrow.vector.VectorLoader), HashMap (java.util.HashMap), FieldVector (org.apache.arrow.vector.FieldVector), IOException (java.io.IOException), BufferAllocator (org.apache.arrow.memory.BufferAllocator), Field (org.apache.carbondata.core.metadata.datatype.Field), ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch), File (java.io.File), Test (org.junit.Test)
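
ArrowConverter.byteArrayToArrowBatch plus VectorLoader is CarbonData's way of rehydrating a serialized batch into a VectorSchemaRoot. With Arrow's own streaming IPC, the equivalent round trip (batch to bytes, bytes back into vectors) looks roughly like the sketch below; the single intField column is an assumption for illustration, not the Carbon wire format:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;

public class ArrowIpcRoundTrip {
    public static void main(String[] args) throws Exception {
        try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
             IntVector vector = new IntVector("intField", allocator)) {
            vector.allocateNew(3);
            for (int i = 0; i < 3; i++) {
                vector.set(i, i);
            }
            vector.setValueCount(3);
            VectorSchemaRoot root = VectorSchemaRoot.of(vector);
            // Serialize one batch in the Arrow streaming format.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) {
                writer.start();
                writer.writeBatch();
                writer.end();
            }
            // Deserialize: the reader allocates its own VectorSchemaRoot
            // from the supplied allocator and closes it with the reader.
            try (ArrowStreamReader reader = new ArrowStreamReader(
                    new ByteArrayInputStream(out.toByteArray()), allocator)) {
                while (reader.loadNextBatch()) {
                    System.out.println("rows: " + reader.getVectorSchemaRoot().getRowCount());
                }
            }
        }
    }
}

Because the reader owns its root, the manual close() calls in the test above (record batch, root, then child allocator) are only needed when driving VectorLoader by hand.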

Aggregations

BufferAllocator (org.apache.arrow.memory.BufferAllocator): 3
FieldVector (org.apache.arrow.vector.FieldVector): 2
VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot): 2
Test (org.junit.Test): 2
ImmutableMap (com.google.common.collect.ImmutableMap): 1
Lists (com.google.common.collect.Lists): 1
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1
File (java.io.File): 1
IOException (java.io.IOException): 1
SQLException (java.sql.SQLException): 1
Statement (java.sql.Statement): 1
ArrayList (java.util.ArrayList): 1
Arrays (java.util.Arrays): 1
HashMap (java.util.HashMap): 1
Iterator (java.util.Iterator): 1
List (java.util.List): 1
Map (java.util.Map): 1
UUID (java.util.UUID): 1
Collectors (java.util.stream.Collectors): 1
VectorLoader (org.apache.arrow.vector.VectorLoader): 1