Use of org.apache.arrow.memory.BufferAllocator in the Apache Flink project.
The class ArrowUtils, method collectAsPandasDataFrame.
/**
 * Convert Flink table to Pandas DataFrame.
 */
public static CustomIterator<byte[]> collectAsPandasDataFrame(Table table, int maxArrowBatchSize) throws Exception {
    checkArrowUsable();
    BufferAllocator allocator = getRootAllocator().newChildAllocator("collectAsPandasDataFrame", 0, Long.MAX_VALUE);
    RowType rowType = (RowType) table.getResolvedSchema().toSourceRowDataType().getLogicalType();
    DataType defaultRowDataType = TypeConversions.fromLogicalToDataType(rowType);
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ArrowStreamWriter arrowStreamWriter = new ArrowStreamWriter(root, null, baos);
    // start() writes the Arrow stream schema header into baos.
    arrowStreamWriter.start();
    Iterator<Row> results = table.execute().collect();
    Iterator<Row> appendOnlyResults;
    if (isAppendOnlyTable(table)) {
        appendOnlyResults = results;
    } else {
        // Changelog results are reduced to append-only rows before serialization.
        appendOnlyResults = filterOutRetractRows(results);
    }
    ArrowWriter arrowWriter = createRowDataArrowWriter(root, rowType);
    Iterator convertedResults = new Iterator<RowData>() {

        @Override
        public boolean hasNext() {
            return appendOnlyResults.hasNext();
        }

        @Override
        public RowData next() {
            // Convert the external Row into the internal RowData representation.
            DataFormatConverters.DataFormatConverter converter = DataFormatConverters.getConverterForDataType(defaultRowDataType);
            return (RowData) converter.toInternal(appendOnlyResults.next());
        }
    };
    return new CustomIterator<byte[]>() {

        @Override
        public boolean hasNext() {
            return convertedResults.hasNext();
        }

        @Override
        public byte[] next() {
            try {
                // Serialize up to maxArrowBatchSize rows into one Arrow batch
                // and return its bytes.
                int i = 0;
                while (convertedResults.hasNext() && i < maxArrowBatchSize) {
                    i++;
                    arrowWriter.write(convertedResults.next());
                }
                arrowWriter.finish();
                arrowStreamWriter.writeBatch();
                return baos.toByteArray();
            } catch (Throwable t) {
                String msg = "Failed to serialize the data of the table";
                LOG.error(msg, t);
                throw new RuntimeException(msg, t);
            } finally {
                // Reset for the next batch; release the Arrow memory once the
                // input is exhausted.
                arrowWriter.reset();
                baos.reset();
                if (!hasNext()) {
                    root.close();
                    allocator.close();
                }
            }
        }
    };
}
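The method scopes every Arrow allocation of one collect call to a named child allocator and releases it, together with the VectorSchemaRoot, only after the iterator is exhausted. Below is a minimal, Flink-independent sketch of that allocator lifecycle; the class name and buffer size are illustrative, and it assumes Arrow Java 1.x, where ArrowBuf is AutoCloseable.

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;

public class ChildAllocatorSketch {
    public static void main(String[] args) {
        // The root allocator enforces the overall memory limit.
        try (BufferAllocator root = new RootAllocator(Long.MAX_VALUE);
             // A named child allocator scopes a single operation, like the
             // "collectAsPandasDataFrame" child above; closing it verifies that
             // everything allocated from it has been released.
             BufferAllocator child = root.newChildAllocator("collect-sketch", 0, Long.MAX_VALUE)) {
            ArrowBuf buf = child.buffer(1024); // 1 KiB taken from the child allocator
            System.out.println("allocated bytes: " + child.getAllocatedMemory());
            // Release the buffer before the child closes; an un-released buffer
            // makes child.close() fail with a memory-leak error.
            buf.close();
        }
    }
}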
Use of org.apache.arrow.memory.BufferAllocator in the Apache Hive project.
The class TestJdbcWithMiniLlapVectorArrowBatch, method runQueryUsingLlapArrowBatchReader.
private MultiSet<List<Object>> runQueryUsingLlapArrowBatchReader(String query, Map<String, String> extraHiveConfs) throws Exception {
    String url = miniHS2.getJdbcURL();
    if (extraHiveConfs != null) {
        url = url + "?" + extraHiveConfs.entrySet().stream()
            .map(e -> e.getKey() + "=" + e.getValue())
            .collect(Collectors.joining(";"));
    }
    String user = System.getProperty("user.name");
    String pwd = user;
    String handleId = UUID.randomUUID().toString();
    // Get splits
    JobConf job = new JobConf(conf);
    job.set(LlapBaseInputFormat.URL_KEY, url);
    job.set(LlapBaseInputFormat.USER_KEY, user);
    job.set(LlapBaseInputFormat.PWD_KEY, pwd);
    job.set(LlapBaseInputFormat.QUERY_KEY, query);
    job.set(LlapBaseInputFormat.HANDLE_ID, handleId);
    job.set(LlapBaseInputFormat.USE_NEW_SPLIT_FORMAT, "false");
    BufferAllocator allocator = RootAllocatorFactory.INSTANCE
        .getOrCreateRootAllocator(Long.MAX_VALUE)
        .newChildAllocator(UUID.randomUUID().toString(), 0, Long.MAX_VALUE);
    LlapBaseInputFormat llapBaseInputFormat = new LlapBaseInputFormat(true, allocator);
    InputSplit[] splits = llapBaseInputFormat.getSplits(job, 1);
    assertTrue(splits.length > 0);
    MultiSet<List<Object>> queryResult = new HashMultiSet<>();
    for (InputSplit split : splits) {
        System.out.println("Processing split " + Arrays.toString(split.getLocations()));
        RecordReader<NullWritable, ArrowWrapperWritable> reader = llapBaseInputFormat.getRecordReader(split, job, null);
        ArrowWrapperWritable wrapperWritable = new ArrowWrapperWritable();
        while (reader.next(NullWritable.get(), wrapperWritable)) {
            queryResult.addAll(collectResultFromArrowVector(wrapperWritable));
        }
        reader.close();
    }
    LlapBaseInputFormat.close(handleId);
    return queryResult;
}
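collectResultFromArrowVector is a test helper that is not shown here. The following is only a plausible sketch of what such a helper does, assuming ArrowWrapperWritable exposes its current batch through getVectorSchemaRoot(); the class and method names below are illustrative, not Hive's.

import java.util.ArrayList;
import java.util.List;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.hadoop.hive.ql.io.arrow.ArrowWrapperWritable;

final class ArrowBatchRows {
    // Turn the current Arrow batch into row-wise lists so the test can compare
    // them against the expected JDBC results.
    static List<List<Object>> collect(ArrowWrapperWritable writable) {
        VectorSchemaRoot root = writable.getVectorSchemaRoot();
        List<List<Object>> rows = new ArrayList<>();
        for (int row = 0; row < root.getRowCount(); row++) {
            List<Object> values = new ArrayList<>();
            for (FieldVector vector : root.getFieldVectors()) {
                // getObject() boxes the value and returns null for null slots,
                // regardless of the concrete vector type.
                values.add(vector.getObject(row));
            }
            rows.add(values);
        }
        return rows;
    }
}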
Use of org.apache.arrow.memory.BufferAllocator in the Apache CarbonData project.
The class ArrowCarbonReaderTest, method testArrowReader.
@Test
public void testArrowReader() {
    String path = "./carbondata";
    try {
        FileUtils.deleteDirectory(new File(path));
        Field[] fields = new Field[13];
        fields[0] = new Field("stringField", DataTypes.STRING);
        fields[1] = new Field("shortField", DataTypes.SHORT);
        fields[2] = new Field("intField", DataTypes.INT);
        fields[3] = new Field("longField", DataTypes.LONG);
        fields[4] = new Field("doubleField", DataTypes.DOUBLE);
        fields[5] = new Field("boolField", DataTypes.BOOLEAN);
        fields[6] = new Field("dateField", DataTypes.DATE);
        fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
        fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
        fields[9] = new Field("varcharField", DataTypes.VARCHAR);
        fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
        fields[11] = new Field("floatField", DataTypes.FLOAT);
        fields[12] = new Field("binaryField", DataTypes.BINARY);
        Map<String, String> map = new HashMap<>();
        map.put("complex_delimiter_level_1", "#");
        CarbonWriter writer = CarbonWriter.builder()
            .outputPath(path)
            .withLoadOptions(map)
            .withCsvInput(new Schema(fields))
            .writtenBy("CarbonReaderTest")
            .build();
        byte[] value = "Binary".getBytes();
        for (int i = 0; i < 10; i++) {
            Object[] row2 = new Object[] { "robot" + (i % 10), i % 10000, i, (Long.MAX_VALUE - i), ((double) i / 2), (true), "2019-03-02", "2019-02-12 03:03:34", 12.345, "varchar", "Hello#World#From#Carbon", 1.23, value };
            writer.write(row2);
        }
        writer.close();
        // Read data
        ArrowCarbonReader reader = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        Schema carbonSchema = CarbonSchemaReader.readSchema(path);
        byte[] data = reader.readArrowBatch(carbonSchema);
        BufferAllocator bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        ArrowRecordBatch arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(data, bufferAllocator);
        VectorSchemaRoot vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        VectorLoader vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        // check for 10 rows
        assertEquals(vectorSchemaRoot.getRowCount(), 10);
        List<FieldVector> fieldVectors = vectorSchemaRoot.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors.get(12)).get(i), (float) 1.23);
        }
        // validate date column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertArrayEquals(((VarCharVector) fieldVectors.get(1)).get(i), "2019-03-02".getBytes((StandardCharsets.UTF_8)));
        }
        // validate timestamp column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertArrayEquals(((VarCharVector) fieldVectors.get(2)).get(i), "2019-02-12 03:03:34".getBytes((StandardCharsets.UTF_8)));
        }
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
        reader.close();
        // Read data with address (unsafe memory)
        ArrowCarbonReader reader1 = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        long address = reader1.readArrowBatchAddress(carbonSchema);
        int length = CarbonUnsafe.getUnsafe().getInt(address);
        byte[] data1 = new byte[length];
        CarbonUnsafe.getUnsafe().copyMemory(null, address + 4, data1, CarbonUnsafe.BYTE_ARRAY_OFFSET, length);
        bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(data1, bufferAllocator);
        vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        // check for 10 rows
        assertEquals(vectorSchemaRoot.getRowCount(), 10);
        List<FieldVector> fieldVectors1 = vectorSchemaRoot.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors1.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors1.get(12)).get(i), (float) 1.23);
        }
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
        // free the unsafe memory
        reader1.freeArrowBatchMemory(address);
        reader1.close();
        // Read as arrow vector
        ArrowCarbonReader reader2 = CarbonReader.builder(path, "_temp").withRowRecordReader().buildArrowReader();
        VectorSchemaRoot vectorSchemaRoot2 = reader2.readArrowVectors(carbonSchema);
        // check for 10 rows
        assertEquals(vectorSchemaRoot2.getRowCount(), 10);
        List<FieldVector> fieldVectors2 = vectorSchemaRoot2.getFieldVectors();
        // validate short column
        for (int i = 0; i < vectorSchemaRoot2.getRowCount(); i++) {
            assertEquals(((SmallIntVector) fieldVectors2.get(6)).get(i), i);
        }
        // validate float column
        for (int i = 0; i < vectorSchemaRoot2.getRowCount(); i++) {
            assertEquals(((Float4Vector) fieldVectors2.get(12)).get(i), (float) 1.23);
        }
        // Close the root returned by readArrowVectors (vectorSchemaRoot was already closed above).
        vectorSchemaRoot2.close();
        reader2.close();
        // Read arrowSchema
        byte[] schema = CarbonSchemaReader.getArrowSchemaAsBytes(path);
        bufferAllocator = ArrowUtils.rootAllocator.newChildAllocator("toArrowBuffer", 0, Long.MAX_VALUE);
        arrowRecordBatch = ArrowConverter.byteArrayToArrowBatch(schema, bufferAllocator);
        vectorSchemaRoot = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(carbonSchema, TimeZone.getDefault().getID()), bufferAllocator);
        vectorLoader = new VectorLoader(vectorSchemaRoot);
        vectorLoader.load(arrowRecordBatch);
        assertEquals(vectorSchemaRoot.getSchema().getFields().size(), 13);
        arrowRecordBatch.close();
        vectorSchemaRoot.close();
        bufferAllocator.close();
    } catch (Throwable e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    } finally {
        try {
            FileUtils.deleteDirectory(new File(path));
        } catch (IOException e) {
            e.printStackTrace();
            Assert.fail(e.getMessage());
        }
    }
}
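The test repeatedly loads a deserialized ArrowRecordBatch into a freshly created VectorSchemaRoot through a VectorLoader. Below is a minimal sketch of that loader round trip in plain Arrow, independent of CarbonData; the schema, the populated source root, and the allocator are assumptions supplied by the caller.

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.VectorLoader;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.VectorUnloader;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
import org.apache.arrow.vector.types.pojo.Schema;

final class VectorLoaderRoundTrip {
    // `schema` must match the schema of the populated `source` root.
    static void copyBatch(Schema schema, VectorSchemaRoot source, BufferAllocator allocator) {
        try (VectorSchemaRoot target = VectorSchemaRoot.create(schema, allocator);
             // Unload the source root into a record batch, the same structure the
             // Carbon reader hands back in serialized form...
             ArrowRecordBatch batch = new VectorUnloader(source).getRecordBatch()) {
            // ...and load it into the target root; the target takes its own
            // references to the data, so the batch can be closed afterwards.
            new VectorLoader(target).load(batch);
            System.out.println("rows copied: " + target.getRowCount());
        }
    }
}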