use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class SortingFileWriter method mergeFiles.
private void mergeFiles(Iterable<TempFile> files, Consumer<Page> consumer) {
try (Closer closer = Closer.create()) {
Collection<Iterator<Page>> iterators = new ArrayList<>();
for (TempFile tempFile : files) {
Path file = tempFile.getPath();
OrcDataSource dataSource = new HdfsOrcDataSource(new OrcDataSourceId(file.toString()), fileSystem.getFileStatus(file).getLen(), new OrcReaderOptions(), fileSystem.open(file), new FileFormatDataSourceStats());
closer.register(dataSource);
iterators.add(new TempFileReader(types, dataSource));
}
new MergingPageIterator(iterators, types, sortFields, sortOrders, typeOperators).forEachRemaining(consumer);
for (TempFile tempFile : files) {
Path file = tempFile.getPath();
if (!fileSystem.delete(file, false)) {
throw new IOException("Failed to delete temporary file: " + file);
}
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class TestOrcFileRewriter method testRewrite.
@Test
public void testRewrite() throws Exception {
ArrayType arrayType = new ArrayType(BIGINT);
ArrayType arrayOfArrayType = new ArrayType(arrayType);
Type mapType = TESTING_TYPE_MANAGER.getParameterizedType(StandardTypes.MAP, ImmutableList.of(TypeSignatureParameter.typeParameter(createVarcharType(5).getTypeSignature()), TypeSignatureParameter.typeParameter(BOOLEAN.getTypeSignature())));
List<Long> columnIds = ImmutableList.of(3L, 7L, 9L, 10L, 11L, 12L);
DecimalType decimalType = DecimalType.createDecimalType(4, 4);
List<Type> columnTypes = ImmutableList.of(BIGINT, createVarcharType(20), arrayType, mapType, arrayOfArrayType, decimalType);
File file = temporary.resolve(randomUUID().toString()).toFile();
try (OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file)) {
List<Page> pages = rowPagesBuilder(columnTypes).row(123L, "hello", arrayBlockOf(BIGINT, 1, 2), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5)), new BigDecimal("2.3")).row(777L, "sky", arrayBlockOf(BIGINT, 3, 4), mapBlockOf(createVarcharType(5), BOOLEAN, "k2", false), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 6)), new BigDecimal("2.3")).row(456L, "bye", arrayBlockOf(BIGINT, 5, 6), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", true), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 7)), new BigDecimal("2.3")).row(888L, "world", arrayBlockOf(BIGINT, 7, 8), mapBlockOf(createVarcharType(5), BOOLEAN, "k4", true), arrayBlockOf(arrayType, null, arrayBlockOf(BIGINT, 8), null), new BigDecimal("2.3")).row(999L, "done", arrayBlockOf(BIGINT, 9, 10), mapBlockOf(createVarcharType(5), BOOLEAN, "k5", true), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 9, 10)), new BigDecimal("2.3")).build();
writer.appendPages(pages);
}
try (OrcDataSource dataSource = fileOrcDataSource(file)) {
OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes);
assertEquals(reader.getReaderRowCount(), 5);
assertEquals(reader.getFileRowCount(), 5);
assertEquals(reader.getSplitLength(), file.length());
Page page = reader.nextPage();
assertEquals(page.getPositionCount(), 5);
Block column0 = page.getBlock(0);
assertEquals(column0.getPositionCount(), 5);
for (int i = 0; i < 5; i++) {
assertEquals(column0.isNull(i), false);
}
assertEquals(BIGINT.getLong(column0, 0), 123L);
assertEquals(BIGINT.getLong(column0, 1), 777L);
assertEquals(BIGINT.getLong(column0, 2), 456L);
assertEquals(BIGINT.getLong(column0, 3), 888L);
assertEquals(BIGINT.getLong(column0, 4), 999L);
Block column1 = page.getBlock(1);
assertEquals(column1.getPositionCount(), 5);
for (int i = 0; i < 5; i++) {
assertEquals(column1.isNull(i), false);
}
assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello"));
assertEquals(createVarcharType(20).getSlice(column1, 1), utf8Slice("sky"));
assertEquals(createVarcharType(20).getSlice(column1, 2), utf8Slice("bye"));
assertEquals(createVarcharType(20).getSlice(column1, 3), utf8Slice("world"));
assertEquals(createVarcharType(20).getSlice(column1, 4), utf8Slice("done"));
Block column2 = page.getBlock(2);
assertEquals(column2.getPositionCount(), 5);
for (int i = 0; i < 5; i++) {
assertEquals(column2.isNull(i), false);
}
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 0), arrayBlockOf(BIGINT, 1, 2)));
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 1), arrayBlockOf(BIGINT, 3, 4)));
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 2), arrayBlockOf(BIGINT, 5, 6)));
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 3), arrayBlockOf(BIGINT, 7, 8)));
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 4), arrayBlockOf(BIGINT, 9, 10)));
Block column3 = page.getBlock(3);
assertEquals(column3.getPositionCount(), 5);
for (int i = 0; i < 5; i++) {
assertEquals(column3.isNull(i), false);
}
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 0), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true)));
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 1), mapBlockOf(createVarcharType(5), BOOLEAN, "k2", false)));
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 2), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", true)));
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 3), mapBlockOf(createVarcharType(5), BOOLEAN, "k4", true)));
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 4), mapBlockOf(createVarcharType(5), BOOLEAN, "k5", true)));
Block column4 = page.getBlock(4);
assertEquals(column4.getPositionCount(), 5);
for (int i = 0; i < 5; i++) {
assertEquals(column4.isNull(i), false);
}
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 0), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5))));
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 1), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 6))));
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 2), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 7))));
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 3), arrayBlockOf(arrayType, null, arrayBlockOf(BIGINT, 8), null)));
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 4), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 9, 10))));
assertNull(reader.nextPage());
OrcFileMetadata orcFileMetadata = METADATA_CODEC.fromJson(reader.getUserMetadata().get(OrcFileMetadata.KEY).getBytes());
assertEquals(orcFileMetadata, new OrcFileMetadata(ImmutableMap.<Long, TypeId>builder().put(3L, BIGINT.getTypeId()).put(7L, createVarcharType(20).getTypeId()).put(9L, arrayType.getTypeId()).put(10L, mapType.getTypeId()).put(11L, arrayOfArrayType.getTypeId()).put(12L, decimalType.getTypeId()).buildOrThrow()));
}
BitSet rowsToDelete = new BitSet(5);
rowsToDelete.set(1);
rowsToDelete.set(3);
rowsToDelete.set(4);
File newFile = temporary.resolve(randomUUID().toString()).toFile();
OrcFileInfo info = OrcFileRewriter.rewrite(file, newFile, rowsToDelete);
assertEquals(info.getRowCount(), 2);
assertEquals(info.getUncompressedSize(), 94);
try (OrcDataSource dataSource = fileOrcDataSource(newFile)) {
OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes);
assertEquals(reader.getReaderRowCount(), 2);
assertEquals(reader.getFileRowCount(), 2);
assertEquals(reader.getSplitLength(), newFile.length());
Page page = reader.nextPage();
assertEquals(page.getPositionCount(), 2);
Block column0 = page.getBlock(0);
assertEquals(column0.getPositionCount(), 2);
for (int i = 0; i < 2; i++) {
assertEquals(column0.isNull(i), false);
}
assertEquals(BIGINT.getLong(column0, 0), 123L);
assertEquals(BIGINT.getLong(column0, 1), 456L);
Block column1 = page.getBlock(1);
assertEquals(column1.getPositionCount(), 2);
for (int i = 0; i < 2; i++) {
assertEquals(column1.isNull(i), false);
}
assertEquals(createVarcharType(20).getSlice(column1, 0), utf8Slice("hello"));
assertEquals(createVarcharType(20).getSlice(column1, 1), utf8Slice("bye"));
Block column2 = page.getBlock(2);
assertEquals(column2.getPositionCount(), 2);
for (int i = 0; i < 2; i++) {
assertEquals(column2.isNull(i), false);
}
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 0), arrayBlockOf(BIGINT, 1, 2)));
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column2, 1), arrayBlockOf(BIGINT, 5, 6)));
Block column3 = page.getBlock(3);
assertEquals(column3.getPositionCount(), 2);
for (int i = 0; i < 2; i++) {
assertEquals(column3.isNull(i), false);
}
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 0), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true)));
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column3, 1), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", true)));
Block column4 = page.getBlock(4);
assertEquals(column4.getPositionCount(), 2);
for (int i = 0; i < 2; i++) {
assertEquals(column4.isNull(i), false);
}
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 0), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5))));
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column4, 1), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 7))));
assertEquals(reader.nextPage(), null);
OrcFileMetadata orcFileMetadata = METADATA_CODEC.fromJson(reader.getUserMetadata().get(OrcFileMetadata.KEY).getBytes());
assertEquals(orcFileMetadata, new OrcFileMetadata(ImmutableMap.<Long, TypeId>builder().put(3L, BIGINT.getTypeId()).put(7L, createVarcharType(20).getTypeId()).put(9L, arrayType.getTypeId()).put(10L, mapType.getTypeId()).put(11L, arrayOfArrayType.getTypeId()).put(12L, decimalType.getTypeId()).buildOrThrow()));
}
}
use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class TestShardWriter method testWriter.
@Test
public void testWriter() throws Exception {
List<Long> columnIds = ImmutableList.of(1L, 2L, 4L, 6L, 7L, 8L, 9L, 10L);
ArrayType arrayType = new ArrayType(BIGINT);
ArrayType arrayOfArrayType = new ArrayType(arrayType);
Type mapType = TESTING_TYPE_MANAGER.getParameterizedType(StandardTypes.MAP, ImmutableList.of(TypeSignatureParameter.typeParameter(createVarcharType(10).getTypeSignature()), TypeSignatureParameter.typeParameter(BOOLEAN.getTypeSignature())));
List<Type> columnTypes = ImmutableList.of(BIGINT, createVarcharType(10), VARBINARY, DOUBLE, BOOLEAN, arrayType, mapType, arrayOfArrayType);
File file = directory.resolve(System.nanoTime() + ".orc").toFile();
byte[] bytes1 = octets(0x00, 0xFE, 0xFF);
byte[] bytes3 = octets(0x01, 0x02, 0x19, 0x80);
RowPagesBuilder rowPagesBuilder = RowPagesBuilder.rowPagesBuilder(columnTypes).row(123L, "hello", wrappedBuffer(bytes1), 123.456, true, arrayBlockOf(BIGINT, 1, 2), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5))).row(null, "world", null, Double.POSITIVE_INFINITY, null, arrayBlockOf(BIGINT, 3, null), mapBlockOf(createVarcharType(5), BOOLEAN, "k2", null), arrayBlockOf(arrayType, null, arrayBlockOf(BIGINT, 6, 7))).row(456L, "bye \u2603", wrappedBuffer(bytes3), Double.NaN, false, arrayBlockOf(BIGINT), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", false), arrayBlockOf(arrayType, arrayBlockOf(BIGINT)));
try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(new EmptyClassLoader());
OrcFileWriter writer = new OrcFileWriter(columnIds, columnTypes, file)) {
writer.appendPages(rowPagesBuilder.build());
}
try (OrcDataSource dataSource = fileOrcDataSource(file)) {
OrcRecordReader reader = createReader(dataSource, columnIds, columnTypes);
assertEquals(reader.getReaderRowCount(), 3);
assertEquals(reader.getReaderPosition(), 0);
assertEquals(reader.getFileRowCount(), reader.getReaderRowCount());
assertEquals(reader.getFilePosition(), reader.getFilePosition());
Page page = reader.nextPage();
assertEquals(page.getPositionCount(), 3);
assertEquals(reader.getReaderPosition(), 0);
assertEquals(reader.getFilePosition(), reader.getFilePosition());
Block column0 = page.getBlock(0);
assertEquals(column0.isNull(0), false);
assertEquals(column0.isNull(1), true);
assertEquals(column0.isNull(2), false);
assertEquals(BIGINT.getLong(column0, 0), 123L);
assertEquals(BIGINT.getLong(column0, 2), 456L);
Block column1 = page.getBlock(1);
assertEquals(createVarcharType(10).getSlice(column1, 0), utf8Slice("hello"));
assertEquals(createVarcharType(10).getSlice(column1, 1), utf8Slice("world"));
assertEquals(createVarcharType(10).getSlice(column1, 2), utf8Slice("bye \u2603"));
Block column2 = page.getBlock(2);
assertEquals(VARBINARY.getSlice(column2, 0), wrappedBuffer(bytes1));
assertEquals(column2.isNull(1), true);
assertEquals(VARBINARY.getSlice(column2, 2), wrappedBuffer(bytes3));
Block column3 = page.getBlock(3);
assertEquals(column3.isNull(0), false);
assertEquals(column3.isNull(1), false);
assertEquals(column3.isNull(2), false);
assertEquals(DOUBLE.getDouble(column3, 0), 123.456);
assertEquals(DOUBLE.getDouble(column3, 1), Double.POSITIVE_INFINITY);
assertEquals(DOUBLE.getDouble(column3, 2), Double.NaN);
Block column4 = page.getBlock(4);
assertEquals(column4.isNull(0), false);
assertEquals(column4.isNull(1), true);
assertEquals(column4.isNull(2), false);
assertEquals(BOOLEAN.getBoolean(column4, 0), true);
assertEquals(BOOLEAN.getBoolean(column4, 2), false);
Block column5 = page.getBlock(5);
assertEquals(column5.getPositionCount(), 3);
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column5, 0), arrayBlockOf(BIGINT, 1, 2)));
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column5, 1), arrayBlockOf(BIGINT, 3, null)));
assertTrue(arrayBlocksEqual(BIGINT, arrayType.getObject(column5, 2), arrayBlockOf(BIGINT)));
Block column6 = page.getBlock(6);
assertEquals(column6.getPositionCount(), 3);
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column6, 0), mapBlockOf(createVarcharType(5), BOOLEAN, "k1", true)));
Block object = arrayType.getObject(column6, 1);
Block k2 = mapBlockOf(createVarcharType(5), BOOLEAN, "k2", null);
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, object, k2));
assertTrue(mapBlocksEqual(createVarcharType(5), BOOLEAN, arrayType.getObject(column6, 2), mapBlockOf(createVarcharType(5), BOOLEAN, "k3", false)));
Block column7 = page.getBlock(7);
assertEquals(column7.getPositionCount(), 3);
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column7, 0), arrayBlockOf(arrayType, arrayBlockOf(BIGINT, 5))));
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column7, 1), arrayBlockOf(arrayType, null, arrayBlockOf(BIGINT, 6, 7))));
assertTrue(arrayBlocksEqual(arrayType, arrayOfArrayType.getObject(column7, 2), arrayBlockOf(arrayType, arrayBlockOf(BIGINT))));
assertNull(reader.nextPage());
assertEquals(reader.getReaderPosition(), 3);
assertEquals(reader.getFilePosition(), reader.getFilePosition());
OrcFileMetadata orcFileMetadata = METADATA_CODEC.fromJson(reader.getUserMetadata().get(OrcFileMetadata.KEY).getBytes());
assertEquals(orcFileMetadata, new OrcFileMetadata(ImmutableMap.<Long, TypeId>builder().put(1L, BIGINT.getTypeId()).put(2L, createVarcharType(10).getTypeId()).put(4L, VARBINARY.getTypeId()).put(6L, DOUBLE.getTypeId()).put(7L, BOOLEAN.getTypeId()).put(8L, arrayType.getTypeId()).put(9L, mapType.getTypeId()).put(10L, arrayOfArrayType.getTypeId()).buildOrThrow()));
}
File crcFile = new File(file.getParentFile(), "." + file.getName() + ".crc");
assertFalse(crcFile.exists());
}
use of io.trino.orc.OrcDataSource in project trino by trinodb.
the class RaptorStorageManager method getPageSource.
@Override
public ConnectorPageSource getPageSource(UUID shardUuid, OptionalInt bucketNumber, List<Long> columnIds, List<Type> columnTypes, TupleDomain<RaptorColumnHandle> effectivePredicate, OrcReaderOptions orcReaderOptions, OptionalLong transactionId) {
orcReaderOptions = orcReaderOptions.withMaxReadBlockSize(HUGE_MAX_READ_BLOCK_SIZE);
OrcDataSource dataSource = openShard(shardUuid, orcReaderOptions);
AggregatedMemoryContext memoryUsage = newSimpleAggregatedMemoryContext();
try {
OrcReader reader = OrcReader.createOrcReader(dataSource, orcReaderOptions).orElseThrow(() -> new TrinoException(RAPTOR_ERROR, "Data file is empty for shard " + shardUuid));
Map<Long, OrcColumn> indexMap = columnIdIndex(reader.getRootColumn().getNestedColumns());
List<OrcColumn> fileReadColumn = new ArrayList<>(columnIds.size());
List<Type> fileReadTypes = new ArrayList<>(columnIds.size());
List<ColumnAdaptation> columnAdaptations = new ArrayList<>(columnIds.size());
for (int i = 0; i < columnIds.size(); i++) {
long columnId = columnIds.get(i);
if (isHiddenColumn(columnId)) {
columnAdaptations.add(specialColumnAdaptation(columnId, shardUuid, bucketNumber));
continue;
}
Type type = toOrcFileType(columnTypes.get(i), typeManager);
OrcColumn fileColumn = indexMap.get(columnId);
if (fileColumn == null) {
columnAdaptations.add(ColumnAdaptation.nullColumn(type));
} else {
int sourceIndex = fileReadColumn.size();
columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex));
fileReadColumn.add(fileColumn);
fileReadTypes.add(type);
}
}
OrcPredicate predicate = getPredicate(effectivePredicate, indexMap);
OrcRecordReader recordReader = reader.createRecordReader(fileReadColumn, fileReadTypes, predicate, UTC, memoryUsage, INITIAL_BATCH_SIZE, RaptorPageSource::handleException);
Optional<ShardRewriter> shardRewriter = Optional.empty();
if (transactionId.isPresent()) {
shardRewriter = Optional.of(createShardRewriter(transactionId.getAsLong(), bucketNumber, shardUuid));
}
return new RaptorPageSource(shardRewriter, recordReader, columnAdaptations, dataSource, memoryUsage);
} catch (IOException | RuntimeException e) {
closeQuietly(dataSource);
throw new TrinoException(RAPTOR_ERROR, "Failed to create page source for shard " + shardUuid, e);
} catch (Throwable t) {
closeQuietly(dataSource);
throw t;
}
}
Aggregations