Use of io.trino.orc.metadata.OrcMetadataReader in project trino by trinodb: class TestSliceDictionaryColumnReader, method testDictionaryReaderUpdatesRetainedSize. The test writes a single VARCHAR column to an ORC file, reads it back with SliceDictionaryColumnReader, and checks after every row group that the memory context reports exactly the reader's retained size, and that it drops to zero once the reader is closed.
@Test
public void testDictionaryReaderUpdatesRetainedSize()
        throws Exception
{
    // create orc file
    List<String> values = createValues();
    File temporaryDirectory = createTempDir();
    File orcFile = new File(temporaryDirectory, randomUUID().toString());
    writeOrcColumnTrino(orcFile, NONE, VARCHAR, values.iterator(), new OrcWriterStats());

    // prepare for read
    OrcDataSource dataSource = new MemoryOrcDataSource(
            new OrcDataSourceId(orcFile.getPath()),
            Slices.wrappedBuffer(readAllBytes(orcFile.toPath())));
    OrcReader orcReader = OrcReader.createOrcReader(dataSource, new OrcReaderOptions())
            .orElseThrow(() -> new RuntimeException("File is empty"));
    Footer footer = orcReader.getFooter();
    List<OrcColumn> columns = orcReader.getRootColumn().getNestedColumns();
    assertTrue(columns.size() == 1);

    StripeReader stripeReader = new StripeReader(
            dataSource,
            UTC,
            Optional.empty(),
            footer.getTypes(),
            ImmutableSet.copyOf(columns),
            footer.getRowsInRowGroup(),
            OrcPredicate.TRUE,
            ORIGINAL,
            new OrcMetadataReader(),
            Optional.empty());
    AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
    SliceDictionaryColumnReader columnReader = new SliceDictionaryColumnReader(
            columns.get(0),
            memoryContext.newLocalMemoryContext(TestSliceDictionaryColumnReader.class.getSimpleName()),
            -1,
            false);

    List<StripeInformation> stripeInformations = footer.getStripes();
    for (StripeInformation stripeInformation : stripeInformations) {
        Stripe stripe = stripeReader.readStripe(stripeInformation, newSimpleAggregatedMemoryContext());
        List<RowGroup> rowGroups = stripe.getRowGroups();
        columnReader.startStripe(stripe.getFileTimeZone(), stripe.getDictionaryStreamSources(), stripe.getColumnEncodings());
        for (RowGroup rowGroup : rowGroups) {
            columnReader.startRowGroup(rowGroup.getStreamSources());
            columnReader.prepareNextRead(1000);
            columnReader.readBlock();
            // memory usage check
            assertEquals(memoryContext.getBytes(), columnReader.getRetainedSizeInBytes());
        }
    }
    columnReader.close();
    assertTrue(memoryContext.getBytes() == 0);
}
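The test depends on a createValues() helper that is not shown in this excerpt. A minimal sketch of what such a helper could look like, assuming it only needs to produce enough repeating string values for the ORC writer to choose dictionary encoding for the VARCHAR column (the method name matches the call above, but the body here is hypothetical and uses only java.util.List and java.util.ArrayList):

// Hypothetical sketch of the createValues() helper assumed by the test above;
// the real implementation in TestSliceDictionaryColumnReader may differ.
private static List<String> createValues()
{
    List<String> values = new ArrayList<>();
    for (int i = 0; i < 10_000; i++) {
        // reuse a small set of distinct values so dictionary encoding pays off
        values.add("value-" + (i % 100));
    }
    return values;
}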
Use of io.trino.orc.metadata.OrcMetadataReader in project trino by trinodb: class TestOrcBloomFilters, method testOrcHiveBloomFilterSerde. The test serializes a Bloom filter with CompressedMetadataWriter, deserializes it both through OrcMetadataReader.readBloomFilterIndexes and by parsing the protobuf directly, and verifies that membership tests, hash-function counts, and the underlying bit sets all match the original filter.
@Test
public void testOrcHiveBloomFilterSerde()
        throws Exception
{
    BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05);
    bloomFilterWrite.add(TEST_STRING);
    assertTrue(bloomFilterWrite.test(TEST_STRING));
    assertTrue(bloomFilterWrite.testSlice(wrappedBuffer(TEST_STRING)));

    Slice bloomFilterBytes = new CompressedMetadataWriter(new OrcMetadataWriter(WriterIdentification.TRINO), CompressionKind.NONE, 1024)
            .writeBloomFilters(ImmutableList.of(bloomFilterWrite));

    // Read through method
    InputStream inputStream = bloomFilterBytes.getInput();
    OrcMetadataReader metadataReader = new OrcMetadataReader();
    List<BloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);
    assertEquals(bloomFilters.size(), 1);

    assertTrue(bloomFilters.get(0).test(TEST_STRING));
    assertTrue(bloomFilters.get(0).testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(bloomFilters.get(0).test(TEST_STRING_NOT_WRITTEN));
    assertFalse(bloomFilters.get(0).testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));

    assertEquals(bloomFilterWrite.getNumBits(), bloomFilters.get(0).getNumBits());
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions());

    // Validate bit set
    assertTrue(Arrays.equals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet()));

    // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
    CodedInputStream input = CodedInputStream.newInstance(bloomFilterBytes.getBytes());
    OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input);
    List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList();
    assertEquals(bloomFilterList.size(), 1);
    OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0);

    // Validate contents of ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet()));
    // hash functions
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions());
    // bit size
    assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount());
}
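Apart from the serialization round trip, the Bloom filter API exercised above can be used on its own. A minimal sketch, restricted to the methods that appear in the test (add, test, testSlice); the byte-array values here are hypothetical examples, not the TEST_STRING constants defined elsewhere in TestOrcBloomFilters, and the sketch assumes io.trino.orc.metadata.statistics.BloomFilter, io.airlift.slice.Slices, and a static import of UTF_8:

// Minimal sketch only; the values and the 5% false-positive rate are illustrative.
byte[] present = "present".getBytes(UTF_8);
byte[] absent = "absent".getBytes(UTF_8);

BloomFilter filter = new BloomFilter(1000L, 0.05);
filter.add(present);

// a Bloom filter never produces false negatives, so the written value must test positive
assertTrue(filter.test(present));
assertTrue(filter.testSlice(Slices.wrappedBuffer(present)));
// an unwritten value is rejected with high probability at the configured false-positive rate
assertFalse(filter.test(absent));
assertFalse(filter.testSlice(Slices.wrappedBuffer(absent)));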
Use of io.trino.orc.metadata.OrcMetadataReader in project trino by trinodb: class TestOrcWriter, method testWriteOutputStreamsInOrder. The test writes five VARCHAR columns whose values have deliberately different sizes, then reads each stripe footer back with OrcMetadataReader and asserts that all index streams come before the data streams and that the data streams appear in non-decreasing size order.
@Test
public void testWriteOutputStreamsInOrder()
        throws IOException
{
    for (OrcWriteValidationMode validationMode : OrcWriteValidationMode.values()) {
        TempFile tempFile = new TempFile();
        List<String> columnNames = ImmutableList.of("test1", "test2", "test3", "test4", "test5");
        List<Type> types = ImmutableList.of(VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR);
        OrcWriter writer = new OrcWriter(
                new OutputStreamOrcDataSink(new FileOutputStream(tempFile.getFile())),
                ImmutableList.of("test1", "test2", "test3", "test4", "test5"),
                types,
                OrcType.createRootOrcType(columnNames, types),
                NONE,
                new OrcWriterOptions()
                        .withStripeMinSize(DataSize.of(0, MEGABYTE))
                        .withStripeMaxSize(DataSize.of(32, MEGABYTE))
                        .withStripeMaxRowCount(ORC_STRIPE_SIZE)
                        .withRowGroupMaxRowCount(ORC_ROW_GROUP_SIZE)
                        .withDictionaryMaxMemory(DataSize.of(32, MEGABYTE))
                        .withBloomFilterColumns(ImmutableSet.copyOf(columnNames)),
                ImmutableMap.of(),
                true,
                validationMode,
                new OrcWriterStats());

        // write down some data with unsorted streams
        String[] data = new String[] {"a", "bbbbb", "ccc", "dd", "eeee"};
        Block[] blocks = new Block[data.length];
        int entries = 65536;
        BlockBuilder blockBuilder = VARCHAR.createBlockBuilder(null, entries);
        for (int i = 0; i < data.length; i++) {
            byte[] bytes = data[i].getBytes(UTF_8);
            for (int j = 0; j < entries; j++) {
                // force to write different data
                bytes[0] = (byte) ((bytes[0] + 1) % 128);
                blockBuilder.writeBytes(Slices.wrappedBuffer(bytes, 0, bytes.length), 0, bytes.length);
                blockBuilder.closeEntry();
            }
            blocks[i] = blockBuilder.build();
            blockBuilder = blockBuilder.newBlockBuilderLike(null);
        }
        writer.write(new Page(blocks));
        writer.close();

        // read the footer and verify the streams are ordered by size
        OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), READER_OPTIONS);
        Footer footer = OrcReader.createOrcReader(orcDataSource, READER_OPTIONS)
                .orElseThrow(() -> new RuntimeException("File is empty"))
                .getFooter();
        // OrcReader closes the original data source because it buffers the full file, so we need to reopen
        orcDataSource = new FileOrcDataSource(tempFile.getFile(), READER_OPTIONS);
        for (StripeInformation stripe : footer.getStripes()) {
            // read the stripe footer
            Slice tailBuffer = orcDataSource.readFully(
                    stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(),
                    toIntExact(stripe.getFooterLength()));
            try (InputStream inputStream = new OrcInputStream(OrcChunkLoader.create(orcDataSource.getId(), tailBuffer, Optional.empty(), newSimpleAggregatedMemoryContext()))) {
                StripeFooter stripeFooter = new OrcMetadataReader().readStripeFooter(footer.getTypes(), inputStream, ZoneId.of("UTC"));
                int size = 0;
                boolean dataStreamStarted = false;
                for (Stream stream : stripeFooter.getStreams()) {
                    if (isIndexStream(stream)) {
                        assertFalse(dataStreamStarted);
                        continue;
                    }
                    dataStreamStarted = true;
                    // verify sizes in order
                    assertGreaterThanOrEqual(stream.getLength(), size);
                    size = stream.getLength();
                }
            }
        }
    }
}
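The inner loop above asserts the ordering incrementally while it walks the streams. As a design note, the same data-stream ordering property can be stated by collecting the lengths and comparing them against a sorted copy. The sketch below assumes the same Stream type and isIndexStream helper the test imports and would run inside the try block where stripeFooter is in scope; it is not the form the Trino test itself uses, and it does not cover the separate index-before-data check:

// Sketch only: equivalent ordering check over one stripe footer.
// Assumes java.util.ArrayList and java.util.Collections in addition to the test's imports.
List<Integer> dataStreamLengths = new ArrayList<>();
for (Stream stream : stripeFooter.getStreams()) {
    if (!isIndexStream(stream)) {
        dataStreamLengths.add(stream.getLength());
    }
}
List<Integer> sortedLengths = new ArrayList<>(dataStreamLengths);
Collections.sort(sortedLengths);
// the writer is expected to emit data streams in non-decreasing size order
assertEquals(dataStreamLengths, sortedLengths);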