
Example 1 with OrcMetadataReader

Use of io.trino.orc.metadata.OrcMetadataReader in project trino by trinodb.

From class TestSliceDictionaryColumnReader, method testDictionaryReaderUpdatesRetainedSize:
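The test below writes a VARCHAR column to a temporary ORC file, then reads it back through a SliceDictionaryColumnReader. The OrcMetadataReader is handed to StripeReader, which uses it to decode stripe metadata; after each 1000-row batch the test asserts that the memory context's accounting matches the reader's reported retained size, and that it drops to zero once the reader is closed.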

@Test
public void testDictionaryReaderUpdatesRetainedSize() throws Exception {
    // create orc file
    List<String> values = createValues();
    File temporaryDirectory = createTempDir();
    File orcFile = new File(temporaryDirectory, randomUUID().toString());
    writeOrcColumnTrino(orcFile, NONE, VARCHAR, values.iterator(), new OrcWriterStats());
    // prepare for read
    OrcDataSource dataSource = new MemoryOrcDataSource(new OrcDataSourceId(orcFile.getPath()),
            Slices.wrappedBuffer(readAllBytes(orcFile.toPath())));
    OrcReader orcReader = OrcReader.createOrcReader(dataSource, new OrcReaderOptions())
            .orElseThrow(() -> new RuntimeException("File is empty"));
    Footer footer = orcReader.getFooter();
    List<OrcColumn> columns = orcReader.getRootColumn().getNestedColumns();
    assertEquals(columns.size(), 1);
    StripeReader stripeReader = new StripeReader(dataSource, UTC, Optional.empty(), footer.getTypes(),
            ImmutableSet.copyOf(columns), footer.getRowsInRowGroup(), OrcPredicate.TRUE, ORIGINAL,
            new OrcMetadataReader(), Optional.empty());
    AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
    SliceDictionaryColumnReader columnReader = new SliceDictionaryColumnReader(columns.get(0),
            memoryContext.newLocalMemoryContext(TestSliceDictionaryColumnReader.class.getSimpleName()), -1, false);
    List<StripeInformation> stripeInformations = footer.getStripes();
    for (StripeInformation stripeInformation : stripeInformations) {
        Stripe stripe = stripeReader.readStripe(stripeInformation, newSimpleAggregatedMemoryContext());
        List<RowGroup> rowGroups = stripe.getRowGroups();
        columnReader.startStripe(stripe.getFileTimeZone(), stripe.getDictionaryStreamSources(), stripe.getColumnEncodings());
        for (RowGroup rowGroup : rowGroups) {
            columnReader.startRowGroup(rowGroup.getStreamSources());
            columnReader.prepareNextRead(1000);
            columnReader.readBlock();
            // memory usage check
            assertEquals(memoryContext.getBytes(), columnReader.getRetainedSizeInBytes());
        }
    }
    columnReader.close();
    assertEquals(memoryContext.getBytes(), 0);
}
Also used: SliceDictionaryColumnReader(io.trino.orc.reader.SliceDictionaryColumnReader) OrcMetadataReader(io.trino.orc.metadata.OrcMetadataReader) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) Footer(io.trino.orc.metadata.Footer) File(java.io.File) StripeInformation(io.trino.orc.metadata.StripeInformation) Test(org.testng.annotations.Test)

Example 2 with OrcMetadataReader

Use of io.trino.orc.metadata.OrcMetadataReader in project trino by trinodb.

From class TestOrcBloomFilters, method testOrcHiveBloomFilterSerde:
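The test below round-trips a bloom filter: it serializes one through CompressedMetadataWriter, reads it back both via OrcMetadataReader.readBloomFilterIndexes() and by parsing the underlying protobuf directly with CodedInputStream, and verifies that membership tests, bit sets, and hash-function counts agree between the written and read filters.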

@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05);
    bloomFilterWrite.add(TEST_STRING);
    assertTrue(bloomFilterWrite.test(TEST_STRING));
    assertTrue(bloomFilterWrite.testSlice(wrappedBuffer(TEST_STRING)));
    Slice bloomFilterBytes = new CompressedMetadataWriter(new OrcMetadataWriter(WriterIdentification.TRINO),
            CompressionKind.NONE, 1024).writeBloomFilters(ImmutableList.of(bloomFilterWrite));
    // Read through method
    InputStream inputStream = bloomFilterBytes.getInput();
    OrcMetadataReader metadataReader = new OrcMetadataReader();
    List<BloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);
    assertEquals(bloomFilters.size(), 1);
    assertTrue(bloomFilters.get(0).test(TEST_STRING));
    assertTrue(bloomFilters.get(0).testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(bloomFilters.get(0).test(TEST_STRING_NOT_WRITTEN));
    assertFalse(bloomFilters.get(0).testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));
    assertEquals(bloomFilterWrite.getNumBits(), bloomFilters.get(0).getNumBits());
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions());
    // Validate bit set
    assertTrue(Arrays.equals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet()));
    // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
    CodedInputStream input = CodedInputStream.newInstance(bloomFilterBytes.getBytes());
    OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input);
    List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList();
    assertEquals(bloomFilterList.size(), 1);
    OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0);
    // Validate contents of ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet()));
    // hash functions
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions());
    // bit size
    assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount());
}
Also used: CompressedMetadataWriter(io.trino.orc.metadata.CompressedMetadataWriter) Slice(io.airlift.slice.Slice) CodedInputStream(io.trino.orc.protobuf.CodedInputStream) InputStream(java.io.InputStream) OrcMetadataReader(io.trino.orc.metadata.OrcMetadataReader) OrcProto(io.trino.orc.proto.OrcProto) OrcMetadataWriter(io.trino.orc.metadata.OrcMetadataWriter) BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(io.trino.orc.TupleDomainOrcPredicate.checkInBloomFilter) Test(org.testng.annotations.Test)
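In isolation, the serde round trip above reduces to a handful of calls. Here is a minimal sketch using only the classes shown in this example; the key bytes ("city") are a hypothetical stand-in for TEST_STRING:

// Build and populate a filter (expected entry count, target false-positive probability)
byte[] key = "city".getBytes(UTF_8);
BloomFilter filter = new BloomFilter(1000L, 0.05);
filter.add(key);
// Serialize through the metadata writer (uncompressed here)
Slice serialized = new CompressedMetadataWriter(new OrcMetadataWriter(WriterIdentification.TRINO),
        CompressionKind.NONE, 1024).writeBloomFilters(ImmutableList.of(filter));
// OrcMetadataReader decodes the serialized index back into filters that can be probed directly
List<BloomFilter> readBack = new OrcMetadataReader().readBloomFilterIndexes(serialized.getInput());
assertTrue(readBack.get(0).test(key));
assertTrue(readBack.get(0).testSlice(wrappedBuffer(key)));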

Example 3 with OrcMetadataReader

Use of io.trino.orc.metadata.OrcMetadataReader in project trino by trinodb.

From class TestOrcWriter, method testWriteOutputStreamsInOrder:
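The test below writes five VARCHAR columns (65536 rows each) with bloom filters enabled on every column, then reopens the file and uses OrcMetadataReader.readStripeFooter() to decode each stripe footer straight from the data source, asserting that all index streams precede the data streams and that data streams appear in non-decreasing size order.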

@Test
public void testWriteOutputStreamsInOrder() throws IOException {
    for (OrcWriteValidationMode validationMode : OrcWriteValidationMode.values()) {
        TempFile tempFile = new TempFile();
        List<String> columnNames = ImmutableList.of("test1", "test2", "test3", "test4", "test5");
        List<Type> types = ImmutableList.of(VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR);
        OrcWriter writer = new OrcWriter(
                new OutputStreamOrcDataSink(new FileOutputStream(tempFile.getFile())),
                columnNames,
                types,
                OrcType.createRootOrcType(columnNames, types),
                NONE,
                new OrcWriterOptions()
                        .withStripeMinSize(DataSize.of(0, MEGABYTE))
                        .withStripeMaxSize(DataSize.of(32, MEGABYTE))
                        .withStripeMaxRowCount(ORC_STRIPE_SIZE)
                        .withRowGroupMaxRowCount(ORC_ROW_GROUP_SIZE)
                        .withDictionaryMaxMemory(DataSize.of(32, MEGABYTE))
                        .withBloomFilterColumns(ImmutableSet.copyOf(columnNames)),
                ImmutableMap.of(),
                true,
                validationMode,
                new OrcWriterStats());
        // write down some data with unsorted streams
        String[] data = new String[] { "a", "bbbbb", "ccc", "dd", "eeee" };
        Block[] blocks = new Block[data.length];
        int entries = 65536;
        BlockBuilder blockBuilder = VARCHAR.createBlockBuilder(null, entries);
        for (int i = 0; i < data.length; i++) {
            byte[] bytes = data[i].getBytes(UTF_8);
            for (int j = 0; j < entries; j++) {
                // force to write different data
                bytes[0] = (byte) ((bytes[0] + 1) % 128);
                blockBuilder.writeBytes(Slices.wrappedBuffer(bytes, 0, bytes.length), 0, bytes.length);
                blockBuilder.closeEntry();
            }
            blocks[i] = blockBuilder.build();
            blockBuilder = blockBuilder.newBlockBuilderLike(null);
        }
        writer.write(new Page(blocks));
        writer.close();
        // read the footer and verify the streams are ordered by size
        OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), READER_OPTIONS);
        Footer footer = OrcReader.createOrcReader(orcDataSource, READER_OPTIONS)
                .orElseThrow(() -> new RuntimeException("File is empty"))
                .getFooter();
        // OrcReader closes the original data source because it buffers the full file, so we need to reopen
        orcDataSource = new FileOrcDataSource(tempFile.getFile(), READER_OPTIONS);
        for (StripeInformation stripe : footer.getStripes()) {
            // read the footer
            Slice tailBuffer = orcDataSource.readFully(
                    stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(),
                    toIntExact(stripe.getFooterLength()));
            try (InputStream inputStream = new OrcInputStream(OrcChunkLoader.create(
                    orcDataSource.getId(), tailBuffer, Optional.empty(), newSimpleAggregatedMemoryContext()))) {
                StripeFooter stripeFooter = new OrcMetadataReader()
                        .readStripeFooter(footer.getTypes(), inputStream, ZoneId.of("UTC"));
                int size = 0;
                boolean dataStreamStarted = false;
                for (Stream stream : stripeFooter.getStreams()) {
                    if (isIndexStream(stream)) {
                        assertFalse(dataStreamStarted);
                        continue;
                    }
                    dataStreamStarted = true;
                    // verify sizes in order
                    assertGreaterThanOrEqual(stream.getLength(), size);
                    size = stream.getLength();
                }
            }
        }
    }
}
Also used: Page(io.trino.spi.Page) OrcWriteValidationMode(io.trino.orc.OrcWriteValidation.OrcWriteValidationMode) StripeReader.isIndexStream(io.trino.orc.StripeReader.isIndexStream) Stream(io.trino.orc.metadata.Stream) OrcInputStream(io.trino.orc.stream.OrcInputStream) FileOutputStream(java.io.FileOutputStream) InputStream(java.io.InputStream) BlockBuilder(io.trino.spi.block.BlockBuilder) OrcMetadataReader(io.trino.orc.metadata.OrcMetadataReader) Type(io.trino.spi.type.Type) OrcType(io.trino.orc.metadata.OrcType) StripeFooter(io.trino.orc.metadata.StripeFooter) Slice(io.airlift.slice.Slice) Footer(io.trino.orc.metadata.Footer) Block(io.trino.spi.block.Block) StripeInformation(io.trino.orc.metadata.StripeInformation) Test(org.testng.annotations.Test)
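The metadata-decoding step at the heart of the loop above, condensed into a sketch; it assumes footer, orcDataSource, and a stripe taken from footer.getStripes(), exactly as in the test:

// Read the raw stripe footer bytes from the tail of the stripe
Slice tail = orcDataSource.readFully(
        stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(),
        toIntExact(stripe.getFooterLength()));
// Wrap them in an OrcInputStream and let OrcMetadataReader decode the footer
try (InputStream in = new OrcInputStream(OrcChunkLoader.create(
        orcDataSource.getId(), tail, Optional.empty(), newSimpleAggregatedMemoryContext()))) {
    StripeFooter stripeFooter = new OrcMetadataReader()
            .readStripeFooter(footer.getTypes(), in, ZoneId.of("UTC"));
    // stripeFooter.getStreams() lists the stripe's streams; the test above asserts their ordering
}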

Aggregations

OrcMetadataReader (io.trino.orc.metadata.OrcMetadataReader): 3
Test (org.testng.annotations.Test): 3
Slice (io.airlift.slice.Slice): 2
Footer (io.trino.orc.metadata.Footer): 2
StripeInformation (io.trino.orc.metadata.StripeInformation): 2
InputStream (java.io.InputStream): 2
AggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext): 1
AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext): 1
OrcWriteValidationMode (io.trino.orc.OrcWriteValidation.OrcWriteValidationMode): 1
StripeReader.isIndexStream (io.trino.orc.StripeReader.isIndexStream): 1
TupleDomainOrcPredicate.checkInBloomFilter (io.trino.orc.TupleDomainOrcPredicate.checkInBloomFilter): 1
CompressedMetadataWriter (io.trino.orc.metadata.CompressedMetadataWriter): 1
OrcMetadataWriter (io.trino.orc.metadata.OrcMetadataWriter): 1
OrcType (io.trino.orc.metadata.OrcType): 1
Stream (io.trino.orc.metadata.Stream): 1
StripeFooter (io.trino.orc.metadata.StripeFooter): 1
BloomFilter (io.trino.orc.metadata.statistics.BloomFilter): 1
OrcProto (io.trino.orc.proto.OrcProto): 1
CodedInputStream (io.trino.orc.protobuf.CodedInputStream): 1
SliceDictionaryColumnReader (io.trino.orc.reader.SliceDictionaryColumnReader): 1