Search in sources :

Example 6 with StripeInformation

use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.

the class TestOrcWriter method testWriteOutputStreamsInOrder.

/**
 * Writes a single page of five VARCHAR columns once per {@link OrcWriteValidationMode} and
 * checks the stream list of every stripe footer: all index streams must come before the
 * first data stream, and the data streams must appear in non-decreasing order of length.
 *
 * @throws IOException if writing or reading the temporary ORC file fails
 */
@Test
public void testWriteOutputStreamsInOrder() throws IOException {
    for (OrcWriteValidationMode validationMode : OrcWriteValidationMode.values()) {
        // release the temp file at the end of every iteration instead of leaking one per validation mode
        try (TempFile tempFile = new TempFile()) {
            List<String> columnNames = ImmutableList.of("test1", "test2", "test3", "test4", "test5");
            List<Type> types = ImmutableList.of(VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR);
            OrcWriterOptions writerOptions = new OrcWriterOptions()
                    .withStripeMinSize(DataSize.of(0, MEGABYTE))
                    .withStripeMaxSize(DataSize.of(32, MEGABYTE))
                    .withStripeMaxRowCount(ORC_STRIPE_SIZE)
                    .withRowGroupMaxRowCount(ORC_ROW_GROUP_SIZE)
                    .withDictionaryMaxMemory(DataSize.of(32, MEGABYTE))
                    .withBloomFilterColumns(ImmutableSet.copyOf(columnNames));
            OrcWriter writer = new OrcWriter(
                    new OutputStreamOrcDataSink(new FileOutputStream(tempFile.getFile())),
                    columnNames,
                    types,
                    OrcType.createRootOrcType(columnNames, types),
                    NONE,
                    writerOptions,
                    ImmutableMap.of(),
                    true,
                    validationMode,
                    new OrcWriterStats());

            // write down some data with unsorted streams: the seed values have different lengths
            String[] data = new String[] { "a", "bbbbb", "ccc", "dd", "eeee" };
            Block[] blocks = new Block[data.length];
            int entries = 65536;
            BlockBuilder blockBuilder = VARCHAR.createBlockBuilder(null, entries);
            for (int i = 0; i < data.length; i++) {
                byte[] bytes = data[i].getBytes(UTF_8);
                for (int j = 0; j < entries; j++) {
                    // force to write different data by cycling the first byte through 0..127
                    bytes[0] = (byte) ((bytes[0] + 1) % 128);
                    blockBuilder.writeBytes(Slices.wrappedBuffer(bytes, 0, bytes.length), 0, bytes.length);
                    blockBuilder.closeEntry();
                }
                blocks[i] = blockBuilder.build();
                blockBuilder = blockBuilder.newBlockBuilderLike(null);
            }
            writer.write(new Page(blocks));
            writer.close();

            // read the file footer to learn where the stripes are
            OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), READER_OPTIONS);
            Footer footer = OrcReader.createOrcReader(orcDataSource, READER_OPTIONS)
                    .orElseThrow(() -> new RuntimeException("File is empty"))
                    .getFooter();

            // OrcReader closes the original data source because it buffers the full file, so we need to reopen;
            // the reopened source is ours to close, hence the try-with-resources
            try (OrcDataSource stripeDataSource = new FileOrcDataSource(tempFile.getFile(), READER_OPTIONS)) {
                for (StripeInformation stripe : footer.getStripes()) {
                    // the stripe footer follows the index and data sections of the stripe
                    long stripeFooterOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
                    Slice tailBuffer = stripeDataSource.readFully(stripeFooterOffset, toIntExact(stripe.getFooterLength()));
                    try (InputStream inputStream = new OrcInputStream(OrcChunkLoader.create(stripeDataSource.getId(), tailBuffer, Optional.empty(), newSimpleAggregatedMemoryContext()))) {
                        StripeFooter stripeFooter = new OrcMetadataReader().readStripeFooter(footer.getTypes(), inputStream, ZoneId.of("UTC"));
                        int size = 0;
                        boolean dataStreamStarted = false;
                        for (Stream stream : stripeFooter.getStreams()) {
                            if (isIndexStream(stream)) {
                                // an index stream after a data stream means the sections are interleaved
                                assertFalse(dataStreamStarted);
                                continue;
                            }
                            dataStreamStarted = true;
                            // verify sizes in order
                            assertGreaterThanOrEqual(stream.getLength(), size);
                            size = stream.getLength();
                        }
                    }
                }
            }
        }
    }
}
Also used : Page(io.trino.spi.Page) OrcWriteValidationMode(io.trino.orc.OrcWriteValidation.OrcWriteValidationMode) StripeReader.isIndexStream(io.trino.orc.StripeReader.isIndexStream) Stream(io.trino.orc.metadata.Stream) OrcInputStream(io.trino.orc.stream.OrcInputStream) FileOutputStream(java.io.FileOutputStream) InputStream(java.io.InputStream) BlockBuilder(io.trino.spi.block.BlockBuilder) OrcMetadataReader(io.trino.orc.metadata.OrcMetadataReader) Type(io.trino.spi.type.Type) OrcType(io.trino.orc.metadata.OrcType) StripeFooter(io.trino.orc.metadata.StripeFooter) Slice(io.airlift.slice.Slice) Footer(io.trino.orc.metadata.Footer) Block(io.trino.spi.block.Block) StripeInformation(io.trino.orc.metadata.StripeInformation) Test(org.testng.annotations.Test)

Example 7 with StripeInformation

use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.

the class OrcWriter method bufferStripeData.

/**
 * Gathers everything that has to be written for the current stripe. The returned
 * entries are not the stripe bytes themselves; they are outputs that know how to
 * write their part. They are returned in the order index streams, bloom filters
 * (per column writer), data streams sorted by their natural ordering, and finally
 * the serialized stripe footer.
 * <p>
 * As a side effect the stripe is registered in {@code closedStripes}, its retained
 * size is accounted for, write validation is updated, and the stripe is reported
 * to {@code stats}.
 *
 * @param stripeStartOffset offset stored in the stripe's {@link StripeInformation} and
 *         used as the key for its statistics in write validation
 * @param flushReason reason the stripe is being flushed; must be {@code CLOSED} when the
 *         stripe contains no rows
 * @return the ordered outputs for the stripe, or an empty list when the stripe has no rows
 * @throws IOException declared by the signature; presumably raised while serializing
 *         streams or the footer (not visible in this method)
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // the column writers have to be closed, otherwise the reset call will fail
        for (ColumnWriter writer : columnWriters) {
            writer.close();
        }
        return ImmutableList.of();
    }

    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }

    // switch dictionary encoded columns with a poor compression ratio over to direct encoding
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);

    for (ColumnWriter writer : columnWriters) {
        writer.close();
    }

    List<OrcDataOutput> stripeOutputs = new ArrayList<>();
    List<Stream> footerStreams = new ArrayList<>(columnWriters.size() * 3);

    // index section: row index streams followed by bloom filters, column writer by column writer
    long indexLength = 0;
    for (ColumnWriter writer : columnWriters) {
        List<StreamDataOutput> rowIndexes = writer.getIndexStreams(metadataWriter);
        for (StreamDataOutput rowIndex : rowIndexes) {
            // Order matters: the footer records only a length per stream, never an offset.
            stripeOutputs.add(rowIndex);
            footerStreams.add(rowIndex.getStream());
            indexLength += rowIndex.size();
        }
        List<StreamDataOutput> bloomFilters = writer.getBloomFilters(metadataWriter);
        for (StreamDataOutput bloomFilter : bloomFilters) {
            stripeOutputs.add(bloomFilter);
            footerStreams.add(bloomFilter.getStream());
            indexLength += bloomFilter.size();
        }
    }

    // data section: gather the streams of every column writer, then sort them (by size)
    long dataLength = 0;
    List<StreamDataOutput> sortedDataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter writer : columnWriters) {
        List<StreamDataOutput> writerStreams = writer.getDataStreams();
        sortedDataStreams.addAll(writerStreams);
        for (StreamDataOutput writerStream : writerStreams) {
            dataLength += writerStream.size();
        }
    }
    Collections.sort(sortedDataStreams);

    for (StreamDataOutput dataStream : sortedDataStreams) {
        // Order matters: the footer records only a length per stream, never an offset.
        stripeOutputs.add(dataStream);
        footerStreams.add(dataStream.getStream());
    }

    Map<OrcColumnId, ColumnEncoding> encodingsByColumn = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        encodingsByColumn.putAll(writer.getColumnEncodings());
    }
    Map<OrcColumnId, ColumnStatistics> statisticsByColumn = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        statisticsByColumn.putAll(writer.getColumnStripeStatistics());
    }

    // column 0 is the struct column representing the whole row
    encodingsByColumn.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    statisticsByColumn.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null, null));

    // the serialized stripe footer is the last output of the stripe
    StripeFooter stripeFooter = new StripeFooter(footerStreams, toColumnMetadata(encodingsByColumn, orcTypes.size()), ZoneId.of("UTC"));
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    stripeOutputs.add(createDataOutput(footer));

    // build the final stripe statistics and hand them to write validation
    StripeStatistics statistics = new StripeStatistics(toColumnMetadata(statisticsByColumn, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));

    // remember the finished stripe and account for the memory it retains
    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length());
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();

    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(
            flushReason,
            stripeInformation.getTotalLength(),
            stripeInformation.getNumberOfRows(),
            dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
    return stripeOutputs;
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.trino.orc.metadata.OrcColumnId) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StripeStatistics(io.trino.orc.metadata.statistics.StripeStatistics) ColumnWriters.createColumnWriter(io.trino.orc.writer.ColumnWriters.createColumnWriter) ColumnWriter(io.trino.orc.writer.ColumnWriter) SliceDictionaryColumnWriter(io.trino.orc.writer.SliceDictionaryColumnWriter) StreamDataOutput(io.trino.orc.stream.StreamDataOutput) OrcDataOutput(io.trino.orc.stream.OrcDataOutput) ColumnEncoding(io.trino.orc.metadata.ColumnEncoding) StripeFooter(io.trino.orc.metadata.StripeFooter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Stream(io.trino.orc.metadata.Stream) StripeInformation(io.trino.orc.metadata.StripeInformation)

Aggregations

StripeInformation (io.trino.orc.metadata.StripeInformation)7 Test (org.testng.annotations.Test)4 Slice (io.airlift.slice.Slice)2 DataSize (io.airlift.units.DataSize)2 ColumnEncoding (io.trino.orc.metadata.ColumnEncoding)2 Footer (io.trino.orc.metadata.Footer)2 OrcMetadataReader (io.trino.orc.metadata.OrcMetadataReader)2 Stream (io.trino.orc.metadata.Stream)2 StripeFooter (io.trino.orc.metadata.StripeFooter)2 Page (io.trino.spi.Page)2 Block (io.trino.spi.block.Block)2 Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice)1 AggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext)1 AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext)1 OrcWriteValidationMode (io.trino.orc.OrcWriteValidation.OrcWriteValidationMode)1 StatisticsValidation (io.trino.orc.OrcWriteValidation.StatisticsValidation)1 StripeReader.isIndexStream (io.trino.orc.StripeReader.isIndexStream)1 OrcColumnId (io.trino.orc.metadata.OrcColumnId)1 OrcType (io.trino.orc.metadata.OrcType)1 ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics)1