Search in sources :

Example 41 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class OrcWriter method bufferStripeData.

/**
 * Collect the data for the stripe.  This is not the actual data, but
 * instead are functions that know how to write the data.
 * <p>
 * For a non-empty stripe the returned list is ordered as: the index streams and
 * bloom filter streams of each column writer (in column writer order), then all
 * data streams in their sorted order, then the serialized stripe footer.
 * As a side effect the stripe is registered in {@code closedStripes}, the
 * retained-bytes counter is updated, and the stripe is reported to the write
 * validation (if any) and to {@code stats}.
 *
 * @param stripeStartOffset offset recorded as the start of this stripe in the
 *        stripe information and in the validation data
 * @param flushReason why the stripe is being flushed; must be {@code CLOSED}
 *        when the stripe has no rows
 * @return the outputs to write for this stripe, or an empty list when the
 *         stripe has no rows
 * @throws IOException declared by the signature; which callee raises it is not
 *         visible in this method
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    // An empty stripe is only tolerated when the writer is being closed; nothing is emitted for it.
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // column writers must be closed or the reset call will fail
        columnWriters.forEach(ColumnWriter::close);
        return ImmutableList.of();
    }
    // Finish the row group that is still open before the writers are closed.
    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }
    // convert any dictionary encoded column with a low compression ratio to direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);
    columnWriters.forEach(ColumnWriter::close);
    // outputData holds the writable outputs; allStreams holds the matching stream descriptors
    // for the stripe footer. Both are appended to in lock-step so their order agrees.
    List<OrcDataOutput> outputData = new ArrayList<>();
    List<Stream> allStreams = new ArrayList<>(columnWriters.size() * 3);
    // get index streams
    long indexLength = 0;
    for (ColumnWriter columnWriter : columnWriters) {
        for (StreamDataOutput indexStream : columnWriter.getIndexStreams(metadataWriter)) {
            // The ordering is critical because the streams only contain a length with no offset.
            outputData.add(indexStream);
            allStreams.add(indexStream.getStream());
            indexLength += indexStream.size();
        }
        // Bloom filter streams are counted as part of the index section length.
        for (StreamDataOutput bloomFilter : columnWriter.getBloomFilters(metadataWriter)) {
            outputData.add(bloomFilter);
            allStreams.add(bloomFilter.getStream());
            indexLength += bloomFilter.size();
        }
    }
    // data streams (sorted by size)
    long dataLength = 0;
    List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter columnWriter : columnWriters) {
        List<StreamDataOutput> streams = columnWriter.getDataStreams();
        dataStreams.addAll(streams);
        dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
    }
    // Uses the natural ordering of StreamDataOutput (by size per the comment above — verify against its compareTo).
    Collections.sort(dataStreams);
    // add data streams
    for (StreamDataOutput dataStream : dataStreams) {
        // The ordering is critical because the streams only contain a length with no offset.
        outputData.add(dataStream);
        allStreams.add(dataStream.getStream());
    }
    // Gather the per-column encodings and stripe statistics reported by every column writer.
    Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
    Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
    // the 0th column is a struct column for the whole row
    columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    // Root statistics are built from stripeRowCount; every other constructor argument is 0 or null.
    columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null, null));
    // add footer
    StripeFooter stripeFooter = new StripeFooter(allStreams, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    outputData.add(createDataOutput(footer));
    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length());
    // Remember the closed stripe and account for the memory it retains.
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
    return outputData;
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.trino.orc.metadata.OrcColumnId) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StripeStatistics(io.trino.orc.metadata.statistics.StripeStatistics) ColumnWriters.createColumnWriter(io.trino.orc.writer.ColumnWriters.createColumnWriter) ColumnWriter(io.trino.orc.writer.ColumnWriter) SliceDictionaryColumnWriter(io.trino.orc.writer.SliceDictionaryColumnWriter) StreamDataOutput(io.trino.orc.stream.StreamDataOutput) OrcDataOutput(io.trino.orc.stream.OrcDataOutput) ColumnEncoding(io.trino.orc.metadata.ColumnEncoding) StripeFooter(io.trino.orc.metadata.StripeFooter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Stream(io.trino.orc.metadata.Stream) StripeInformation(io.trino.orc.metadata.StripeInformation)

Example 42 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class OrcWriter method toFileStats.

/**
 * Merges the column statistics of every stripe into one set of statistics per column.
 *
 * @param stripes the column statistics of each stripe; all entries must have the same size
 * @return empty when {@code stripes} is empty, otherwise the merged statistics with one
 *         entry per column id from 0 up to the common size
 * @throws IllegalArgumentException if a stripe's size differs from the first stripe's size
 */
private static Optional<ColumnMetadata<ColumnStatistics>> toFileStats(List<ColumnMetadata<ColumnStatistics>> stripes) {
    if (stripes.isEmpty()) {
        return Optional.empty();
    }

    // Every stripe must describe the same number of columns as the first one.
    int expectedColumns = stripes.get(0).size();
    for (ColumnMetadata<ColumnStatistics> stripe : stripes) {
        checkArgument(expectedColumns == stripe.size());
    }

    ImmutableList.Builder<ColumnStatistics> merged = ImmutableList.builder();
    for (int ordinal = 0; ordinal < expectedColumns; ordinal++) {
        OrcColumnId columnId = new OrcColumnId(ordinal);
        // Gather this column's statistics from each stripe, in stripe order, then merge them.
        List<ColumnStatistics> perStripe = new ArrayList<>();
        for (ColumnMetadata<ColumnStatistics> stripe : stripes) {
            perStripe.add(stripe.get(columnId));
        }
        merged.add(ColumnStatistics.mergeColumnStatistics(perStripe));
    }
    return Optional.of(new ColumnMetadata<>(merged.build()));
}
Also used : ColumnEncoding(io.trino.orc.metadata.ColumnEncoding) DICTIONARY_FULL(io.trino.orc.OrcWriterStats.FlushReason.DICTIONARY_FULL) DIRECT(io.trino.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) OrcWriteValidationBuilder(io.trino.orc.OrcWriteValidation.OrcWriteValidationBuilder) Stream(io.trino.orc.metadata.Stream) StreamDataOutput(io.trino.orc.stream.StreamDataOutput) FlushReason(io.trino.orc.OrcWriterStats.FlushReason) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) BloomFilterBuilder(io.trino.orc.metadata.statistics.BloomFilterBuilder) Slices(io.airlift.slice.Slices) Map(java.util.Map) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) MAGIC(io.trino.orc.metadata.PostScript.MAGIC) ROOT_COLUMN(io.trino.orc.metadata.OrcColumnId.ROOT_COLUMN) OrcMetadataWriter(io.trino.orc.metadata.OrcMetadataWriter) ImmutableSet(com.google.common.collect.ImmutableSet) OrcDataOutput.createDataOutput(io.trino.orc.stream.OrcDataOutput.createDataOutput) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ColumnWriters.createColumnWriter(io.trino.orc.writer.ColumnWriters.createColumnWriter) ColumnWriter(io.trino.orc.writer.ColumnWriter) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Metadata(io.trino.orc.metadata.Metadata) List(java.util.List) ClassLayout(org.openjdk.jol.info.ClassLayout) Entry(java.util.Map.Entry) Optional(java.util.Optional) ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) Slice(io.airlift.slice.Slice) OrcWriteValidationMode(io.trino.orc.OrcWriteValidation.OrcWriteValidationMode) SliceDictionaryColumnWriter(io.trino.orc.writer.SliceDictionaryColumnWriter) OrcDataOutput(io.trino.orc.stream.OrcDataOutput) Type(io.trino.spi.type.Type) Page(io.trino.spi.Page) HashMap(java.util.HashMap) CLOSED(io.trino.orc.OrcWriterStats.FlushReason.CLOSED) OptionalInt(java.util.OptionalInt) 
Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) Utf8BloomFilterBuilder(io.trino.orc.metadata.statistics.Utf8BloomFilterBuilder) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) StripeInformation(io.trino.orc.metadata.StripeInformation) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) MAX_ROWS(io.trino.orc.OrcWriterStats.FlushReason.MAX_ROWS) StripeFooter(io.trino.orc.metadata.StripeFooter) Integer.min(java.lang.Integer.min) OrcType(io.trino.orc.metadata.OrcType) UnsignedBytes(com.google.common.primitives.UnsignedBytes) MAX_BYTES(io.trino.orc.OrcWriterStats.FlushReason.MAX_BYTES) IOException(java.io.IOException) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) CompressionKind(io.trino.orc.metadata.CompressionKind) Footer(io.trino.orc.metadata.Footer) Consumer(java.util.function.Consumer) Collectors.toList(java.util.stream.Collectors.toList) StripeStatistics(io.trino.orc.metadata.statistics.StripeStatistics) Closeable(java.io.Closeable) NoOpBloomFilterBuilder(io.trino.orc.metadata.statistics.NoOpBloomFilterBuilder) OrcReader.validateFile(io.trino.orc.OrcReader.validateFile) Collections(java.util.Collections) CompressedMetadataWriter(io.trino.orc.metadata.CompressedMetadataWriter) OrcColumnId(io.trino.orc.metadata.OrcColumnId) ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.trino.orc.metadata.OrcColumnId) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList)

Example 43 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class OrcWriter method bufferFileFooter.

/**
 * Builds the trailing outputs of the file. Like the stripe outputs, these are
 * deferred writers rather than the written bytes themselves.
 * <p>
 * The returned list holds, in order: the metadata section, the footer, the
 * postscript, and a single byte with the postscript length. As a side effect
 * the file statistics fields are updated and the closed stripes are released.
 *
 * @return the outputs that make up the end of the file
 * @throws IOException declared by the signature; which callee raises it is not
 *         visible in this method
 */
private List<OrcDataOutput> bufferFileFooter() throws IOException {
    List<OrcDataOutput> trailer = new ArrayList<>();

    // metadata section: the statistics of every closed stripe
    List<Optional<StripeStatistics>> stripeStatistics = closedStripes.stream()
            .map(ClosedStripe::getStatistics)
            .map(Optional::of)
            .collect(toList());
    Slice metadataSlice = metadataWriter.writeMetadata(new Metadata(stripeStatistics));
    trailer.add(createDataOutput(metadataSlice));

    // file level statistics are merged from the per-stripe column statistics
    List<ColumnMetadata<ColumnStatistics>> stripeColumnStatistics = closedStripes.stream()
            .map(ClosedStripe::getStatistics)
            .map(StripeStatistics::getColumnStatistics)
            .collect(toList());
    fileStats = toFileStats(stripeColumnStatistics);
    fileStatsRetainedBytes = fileStats
            .map(columns -> columns.stream().mapToLong(ColumnStatistics::getRetainedSizeInBytes).sum())
            .orElse(0L);
    recordValidation(validation -> validation.setFileStatistics(fileStats));

    // user metadata values are stored as UTF-8 slices
    Map<String, Slice> encodedUserMetadata = this.userMetadata.entrySet().stream()
            .collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));

    // a row group size of zero is reported as absent
    OptionalInt rowGroupSize;
    if (rowGroupMaxRowCount == 0) {
        rowGroupSize = OptionalInt.empty();
    }
    else {
        rowGroupSize = OptionalInt.of(rowGroupMaxRowCount);
    }

    // footer; the stripe information must be collected before the closed stripes are cleared
    Footer footer = new Footer(
            fileRowCount,
            rowGroupSize,
            closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toImmutableList()),
            orcTypes,
            fileStats,
            encodedUserMetadata,
            // writer id will be set by MetadataWriter
            Optional.empty());
    closedStripes.clear();
    closedStripesRetainedBytes = 0;

    Slice footerSlice = metadataWriter.writeFooter(footer);
    trailer.add(createDataOutput(footerSlice));
    recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));

    // postscript, followed by one byte holding its length
    Slice postscriptSlice = metadataWriter.writePostscript(footerSlice.length(), metadataSlice.length(), compression, maxCompressionBufferSize);
    trailer.add(createDataOutput(postscriptSlice));
    byte postscriptLength = UnsignedBytes.checkedCast(postscriptSlice.length());
    trailer.add(createDataOutput(Slices.wrappedBuffer(postscriptLength)));
    return trailer;
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) ColumnEncoding(io.trino.orc.metadata.ColumnEncoding) DICTIONARY_FULL(io.trino.orc.OrcWriterStats.FlushReason.DICTIONARY_FULL) DIRECT(io.trino.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) OrcWriteValidationBuilder(io.trino.orc.OrcWriteValidation.OrcWriteValidationBuilder) Stream(io.trino.orc.metadata.Stream) StreamDataOutput(io.trino.orc.stream.StreamDataOutput) FlushReason(io.trino.orc.OrcWriterStats.FlushReason) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) BloomFilterBuilder(io.trino.orc.metadata.statistics.BloomFilterBuilder) Slices(io.airlift.slice.Slices) Map(java.util.Map) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) MAGIC(io.trino.orc.metadata.PostScript.MAGIC) ROOT_COLUMN(io.trino.orc.metadata.OrcColumnId.ROOT_COLUMN) OrcMetadataWriter(io.trino.orc.metadata.OrcMetadataWriter) ImmutableSet(com.google.common.collect.ImmutableSet) OrcDataOutput.createDataOutput(io.trino.orc.stream.OrcDataOutput.createDataOutput) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ColumnWriters.createColumnWriter(io.trino.orc.writer.ColumnWriters.createColumnWriter) ColumnWriter(io.trino.orc.writer.ColumnWriter) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Metadata(io.trino.orc.metadata.Metadata) List(java.util.List) ClassLayout(org.openjdk.jol.info.ClassLayout) Entry(java.util.Map.Entry) Optional(java.util.Optional) ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) Slice(io.airlift.slice.Slice) OrcWriteValidationMode(io.trino.orc.OrcWriteValidation.OrcWriteValidationMode) SliceDictionaryColumnWriter(io.trino.orc.writer.SliceDictionaryColumnWriter) OrcDataOutput(io.trino.orc.stream.OrcDataOutput) Type(io.trino.spi.type.Type) Page(io.trino.spi.Page) HashMap(java.util.HashMap) 
CLOSED(io.trino.orc.OrcWriterStats.FlushReason.CLOSED) OptionalInt(java.util.OptionalInt) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) Utf8BloomFilterBuilder(io.trino.orc.metadata.statistics.Utf8BloomFilterBuilder) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) StripeInformation(io.trino.orc.metadata.StripeInformation) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) MAX_ROWS(io.trino.orc.OrcWriterStats.FlushReason.MAX_ROWS) StripeFooter(io.trino.orc.metadata.StripeFooter) Integer.min(java.lang.Integer.min) OrcType(io.trino.orc.metadata.OrcType) UnsignedBytes(com.google.common.primitives.UnsignedBytes) MAX_BYTES(io.trino.orc.OrcWriterStats.FlushReason.MAX_BYTES) IOException(java.io.IOException) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) CompressionKind(io.trino.orc.metadata.CompressionKind) Footer(io.trino.orc.metadata.Footer) Consumer(java.util.function.Consumer) Collectors.toList(java.util.stream.Collectors.toList) StripeStatistics(io.trino.orc.metadata.statistics.StripeStatistics) Closeable(java.io.Closeable) NoOpBloomFilterBuilder(io.trino.orc.metadata.statistics.NoOpBloomFilterBuilder) OrcReader.validateFile(io.trino.orc.OrcReader.validateFile) Collections(java.util.Collections) CompressedMetadataWriter(io.trino.orc.metadata.CompressedMetadataWriter) OrcColumnId(io.trino.orc.metadata.OrcColumnId) Optional(java.util.Optional) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) ArrayList(java.util.ArrayList) Metadata(io.trino.orc.metadata.Metadata) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) StripeFooter(io.trino.orc.metadata.StripeFooter) Footer(io.trino.orc.metadata.Footer) OrcDataOutput(io.trino.orc.stream.OrcDataOutput)

Example 44 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class StripeReader method readColumnIndexes.

/**
 * Reads the row group indexes of every {@code ROW_INDEX} stream and, when bloom
 * filters are available for the stream's column, attaches them to the row group
 * statistics.
 *
 * @param streams the streams to inspect; only those of kind {@code ROW_INDEX} are read
 * @param streamsData the chunk loader for each stream, looked up by stream id
 * @param bloomFilterIndexes bloom filters per column id; a missing or empty entry
 *        leaves the row group indexes as read
 * @return the row group indexes keyed by the id of the stream they were read from
 * @throws IOException declared by the signature; which callee raises it is not
 *         visible in this method
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, Map<OrcColumnId, List<BloomFilter>> bloomFilterIndexes) throws IOException {
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> indexesByStream = ImmutableMap.builder();
    for (Entry<StreamId, Stream> candidate : streams.entrySet()) {
        if (candidate.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }
        StreamId streamId = candidate.getKey();
        OrcInputStream indexInput = new OrcInputStream(streamsData.get(streamId));
        List<BloomFilter> columnBloomFilters = bloomFilterIndexes.get(streamId.getColumnId());
        List<RowGroupIndex> indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);

        boolean hasBloomFilters = columnBloomFilters != null && !columnBloomFilters.isEmpty();
        if (hasBloomFilters) {
            // Pair each row group index with the bloom filter at the same position.
            ImmutableList.Builder<RowGroupIndex> enriched = ImmutableList.builder();
            int position = 0;
            for (RowGroupIndex index : indexes) {
                ColumnStatistics withFilter = index.getColumnStatistics().withBloomFilter(columnBloomFilters.get(position));
                enriched.add(new RowGroupIndex(index.getPositions(), withFilter));
                position++;
            }
            indexes = enriched.build();
        }
        indexesByStream.put(streamId, indexes);
    }
    return indexesByStream.buildOrThrow();
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcInputStream(io.trino.orc.stream.OrcInputStream) ImmutableList(com.google.common.collect.ImmutableList) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) StreamCheckpoint(io.trino.orc.checkpoint.StreamCheckpoint) Checkpoints.getDictionaryStreamCheckpoint(io.trino.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) RowGroupIndex(io.trino.orc.metadata.RowGroupIndex) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Stream(io.trino.orc.metadata.Stream) OrcInputStream(io.trino.orc.stream.OrcInputStream) ValueInputStream(io.trino.orc.stream.ValueInputStream) InputStream(java.io.InputStream)

Example 45 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class StripeReader method selectRowGroups.

/**
 * Picks the row groups of a stripe whose statistics satisfy the predicate.
 *
 * @param stripe the stripe whose row count determines how many row groups are examined
 * @param columnIndexes the row group indexes used to derive each row group's statistics
 * @return the ordinals of the row groups for which the predicate matches
 * @throws IllegalStateException if the number of rows per row group is not known
 */
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes) {
    int groupSize = this.rowsInRowGroup.orElseThrow(() -> new IllegalStateException("Cannot create row groups if row group info is missing"));
    int stripeRows = stripe.getNumberOfRows();
    int groupCount = ceil(stripeRows, groupSize);

    ImmutableSet.Builder<Integer> matching = ImmutableSet.builder();
    int unassignedRows = stripeRows;
    int groupOrdinal = 0;
    while (groupOrdinal < groupCount) {
        // Every group holds groupSize rows except possibly the last, which takes what is left.
        int groupRows = unassignedRows < groupSize ? unassignedRows : groupSize;
        ColumnMetadata<ColumnStatistics> groupStatistics = getRowGroupStatistics(types, columnIndexes, groupOrdinal);
        if (predicate.matches(groupRows, groupStatistics)) {
            matching.add(groupOrdinal);
        }
        unassignedRows -= groupRows;
        groupOrdinal++;
    }
    return matching.build();
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) ImmutableSet(com.google.common.collect.ImmutableSet) StreamCheckpoint(io.trino.orc.checkpoint.StreamCheckpoint) Checkpoints.getDictionaryStreamCheckpoint(io.trino.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint)

Aggregations

ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics)45 Slice (io.airlift.slice.Slice)23 Stream (io.trino.orc.metadata.Stream)23 StreamDataOutput (io.trino.orc.stream.StreamDataOutput)20 ArrayList (java.util.ArrayList)20 ImmutableList (com.google.common.collect.ImmutableList)19 List (java.util.List)19 PresentOutputStream (io.trino.orc.stream.PresentOutputStream)17 RowGroupIndex (io.trino.orc.metadata.RowGroupIndex)15 BooleanStreamCheckpoint (io.trino.orc.checkpoint.BooleanStreamCheckpoint)12 OrcColumnId (io.trino.orc.metadata.OrcColumnId)12 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)9 LongOutputStream (io.trino.orc.stream.LongOutputStream)9 BloomFilter (io.trino.orc.metadata.statistics.BloomFilter)8 ImmutableMap (com.google.common.collect.ImmutableMap)7 LongStreamCheckpoint (io.trino.orc.checkpoint.LongStreamCheckpoint)7 ColumnEncoding (io.trino.orc.metadata.ColumnEncoding)5 ColumnMetadata (io.trino.orc.metadata.ColumnMetadata)5 StripeFooter (io.trino.orc.metadata.StripeFooter)5 LongOutputStream.createLengthOutputStream (io.trino.orc.stream.LongOutputStream.createLengthOutputStream)5