Search in sources :

Example 1 with StripeStatistics

use of io.prestosql.orc.metadata.statistics.StripeStatistics in project hetu-core by openlookeng.

the class OrcWriteValidation method validateStripeStatistics.

/**
 * Validates the column statistics of a single stripe against the statistics
 * recorded for the stripe at the same offset.
 *
 * @param orcDataSourceId identifies the data source in any reported corruption
 * @param stripeOffset offset used to look up the recorded stripe statistics
 * @param actual column statistics to compare with the recorded ones
 * @throws OrcCorruptionException if no statistics were recorded for {@code stripeOffset};
 *         the comparison itself is delegated to {@code validateColumnStatisticsEquivalent}
 */
public void validateStripeStatistics(OrcDataSourceId orcDataSourceId, long stripeOffset, ColumnMetadata<ColumnStatistics> actual) throws OrcCorruptionException {
    StripeStatistics recorded = stripeStatistics.get(stripeOffset);
    if (recorded != null) {
        String context = "Stripe at " + stripeOffset;
        validateColumnStatisticsEquivalent(orcDataSourceId, context, actual, recorded.getColumnStatistics());
        return;
    }
    // nothing was recorded at this offset, so the stripe itself is unexpected
    throw new OrcCorruptionException(orcDataSourceId, "Unexpected stripe at offset %s", stripeOffset);
}
Also used : StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics)

Example 2 with StripeStatistics

use of io.prestosql.orc.metadata.statistics.StripeStatistics in project hetu-core by openlookeng.

the class OrcWriteValidation method validateStripeStatistics.

/**
 * Validates the statistics of every stripe against the statistics recorded
 * for the stripe at the same offset.
 * <p>
 * Structural problems in the supplied lists (more stripes than statistics
 * entries, or an absent statistics entry) are reported as
 * {@link OrcCorruptionException} instead of surfacing as
 * {@code IndexOutOfBoundsException} / {@code NoSuchElementException}.
 *
 * @param orcDataSourceId identifies the data source in any reported corruption
 * @param actualStripes stripes to validate; the offset of each one selects the recorded statistics
 * @param actualStripeStatistics statistics of the stripes, in the same order as {@code actualStripes}
 * @throws NullPointerException if either list is null
 * @throws OrcCorruptionException if the number of statistics entries differs from the recorded
 *         number, if a stripe has no statistics entry, or if the per-stripe validation fails
 */
public void validateStripeStatistics(OrcDataSourceId orcDataSourceId, List<StripeInformation> actualStripes, List<Optional<StripeStatistics>> actualStripeStatistics) throws OrcCorruptionException {
    requireNonNull(actualStripes, "actualStripes is null");
    requireNonNull(actualStripeStatistics, "actualStripeStatistics is null");
    // NOTE(review): the message says "columns", but the check compares the number of
    // per-stripe statistics entries; the text is kept unchanged for compatibility
    if (actualStripeStatistics.size() != stripeStatistics.size()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of columns in stripe statistics");
    }
    // the loop below indexes the statistics list by stripe index, so it must be long enough
    if (actualStripes.size() > actualStripeStatistics.size()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: found %s stripes but only %s stripe statistics", actualStripes.size(), actualStripeStatistics.size());
    }
    for (int stripeIndex = 0; stripeIndex < actualStripes.size(); stripeIndex++) {
        long stripeOffset = actualStripes.get(stripeIndex).getOffset();
        Optional<StripeStatistics> actual = actualStripeStatistics.get(stripeIndex);
        if (!actual.isPresent()) {
            throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: missing statistics for stripe at offset %s", stripeOffset);
        }
        validateStripeStatistics(orcDataSourceId, stripeOffset, actual.get().getColumnStatistics());
    }
}
Also used : StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics)

Example 3 with StripeStatistics

use of io.prestosql.orc.metadata.statistics.StripeStatistics in project hetu-core by openlookeng.

the class OrcWriter method bufferStripeData.

/**
 * Collect the data for the stripe.  This is not the actual data, but
 * instead are functions that know how to write the data.
 * <p>
 * The returned list holds, in order: the index streams of every column writer,
 * the sorted data streams, and finally the stripe footer. The finished stripe is
 * also appended to {@code closedStripes} and reported to the validation
 * recorder and to {@code stats}.
 *
 * @param stripeStartOffset offset stored in the stripe's {@code StripeInformation} and used
 *        as the key when recording the stripe statistics for validation
 * @param flushReason why the stripe is being flushed; must be {@code CLOSED} when the
 *        stripe contains no rows
 * @return the outputs for this stripe, or an empty list when the stripe has no rows
 * @throws IOException declared by this method; NOTE(review): which of the called
 *         helpers actually raises it is not visible here
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    // an empty stripe is only tolerated when the writer is being closed; nothing is emitted for it
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // column writers must be closed or the reset call will fail
        columnWriters.forEach(ColumnWriter::close);
        return ImmutableList.of();
    }
    // finish the row group that is still open, if it has any rows
    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }
    // convert any dictionary encoded column with a low compression ratio to direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);
    columnWriters.forEach(ColumnWriter::close);
    // outputData is what gets written; allStreams is the matching stream list for the stripe footer.
    // Both are appended to in lock step so that their order stays identical.
    List<OrcDataOutput> outputData = new ArrayList<>();
    List<Stream> allStreams = new ArrayList<>(columnWriters.size() * 3);
    // get index streams
    long indexLength = 0;
    for (ColumnWriter columnWriter : columnWriters) {
        for (StreamDataOutput indexStream : columnWriter.getIndexStreams(metadataWriter)) {
            // The ordering is critical because the stream only contain a length with no offset.
            outputData.add(indexStream);
            allStreams.add(indexStream.getStream());
            indexLength += indexStream.size();
        }
    }
    // data streams (sorted by size)
    long dataLength = 0;
    List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter columnWriter : columnWriters) {
        List<StreamDataOutput> streams = columnWriter.getDataStreams();
        dataStreams.addAll(streams);
        dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
    }
    // uses the natural ordering of StreamDataOutput
    Collections.sort(dataStreams);
    // add data streams
    for (StreamDataOutput dataStream : dataStreams) {
        // The ordering is critical because the stream only contain a length with no offset.
        outputData.add(dataStream);
        allStreams.add(dataStream.getStream());
    }
    // merge the per-column encodings and stripe statistics reported by every column writer
    Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
    Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
    // the 0th column is a struct column for the whole row
    columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    // the root entry is built from the stripe row count; every type-specific argument is null
    columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null));
    // add footer
    StripeFooter stripeFooter = new StripeFooter(allStreams, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    outputData.add(createDataOutput(footer));
    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length());
    // remember the closed stripe and account for the memory it retains
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
    return outputData;
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) ColumnWriters.createColumnWriter(io.prestosql.orc.writer.ColumnWriters.createColumnWriter) ColumnWriter(io.prestosql.orc.writer.ColumnWriter) SliceDictionaryColumnWriter(io.prestosql.orc.writer.SliceDictionaryColumnWriter) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Stream(io.prestosql.orc.metadata.Stream) StripeInformation(io.prestosql.orc.metadata.StripeInformation)

Example 4 with StripeStatistics

use of io.prestosql.orc.metadata.statistics.StripeStatistics in project hetu-core by openlookeng.

the class OrcWriter method bufferFileFooter.

/**
 * Collect the data for the file footer.  This is not the actual data, but
 * instead are functions that know how to write the data.
 * <p>
 * The returned list holds, in order: the metadata section built from the
 * statistics of all closed stripes, the footer, the postscript, and one byte
 * containing the postscript length. As a side effect the list of closed
 * stripes is cleared and its retained-size counter is reset.
 * <p>
 * If a pre-close callback is present it is invoked first; a failure of the
 * callback is logged (including its cause) and otherwise ignored.
 *
 * @return the outputs that make up the tail of the file
 * @throws IOException declared by this method; NOTE(review): which of the called
 *         helpers actually raises it is not visible here
 */
private List<OrcDataOutput> bufferFileFooter() throws IOException {
    if (preCloseCallback.isPresent()) {
        try {
            preCloseCallback.get().call();
        } catch (Exception e) {
            // best effort: a failing callback must not prevent the footer from being produced,
            // but the cause is passed to the logger so the failure can be diagnosed
            log.debug(e, "Call pre close call back error");
        }
    }
    List<OrcDataOutput> outputData = new ArrayList<>();
    // metadata section: one statistics entry per closed stripe
    Metadata metadata = new Metadata(closedStripes.stream().map(ClosedStripe::getStatistics).map(Optional::of).collect(toList()));
    Slice metadataSlice = metadataWriter.writeMetadata(metadata);
    outputData.add(createDataOutput(metadataSlice));
    // file-level totals are derived from the closed stripes
    long numberOfRows = closedStripes.stream().mapToLong(stripe -> stripe.getStripeInformation().getNumberOfRows()).sum();
    Optional<ColumnMetadata<ColumnStatistics>> fileStats = toFileStats(closedStripes.stream().map(ClosedStripe::getStatistics).map(StripeStatistics::getColumnStatistics).collect(toList()));
    recordValidation(validation -> validation.setFileStatistics(fileStats));
    Map<String, Slice> localUserMetadata = this.userMetadata.entrySet().stream().collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));
    Footer footer = new Footer(numberOfRows, rowGroupMaxRowCount, closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toImmutableList()), orcTypes, fileStats, localUserMetadata);
    // everything needed from the closed stripes has been captured above
    closedStripes.clear();
    closedStripesRetainedBytes = 0;
    Slice footerSlice = metadataWriter.writeFooter(footer);
    outputData.add(createDataOutput(footerSlice));
    recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));
    // the postscript stores the lengths of the footer and metadata sections
    Slice postscriptSlice = metadataWriter.writePostscript(footerSlice.length(), metadataSlice.length(), compression, maxCompressionBufferSize);
    outputData.add(createDataOutput(postscriptSlice));
    // last output is the postscript length as one byte; checkedCast rejects a length that does not fit
    outputData.add(createDataOutput(Slices.wrappedBuffer(UnsignedBytes.checkedCast(postscriptSlice.length()))));
    return outputData;
}
Also used : Footer(io.prestosql.orc.metadata.Footer) CLOSED(io.prestosql.orc.OrcWriterStats.FlushReason.CLOSED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Slices(io.airlift.slice.Slices) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Map(java.util.Map) ColumnWriters.createColumnWriter(io.prestosql.orc.writer.ColumnWriters.createColumnWriter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Type(io.prestosql.spi.type.Type) Metadata(io.prestosql.orc.metadata.Metadata) ImmutableSet(com.google.common.collect.ImmutableSet) OrcMetadataWriter(io.prestosql.orc.metadata.OrcMetadataWriter) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) OrcWriteValidationMode(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode) MAX_BYTES(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_BYTES) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) StripeInformation(io.prestosql.orc.metadata.StripeInformation) List(java.util.List) ClassLayout(org.openjdk.jol.info.ClassLayout) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Entry(java.util.Map.Entry) Optional(java.util.Optional) FlushReason(io.prestosql.orc.OrcWriterStats.FlushReason) OrcWriteValidationBuilder(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationBuilder) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) MAX_ROWS(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_ROWS) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) 
Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnWriter(io.prestosql.orc.writer.ColumnWriter) ROOT_COLUMN(io.prestosql.orc.metadata.OrcColumnId.ROOT_COLUMN) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) Integer.min(java.lang.Integer.min) OrcReader.validateFile(io.prestosql.orc.OrcReader.validateFile) UnsignedBytes(com.google.common.primitives.UnsignedBytes) DICTIONARY_FULL(io.prestosql.orc.OrcWriterStats.FlushReason.DICTIONARY_FULL) SliceDictionaryColumnWriter(io.prestosql.orc.writer.SliceDictionaryColumnWriter) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) OrcDataOutput.createDataOutput(io.prestosql.orc.stream.OrcDataOutput.createDataOutput) Page(io.prestosql.spi.Page) IOException(java.io.IOException) DIRECT(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Stream(io.prestosql.orc.metadata.Stream) Consumer(java.util.function.Consumer) Collectors.toList(java.util.stream.Collectors.toList) Closeable(java.io.Closeable) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) CompressedMetadataWriter(io.prestosql.orc.metadata.CompressedMetadataWriter) Collections(java.util.Collections) MAGIC(io.prestosql.orc.metadata.PostScript.MAGIC) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Optional(java.util.Optional) ArrayList(java.util.ArrayList) Metadata(io.prestosql.orc.metadata.Metadata) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) IOException(java.io.IOException) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Footer(io.prestosql.orc.metadata.Footer) StripeFooter(io.prestosql.orc.metadata.StripeFooter)

Aggregations

StripeStatistics (io.prestosql.orc.metadata.statistics.StripeStatistics)4 Slice (io.airlift.slice.Slice)2 Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice)2 ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding)2 OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)2 Stream (io.prestosql.orc.metadata.Stream)2 StripeFooter (io.prestosql.orc.metadata.StripeFooter)2 StripeInformation (io.prestosql.orc.metadata.StripeInformation)2 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)2 OrcDataOutput (io.prestosql.orc.stream.OrcDataOutput)2 StreamDataOutput (io.prestosql.orc.stream.StreamDataOutput)2 ColumnWriter (io.prestosql.orc.writer.ColumnWriter)2 ColumnWriters.createColumnWriter (io.prestosql.orc.writer.ColumnWriters.createColumnWriter)2 SliceDictionaryColumnWriter (io.prestosql.orc.writer.SliceDictionaryColumnWriter)2 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 Verify.verify (com.google.common.base.Verify.verify)1 ImmutableList (com.google.common.collect.ImmutableList)1