Search in sources :

Example 11 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

the class ListColumnWriter method finishRowGroup.

/**
 * Closes out the current row group for this list column and its element writer.
 *
 * <p>A {@link ColumnStatistics} entry is created from the number of non-null values seen
 * since the previous row group, appended to the per-row-group history, and the counter is
 * reset. The returned map contains this column's entry plus every entry reported by the
 * element writer's own {@code finishRowGroup()}.
 *
 * @return immutable map from column id to the statistics of the row group just finished
 * @throws IllegalStateException if this writer has already been closed
 */
@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    // Only the value count is supplied for the list column itself; the remaining statistics slots are left null.
    ColumnStatistics listStatistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null);
    rowGroupColumnStatistics.add(listStatistics);
    nonNullValueCount = 0;

    return ImmutableMap.<OrcColumnId, ColumnStatistics>builder()
            .put(columnId, listStatistics)
            .putAll(elementWriter.finishRowGroup())
            .build();
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 12 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

the class OrcWriteValidation method validateRowGroupStatistics.

/**
 * Validates the row group statistics read back for one stripe against the statistics
 * recorded for that stripe offset.
 *
 * <p>Every stream must carry exactly as many row group entries as were recorded for the
 * stripe. Each row group is then checked according to its recorded validation mode: any
 * mode other than {@code HASHED} compares the column statistics column by column, and any
 * mode other than {@code DETAILED} compares the row group hash.
 *
 * @param orcDataSourceId identifier of the data source, used in error reporting
 * @param stripeOffset offset of the stripe whose row groups are being validated
 * @param actualRowGroupStatistics row group indexes read back, keyed by stream id
 * @throws NullPointerException if {@code actualRowGroupStatistics} is null
 * @throws OrcCorruptionException if no statistics were recorded for {@code stripeOffset},
 *         if the row group count or column set differs, or if a hash does not match
 */
public void validateRowGroupStatistics(OrcDataSourceId orcDataSourceId, long stripeOffset, Map<StreamId, List<RowGroupIndex>> actualRowGroupStatistics) throws OrcCorruptionException {
    requireNonNull(actualRowGroupStatistics, "actualRowGroupStatistics is null");
    List<RowGroupStatistics> expectedRowGroupStatistics = rowGroupStatistics.get(stripeOffset);
    if (expectedRowGroupStatistics == null) {
        throw new OrcCorruptionException(orcDataSourceId, "Unexpected stripe at offset %s", stripeOffset);
    }

    int rowGroupCount = expectedRowGroupStatistics.size();
    for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
        if (entry.getValue().size() != rowGroupCount) {
            throw new OrcCorruptionException(orcDataSourceId, "Unexpected row group count in stripe at offset %s", stripeOffset);
        }
    }

    // The set of columns present in the actual statistics does not depend on the row group, so build it once.
    Set<OrcColumnId> actualColumns = actualRowGroupStatistics.keySet().stream().map(StreamId::getColumnId).collect(Collectors.toSet());

    for (int rowGroupIndex = 0; rowGroupIndex < rowGroupCount; rowGroupIndex++) {
        RowGroupStatistics expectedRowGroup = expectedRowGroupStatistics.get(rowGroupIndex);
        if (expectedRowGroup.getValidationMode() != HASHED) {
            Map<OrcColumnId, ColumnStatistics> expectedStatistics = expectedRowGroup.getColumnStatistics();
            if (!expectedStatistics.keySet().equals(actualColumns)) {
                throw new OrcCorruptionException(orcDataSourceId, "Unexpected column in row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
            for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
                // Safe to index: every list was verified above to hold rowGroupCount entries.
                ColumnStatistics actual = entry.getValue().get(rowGroupIndex).getColumnStatistics();
                ColumnStatistics expected = expectedStatistics.get(entry.getKey().getColumnId());
                validateColumnStatisticsEquivalent(orcDataSourceId, "Row group " + rowGroupIndex + " in stripe at offset " + stripeOffset, actual, expected);
            }
        }
        if (expectedRowGroup.getValidationMode() != DETAILED) {
            RowGroupStatistics actualRowGroup = buildActualRowGroupStatistics(rowGroupIndex, actualRowGroupStatistics);
            if (expectedRowGroup.getHash() != actualRowGroup.getHash()) {
                throw new OrcCorruptionException(orcDataSourceId, "Checksum mismatch for row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
        }
    }
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList)

Example 13 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

the class OrcWriteValidation method validateColumnStatisticsEquivalent.

/**
 * Compares two column-indexed statistics collections, column by column.
 *
 * <p>The collections must have the same size. Columns are looked up by an {@link OrcColumnId}
 * built from each index in {@code [0, size)}, and each pair is handed to the per-column
 * {@code validateColumnStatisticsEquivalent} overload with a label of the form
 * {@code "<name> column <index>"}.
 *
 * @param orcDataSourceId identifier of the data source, used in error reporting
 * @param name label describing which statistics are being compared
 * @param actualColumnStatistics statistics read back
 * @param expectedColumnStatistics statistics to compare against
 * @throws NullPointerException if {@code name} or either statistics collection is null
 * @throws OrcCorruptionException if the sizes differ or a per-column comparison fails
 */
private static void validateColumnStatisticsEquivalent(OrcDataSourceId orcDataSourceId, String name, ColumnMetadata<ColumnStatistics> actualColumnStatistics, ColumnMetadata<ColumnStatistics> expectedColumnStatistics) throws OrcCorruptionException {
    requireNonNull(name, "name is null");
    requireNonNull(actualColumnStatistics, "actualColumnStatistics is null");
    requireNonNull(expectedColumnStatistics, "expectedColumnStatistics is null");

    final int columnCount = actualColumnStatistics.size();
    if (columnCount != expectedColumnStatistics.size()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of columns in %s statistics", name);
    }

    for (int ordinal = 0; ordinal < columnCount; ordinal++) {
        OrcColumnId id = new OrcColumnId(ordinal);
        String columnLabel = name + " column " + ordinal;
        validateColumnStatisticsEquivalent(orcDataSourceId, columnLabel, actualColumnStatistics.get(id), expectedColumnStatistics.get(id));
    }
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId)

Example 14 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

the class OrcWriter method toFileStats.

/**
 * Merges per-stripe column statistics into a single set of file-level statistics.
 *
 * <p>For each column index, the statistics of that column from every stripe are gathered
 * in stripe order and combined with {@code ColumnStatistics.mergeColumnStatistics}.
 *
 * @param stripes column statistics of each stripe; all entries must have the same size
 * @return the merged statistics, or {@link Optional#empty()} when {@code stripes} is empty
 * @throws IllegalArgumentException if the stripes do not all have the same number of columns
 */
private static Optional<ColumnMetadata<ColumnStatistics>> toFileStats(List<ColumnMetadata<ColumnStatistics>> stripes) {
    if (stripes.isEmpty()) {
        return Optional.empty();
    }

    // The first stripe fixes the expected width; every other stripe must agree with it.
    final int expectedColumns = stripes.get(0).size();
    boolean uniformWidth = stripes.stream().noneMatch(stripe -> stripe.size() != expectedColumns);
    checkArgument(uniformWidth);

    ImmutableList.Builder<ColumnStatistics> merged = ImmutableList.builder();
    for (int ordinal = 0; ordinal < expectedColumns; ordinal++) {
        OrcColumnId id = new OrcColumnId(ordinal);
        List<ColumnStatistics> perStripe = stripes.stream()
                .map(stripe -> stripe.get(id))
                .collect(toList());
        merged.add(ColumnStatistics.mergeColumnStatistics(perStripe));
    }
    return Optional.of(new ColumnMetadata<>(merged.build()));
}
Also used : Footer(io.prestosql.orc.metadata.Footer) CLOSED(io.prestosql.orc.OrcWriterStats.FlushReason.CLOSED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Slices(io.airlift.slice.Slices) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Map(java.util.Map) ColumnWriters.createColumnWriter(io.prestosql.orc.writer.ColumnWriters.createColumnWriter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Type(io.prestosql.spi.type.Type) Metadata(io.prestosql.orc.metadata.Metadata) ImmutableSet(com.google.common.collect.ImmutableSet) OrcMetadataWriter(io.prestosql.orc.metadata.OrcMetadataWriter) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) OrcWriteValidationMode(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode) MAX_BYTES(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_BYTES) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) StripeInformation(io.prestosql.orc.metadata.StripeInformation) List(java.util.List) ClassLayout(org.openjdk.jol.info.ClassLayout) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Entry(java.util.Map.Entry) Optional(java.util.Optional) FlushReason(io.prestosql.orc.OrcWriterStats.FlushReason) OrcWriteValidationBuilder(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationBuilder) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) MAX_ROWS(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_ROWS) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) 
Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnWriter(io.prestosql.orc.writer.ColumnWriter) ROOT_COLUMN(io.prestosql.orc.metadata.OrcColumnId.ROOT_COLUMN) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) Integer.min(java.lang.Integer.min) OrcReader.validateFile(io.prestosql.orc.OrcReader.validateFile) UnsignedBytes(com.google.common.primitives.UnsignedBytes) DICTIONARY_FULL(io.prestosql.orc.OrcWriterStats.FlushReason.DICTIONARY_FULL) SliceDictionaryColumnWriter(io.prestosql.orc.writer.SliceDictionaryColumnWriter) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) OrcDataOutput.createDataOutput(io.prestosql.orc.stream.OrcDataOutput.createDataOutput) Page(io.prestosql.spi.Page) IOException(java.io.IOException) DIRECT(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Stream(io.prestosql.orc.metadata.Stream) Consumer(java.util.function.Consumer) Collectors.toList(java.util.stream.Collectors.toList) Closeable(java.io.Closeable) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) CompressedMetadataWriter(io.prestosql.orc.metadata.CompressedMetadataWriter) Collections(java.util.Collections) MAGIC(io.prestosql.orc.metadata.PostScript.MAGIC) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList)

Example 15 with OrcColumnId

use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.

the class OrcWriter method bufferStripeData.

/**
 * Collect the data for the stripe.  This is not the actual data, but
 * instead are functions that know how to write the data.
 *
 * <p>The returned outputs are ordered as: index streams, then data streams, then the
 * stripe footer. As a side effect the column writers are closed, the stripe is appended
 * to {@code closedStripes}, and validation and writer stats are updated.
 *
 * @param stripeStartOffset offset recorded in the stripe information and stripe statistics validation
 * @param flushReason why the stripe is being flushed; an empty stripe is only accepted for {@code CLOSED}
 * @return the outputs to write for this stripe, in write order; empty when the stripe has no rows
 * @throws IOException declared by this method; NOTE(review): presumably raised by the stream/metadata writers — confirm
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // column writers must be closed or the reset call will fail
        columnWriters.forEach(ColumnWriter::close);
        return ImmutableList.of();
    }
    // flush the partially filled row group, if any, before closing the writers
    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }
    // convert any dictionary encoded column with a low compression ratio to direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);
    columnWriters.forEach(ColumnWriter::close);
    // outputData is what gets written; allStreams is the matching stream list recorded in the stripe footer
    List<OrcDataOutput> outputData = new ArrayList<>();
    List<Stream> allStreams = new ArrayList<>(columnWriters.size() * 3);
    // get index streams
    long indexLength = 0;
    for (ColumnWriter columnWriter : columnWriters) {
        for (StreamDataOutput indexStream : columnWriter.getIndexStreams(metadataWriter)) {
            // The ordering is critical because each stream only contains a length with no offset.
            outputData.add(indexStream);
            allStreams.add(indexStream.getStream());
            indexLength += indexStream.size();
        }
    }
    // data streams (sorted by size)
    long dataLength = 0;
    List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter columnWriter : columnWriters) {
        List<StreamDataOutput> streams = columnWriter.getDataStreams();
        dataStreams.addAll(streams);
        dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
    }
    // uses StreamDataOutput's natural ordering
    Collections.sort(dataStreams);
    // add data streams
    for (StreamDataOutput dataStream : dataStreams) {
        // The ordering is critical because each stream only contains a length with no offset.
        outputData.add(dataStream);
        allStreams.add(dataStream.getStream());
    }
    // gather the encodings and stripe-level statistics reported by every column writer
    Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
    Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
    // the 0th column is a struct column for the whole row
    columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null));
    // add footer
    StripeFooter stripeFooter = new StripeFooter(allStreams, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    outputData.add(createDataOutput(footer));
    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length());
    // remember the closed stripe and account for the memory it retains
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
    return outputData;
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) ColumnWriters.createColumnWriter(io.prestosql.orc.writer.ColumnWriters.createColumnWriter) ColumnWriter(io.prestosql.orc.writer.ColumnWriter) SliceDictionaryColumnWriter(io.prestosql.orc.writer.SliceDictionaryColumnWriter) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Stream(io.prestosql.orc.metadata.Stream) StripeInformation(io.prestosql.orc.metadata.StripeInformation)

Aggregations

OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)23 Stream (io.prestosql.orc.metadata.Stream)9 ImmutableMap (com.google.common.collect.ImmutableMap)8 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)7 Test (org.testng.annotations.Test)7 ArrayList (java.util.ArrayList)6 List (java.util.List)6 Slice (io.airlift.slice.Slice)5 ImmutableList (com.google.common.collect.ImmutableList)4 CompressionKind (io.prestosql.orc.metadata.CompressionKind)4 ColumnReader (io.prestosql.orc.reader.ColumnReader)4 DateColumnReader (io.prestosql.orc.reader.DateColumnReader)4 IntegerColumnReader (io.prestosql.orc.reader.IntegerColumnReader)4 LongColumnReader (io.prestosql.orc.reader.LongColumnReader)4 ShortColumnReader (io.prestosql.orc.reader.ShortColumnReader)4 OrcInputStream (io.prestosql.orc.stream.OrcInputStream)4 InputStream (java.io.InputStream)4 Map (java.util.Map)4 ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding)3 StripeFooter (io.prestosql.orc.metadata.StripeFooter)3