Search in sources :

Example 1 with ValueInputStream

Use of com.facebook.presto.orc.stream.ValueInputStream in project presto by prestodb.

In the class StripeReader, method readStripe.

/**
 * Reads one stripe and returns it as a {@link Stripe} made of one or more row groups.
 *
 * <p>The stripe footer is read first; its streams (plus the streams of every encryption group
 * when {@code decryptors} is present) are collected and filtered down to the included streams.
 * Two paths follow:
 * <ul>
 *   <li>If the stripe has more rows than {@code rowsInRowGroup}, or a row group dictionary was
 *       detected, the row indexes are read, row groups are selected, and one {@code RowGroup}
 *       per selected group is built from the checkpoints.</li>
 *   <li>Otherwise, or if the first path hit an {@code InvalidCheckpointException} while no row
 *       group dictionary is present, the whole stripe is exposed as a single row group.</li>
 * </ul>
 *
 * @param stripe location and row count of the stripe to read
 * @param systemMemoryUsage memory context handed to the footer/encryption-group/disk-range
 *        readers; it is closed here when every row group is skipped
 * @param decryptors when present, the encryption groups listed by it are read and their streams
 *        and column encodings are merged with those of the stripe footer
 * @param sharedDecompressionBuffer buffer passed through to {@code readDiskRanges}
 * @return the stripe, or {@code null} when row group selection leaves no row group to read
 * @throws IOException declared for the underlying reads
 * @throws OrcCorruptionException if the checkpoints are invalid and a row group dictionary is in use
 */
public Stripe readStripe(StripeInformation stripe, OrcAggregatedMemoryContext systemMemoryUsage, Optional<DwrfEncryptionInfo> decryptors, SharedBuffer sharedDecompressionBuffer) throws IOException {
    StripeId stripeId = new StripeId(orcDataSource.getId(), stripe.getOffset());
    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripeId, stripe, systemMemoryUsage);
    // get streams for selected columns
    // allStreams keeps one list per source (footer first, then one per encryption group)
    List<List<Stream>> allStreams = new ArrayList<>();
    allStreams.add(stripeFooter.getStreams());
    Map<StreamId, Stream> includedStreams = new HashMap<>();
    boolean hasRowGroupDictionary = addIncludedStreams(stripeFooter.getColumnEncodings(), stripeFooter.getStreams(), includedStreams);
    // mutable copy of the footer encodings so that encryption-group encodings can be merged in
    Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
    Map<Integer, ColumnEncoding> stripeFooterEncodings = stripeFooter.getColumnEncodings();
    columnEncodings.putAll(stripeFooterEncodings);
    // included columns may be encrypted
    if (decryptors.isPresent()) {
        List<Slice> encryptedEncryptionGroups = stripeFooter.getStripeEncryptionGroups();
        for (Integer groupId : decryptors.get().getEncryptorGroupIds()) {
            StripeEncryptionGroup stripeEncryptionGroup = getStripeEncryptionGroup(decryptors.get().getEncryptorByGroupId(groupId), encryptedEncryptionGroups.get(groupId), dwrfEncryptionGroupColumns.get(groupId), systemMemoryUsage);
            allStreams.add(stripeEncryptionGroup.getStreams());
            columnEncodings.putAll(stripeEncryptionGroup.getColumnEncodings());
            boolean encryptedHasRowGroupDictionary = addIncludedStreams(stripeEncryptionGroup.getColumnEncodings(), stripeEncryptionGroup.getStreams(), includedStreams);
            // a row group dictionary in any encryption group counts for the whole stripe
            hasRowGroupDictionary = encryptedHasRowGroupDictionary || hasRowGroupDictionary;
        }
    }
    // handle stripes with more than one row group or a dictionary
    // set to true only when the multi-row-group path below fails on its checkpoints and falls through
    boolean invalidCheckPoint = false;
    if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(allStreams);
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(includedStreams.keySet()));
        // read the file regions
        Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);
        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(includedStreams, streamsData, stripeId);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }
        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            systemMemoryUsage.close();
            return null;
        }
        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);
        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);
        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), includedStreams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
        } catch (InvalidCheckpointException e) {
            // we must fail because the length of the row group dictionary is contained in the checkpoint stream.
            if (hasRowGroupDictionary) {
                throw new OrcCorruptionException(e, orcDataSource.getId(), "Checkpoints are corrupt");
            }
            // no row group dictionary: fall through and read the stripe as one row group instead
            invalidCheckPoint = true;
        }
    }
    // reached when the stripe has one row group and no dictionary, or when the checkpoints
    // above were invalid (invalidCheckPoint == true); the disk ranges are read again here
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(allStreams).entrySet()) {
        StreamId streamId = entry.getKey();
        if (includedStreams.keySet().contains(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();
    // read the file regions
    Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);
    // sum the value sizes reported by the row index statistics of every included ROW_INDEX stream
    long totalBytes = 0;
    for (Entry<StreamId, Stream> entry : includedStreams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, streamsData.get(entry.getKey()), null);
            // more than one index entry is only tolerated on the invalid-checkpoint fall-through
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                // the total size is only added when the statistics report a min average value size
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getTotalValueSizeInBytes();
                }
            }
        }
    }
    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);
    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);
    // build the row group
    // without checkpoints every value stream is wrapped as-is in a ValueInputStreamSource
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    // single row group: id 0, row offset 0, spanning all rows of the stripe
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), totalBytes, new InputStreamSources(builder.build()));
    return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
Also used : ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) InvalidCheckpointException(com.facebook.presto.orc.checkpoint.InvalidCheckpointException) InputStreamSource(com.facebook.presto.orc.stream.InputStreamSource) ValueInputStreamSource(com.facebook.presto.orc.stream.ValueInputStreamSource) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) ColumnStatistics.mergeColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) ImmutableMap(com.google.common.collect.ImmutableMap) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) InputStreamSources(com.facebook.presto.orc.stream.InputStreamSources) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) Slice(io.airlift.slice.Slice) StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) DwrfMetadataReader.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataReader.toStripeEncryptionGroup)

Example 2 with ValueInputStream

Use of com.facebook.presto.orc.stream.ValueInputStream in project presto by prestodb.

In the class StripeReader, method createValueStreams.

/**
 * Creates a value stream for every stream that is neither an index stream nor empty.
 *
 * @param streams the streams to consider, keyed by stream id
 * @param streamsData the input stream backing each stream id
 * @param columnEncodings column encodings keyed by column; every stream's column must be present
 * @return an immutable map from stream id to the value stream created for it
 */
private Map<StreamId, ValueInputStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, Map<Integer, ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, ValueInputStream<?>> result = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        Stream stream = streamEntry.getValue();
        // the encoding is resolved for every stream, including those skipped below
        ColumnEncoding encoding = columnEncodings.get(stream.getColumn());
        ColumnEncodingKind encodingKind = encoding.getColumnEncoding(stream.getSequence()).getColumnEncodingKind();
        // only non-index streams that actually carry bytes get a value stream
        if (!isIndexStream(stream) && stream.getLength() != 0) {
            StreamId id = streamEntry.getKey();
            OrcInputStream data = streamsData.get(id);
            OrcTypeKind typeKind = types.get(stream.getColumn()).getOrcTypeKind();
            result.put(id, ValueStreams.createValueStreams(id, data, typeKind, encodingKind, stream.isUseVInts()));
        }
    }
    return result.build();
}
Also used : ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream) OrcTypeKind(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind) ImmutableMap(com.google.common.collect.ImmutableMap) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind)

Example 3 with ValueInputStream

Use of com.facebook.presto.orc.stream.ValueInputStream in project presto by prestodb.

In the class StripeReader, method createDictionaryStreamSources.

/**
 * Builds the stream sources for the dictionary streams among {@code streams}.
 *
 * <p>A stream contributes a source only when {@code isDictionary} accepts it for its column
 * encoding kind and a value stream exists for it in {@code valueStreams}.
 *
 * @param streams the streams to consider, keyed by stream id
 * @param valueStreams the value streams previously created; streams missing here are skipped
 * @param columnEncodings column encodings keyed by column; every stream's column must be present
 * @return the dictionary stream sources, each positioned via its dictionary stream checkpoint
 */
public InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, Map<Integer, ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> sources = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        Stream stream = streamEntry.getValue();
        int column = stream.getColumn();
        ColumnEncodingKind encodingKind = columnEncodings.get(column).getColumnEncoding(stream.getSequence()).getColumnEncodingKind();
        // dictionary streams only
        if (isDictionary(stream, encodingKind)) {
            StreamId id = streamEntry.getKey();
            ValueInputStream<?> dictionaryData = valueStreams.get(id);
            // a missing value stream means the stream has no data
            if (dictionaryData != null) {
                OrcTypeKind typeKind = types.get(stream.getColumn()).getOrcTypeKind();
                StreamCheckpoint checkpoint = getDictionaryStreamCheckpoint(id, typeKind, encodingKind);
                sources.put(id, createCheckpointStreamSource(dictionaryData, checkpoint));
            }
        }
    }
    return new InputStreamSources(sources.build());
}
Also used : OrcTypeKind(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind) ImmutableMap(com.google.common.collect.ImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) InputStreamSource(com.facebook.presto.orc.stream.InputStreamSource) ValueInputStreamSource(com.facebook.presto.orc.stream.ValueInputStreamSource) InputStreamSources(com.facebook.presto.orc.stream.InputStreamSources) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint)

Example 4 with ValueInputStream

Use of com.facebook.presto.orc.stream.ValueInputStream in project presto by prestodb.

In the class StripeReader, method createRowGroup.

/**
 * Assembles a single row group of a stripe from its checkpoints.
 *
 * @param groupId index of the row group; also used to pick the entry from each column's index list
 * @param rowsInStripe total number of rows in the stripe
 * @param rowsInRowGroup nominal number of rows per row group
 * @param columnIndexes row group indexes per stream; their total value sizes are summed
 * @param valueStreams value streams by stream id; checkpoints without a value stream are skipped
 * @param checkpoints the checkpoint of each stream for this row group
 * @return the row group with its row offset, row count, byte total and checkpointed stream sources
 */
@VisibleForTesting
static RowGroup createRowGroup(int groupId, long rowsInStripe, long rowsInRowGroup, Map<StreamId, List<RowGroupIndex>> columnIndexes, Map<StreamId, ValueInputStream<?>> valueStreams, Map<StreamId, StreamCheckpoint> checkpoints) {
    // add up the total value size reported for this group by every column index
    long totalRowGroupBytes = 0;
    for (List<RowGroupIndex> indexes : columnIndexes.values()) {
        totalRowGroupBytes += indexes.get(groupId).getColumnStatistics().getTotalValueSizeInBytes();
    }

    long rowOffset = multiplyExact(groupId, rowsInRowGroup);
    // the last row group of a stripe may hold fewer rows than the nominal size
    long rowsRemaining = rowsInStripe - rowOffset;
    int rowCount = toIntExact(Math.min(rowsRemaining, rowsInRowGroup));

    ImmutableMap.Builder<StreamId, InputStreamSource<?>> sources = ImmutableMap.builder();
    for (Entry<StreamId, StreamCheckpoint> checkpointEntry : checkpoints.entrySet()) {
        StreamId id = checkpointEntry.getKey();
        ValueInputStream<?> data = valueStreams.get(id);
        // streams without data have no value stream and get no source
        if (data != null) {
            sources.put(id, createCheckpointStreamSource(data, checkpointEntry.getValue()));
        }
    }
    return new RowGroup(groupId, rowOffset, rowCount, totalRowGroupBytes, new InputStreamSources(sources.build()));
}
Also used : StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) OrcTypeKind(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) INDEX(com.facebook.presto.orc.metadata.Stream.StreamArea.INDEX) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) StreamKind(com.facebook.presto.orc.metadata.Stream.StreamKind) InvalidCheckpointException(com.facebook.presto.orc.checkpoint.InvalidCheckpointException) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) InputStreamSource(com.facebook.presto.orc.stream.InputStreamSource) Collection(java.util.Collection) Set(java.util.Set) CheckpointInputStreamSource.createCheckpointStreamSource(com.facebook.presto.orc.stream.CheckpointInputStreamSource.createCheckpointStreamSource) ValueInputStreamSource(com.facebook.presto.orc.stream.ValueInputStreamSource) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) List(java.util.List) Entry(java.util.Map.Entry) InputStreamSources(com.facebook.presto.orc.stream.InputStreamSources) Optional(java.util.Optional) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) DICTIONARY(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY) DwrfSequenceEncoding(com.facebook.presto.orc.metadata.DwrfSequenceEncoding) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) SortedMap(java.util.SortedMap) DICTIONARY_DATA(com.facebook.presto.orc.metadata.Stream.StreamKind.DICTIONARY_DATA) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) 
ROW_INDEX(com.facebook.presto.orc.metadata.Stream.StreamKind.ROW_INDEX) Slice(io.airlift.slice.Slice) LENGTH(com.facebook.presto.orc.metadata.Stream.StreamKind.LENGTH) HashMap(java.util.HashMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) DwrfMetadataReader.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataReader.toStripeEncryptionGroup) Multimap(com.google.common.collect.Multimap) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) HiveWriterVersion(com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicates(com.google.common.base.Predicates) OrcType(com.facebook.presto.orc.metadata.OrcType) Math.toIntExact(java.lang.Math.toIntExact) ImmutableMultimap(com.google.common.collect.ImmutableMultimap) ColumnStatistics.mergeColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics) BLOOM_FILTER(com.facebook.presto.orc.metadata.Stream.StreamKind.BLOOM_FILTER) NOOP_ORC_LOCAL_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcLocalMemoryContext.NOOP_ORC_LOCAL_MEMORY_CONTEXT) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind) DICTIONARY_V2(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2) STRUCT(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.STRUCT) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) IOException(java.io.IOException) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) Maps(com.google.common.collect.Maps) Stream(com.facebook.presto.orc.metadata.Stream) Math.multiplyExact(java.lang.Math.multiplyExact) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) 
Checkpoints.getStreamCheckpoints(com.facebook.presto.orc.checkpoint.Checkpoints.getStreamCheckpoints) ValueStreams(com.facebook.presto.orc.stream.ValueStreams) VisibleForTesting(com.google.common.annotations.VisibleForTesting) MetadataReader(com.facebook.presto.orc.metadata.MetadataReader) InputStream(java.io.InputStream) InputStreamSource(com.facebook.presto.orc.stream.InputStreamSource) ValueInputStreamSource(com.facebook.presto.orc.stream.ValueInputStreamSource) InputStreamSources(com.facebook.presto.orc.stream.InputStreamSources) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) ImmutableMap(com.google.common.collect.ImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

Stream (com.facebook.presto.orc.metadata.Stream)4 OrcInputStream (com.facebook.presto.orc.stream.OrcInputStream)4 ValueInputStream (com.facebook.presto.orc.stream.ValueInputStream)4 ImmutableMap (com.google.common.collect.ImmutableMap)4 InputStream (java.io.InputStream)4 ColumnEncodingKind (com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind)3 OrcTypeKind (com.facebook.presto.orc.metadata.OrcType.OrcTypeKind)3 InputStreamSource (com.facebook.presto.orc.stream.InputStreamSource)3 InputStreamSources (com.facebook.presto.orc.stream.InputStreamSources)3 ValueInputStreamSource (com.facebook.presto.orc.stream.ValueInputStreamSource)3 Checkpoints.getDictionaryStreamCheckpoint (com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint)2 InvalidCheckpointException (com.facebook.presto.orc.checkpoint.InvalidCheckpointException)2 StreamCheckpoint (com.facebook.presto.orc.checkpoint.StreamCheckpoint)2 ColumnEncoding (com.facebook.presto.orc.metadata.ColumnEncoding)2 DwrfMetadataReader.toStripeEncryptionGroup (com.facebook.presto.orc.metadata.DwrfMetadataReader.toStripeEncryptionGroup)2 RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex)2 StripeEncryptionGroup (com.facebook.presto.orc.metadata.StripeEncryptionGroup)2 StripeFooter (com.facebook.presto.orc.metadata.StripeFooter)2 ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics)2 ColumnStatistics.mergeColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics)2