Search in sources :

Example 1 with StreamSource

use of com.facebook.presto.orc.stream.StreamSource in project presto by prestodb.

the class StripeReader method createDictionaryStreamSources.

public StreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueStream<?>> valueStreams, List<ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, StreamSource<?>> dictionaryStreamBuilder = ImmutableMap.builder();
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        StreamId streamId = entry.getKey();
        Stream stream = entry.getValue();
        int column = stream.getColumn();
        // only process dictionary streams
        ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind();
        if (!isDictionary(stream, columnEncoding)) {
            continue;
        }
        // skip streams without data
        ValueStream<?> valueStream = valueStreams.get(streamId);
        if (valueStream == null) {
            continue;
        }
        OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind();
        StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding);
        StreamSource<?> streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint);
        dictionaryStreamBuilder.put(streamId, streamSource);
    }
    return new StreamSources(dictionaryStreamBuilder.build());
}
Also used : StreamSource(com.facebook.presto.orc.stream.StreamSource) CheckpointStreamSource.createCheckpointStreamSource(com.facebook.presto.orc.stream.CheckpointStreamSource.createCheckpointStreamSource) ValueStreamSource(com.facebook.presto.orc.stream.ValueStreamSource) OrcTypeKind(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind) ImmutableMap(com.google.common.collect.ImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) StreamSources(com.facebook.presto.orc.stream.StreamSources) ValueStream(com.facebook.presto.orc.stream.ValueStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint)

Example 2 with StreamSource

use of com.facebook.presto.orc.stream.StreamSource in project presto by prestodb.

the class StripeReader method readStripe.

public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext systemMemoryUsage) throws IOException {
    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
    List<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
    // get streams for selected columns
    Map<StreamId, Stream> streams = new HashMap<>();
    boolean hasRowGroupDictionary = false;
    for (Stream stream : stripeFooter.getStreams()) {
        if (includedOrcColumns.contains(stream.getColumn())) {
            streams.put(new StreamId(stream), stream);
            ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind();
            if (columnEncoding == DICTIONARY && stream.getStreamKind() == StreamKind.IN_DICTIONARY) {
                hasRowGroupDictionary = true;
            }
        }
    }
    // handle stripes with more than one row group or a dictionary
    if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));
        // read the file regions
        Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
        // read the bloom filter for each column
        Map<Integer, List<HiveBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);
        // read the row index for each column
        Map<Integer, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes);
        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            systemMemoryUsage.close();
            return null;
        }
        // value streams
        Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
        // build the dictionary streams
        StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
        } catch (InvalidCheckpointException e) {
            // we must fail because the length of the row group dictionary is contained in the checkpoint stream.
            if (hasRowGroupDictionary) {
                throw new OrcCorruptionException(e, "ORC file %s has corrupt checkpoints", orcDataSource);
            }
        }
    }
    // stripe only has one row group and no dictionary
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
        StreamId streamId = entry.getKey();
        if (streamId.getStreamKind() != ROW_INDEX && streams.keySet().contains(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();
    // read the file regions
    Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
    // value streams
    Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
    // build the dictionary streams
    StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
    // build the row group
    ImmutableMap.Builder<StreamId, StreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), new StreamSources(builder.build()));
    return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
Also used : HashMap(java.util.HashMap) InvalidCheckpointException(com.facebook.presto.orc.checkpoint.InvalidCheckpointException) ValueStream(com.facebook.presto.orc.stream.ValueStream) ValueStream(com.facebook.presto.orc.stream.ValueStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) StreamSource(com.facebook.presto.orc.stream.StreamSource) CheckpointStreamSource.createCheckpointStreamSource(com.facebook.presto.orc.stream.CheckpointStreamSource.createCheckpointStreamSource) ValueStreamSource(com.facebook.presto.orc.stream.ValueStreamSource) ImmutableMap(com.google.common.collect.ImmutableMap) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) StreamSources(com.facebook.presto.orc.stream.StreamSources)

Example 3 with StreamSource

use of com.facebook.presto.orc.stream.StreamSource in project presto by prestodb.

the class StripeReader method createRowGroup.

public static RowGroup createRowGroup(int groupId, int rowOffset, int rowCount, Map<StreamId, ValueStream<?>> valueStreams, Map<StreamId, StreamCheckpoint> checkpoints) {
    ImmutableMap.Builder<StreamId, StreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, StreamCheckpoint> entry : checkpoints.entrySet()) {
        StreamId streamId = entry.getKey();
        StreamCheckpoint checkpoint = entry.getValue();
        // skip streams without data
        ValueStream<?> valueStream = valueStreams.get(streamId);
        if (valueStream == null) {
            continue;
        }
        builder.put(streamId, createCheckpointStreamSource(valueStream, checkpoint));
    }
    StreamSources rowGroupStreams = new StreamSources(builder.build());
    return new RowGroup(groupId, rowOffset, rowCount, rowGroupStreams);
}
Also used : StreamSources(com.facebook.presto.orc.stream.StreamSources) StreamSource(com.facebook.presto.orc.stream.StreamSource) CheckpointStreamSource.createCheckpointStreamSource(com.facebook.presto.orc.stream.CheckpointStreamSource.createCheckpointStreamSource) ValueStreamSource(com.facebook.presto.orc.stream.ValueStreamSource) ImmutableMap(com.google.common.collect.ImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint)

Aggregations

CheckpointStreamSource.createCheckpointStreamSource (com.facebook.presto.orc.stream.CheckpointStreamSource.createCheckpointStreamSource)3 StreamSource (com.facebook.presto.orc.stream.StreamSource)3 StreamSources (com.facebook.presto.orc.stream.StreamSources)3 ValueStreamSource (com.facebook.presto.orc.stream.ValueStreamSource)3 ImmutableMap (com.google.common.collect.ImmutableMap)3 Checkpoints.getDictionaryStreamCheckpoint (com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint)2 StreamCheckpoint (com.facebook.presto.orc.checkpoint.StreamCheckpoint)2 ColumnEncodingKind (com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind)2 Stream (com.facebook.presto.orc.metadata.Stream)2 OrcInputStream (com.facebook.presto.orc.stream.OrcInputStream)2 ValueStream (com.facebook.presto.orc.stream.ValueStream)2 InputStream (java.io.InputStream)2 InvalidCheckpointException (com.facebook.presto.orc.checkpoint.InvalidCheckpointException)1 ColumnEncoding (com.facebook.presto.orc.metadata.ColumnEncoding)1 OrcTypeKind (com.facebook.presto.orc.metadata.OrcType.OrcTypeKind)1 StripeFooter (com.facebook.presto.orc.metadata.StripeFooter)1 ImmutableList (com.google.common.collect.ImmutableList)1 HashMap (java.util.HashMap)1 List (java.util.List)1