Search in sources:

Example 1 with Stream

use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.

From the class StripeReader, method getDiskRanges.

/**
 * Computes a disk range for each stream, treating the streams as laid out
 * back to back in list order with the first stream starting at offset 0.
 *
 * @param streams the streams, in the order they are laid out
 * @return a map from stream id to its range; streams whose length is not
 *         positive have no entry
 * @throws ArithmeticException if a stream length does not fit in an {@code int}
 */
private static Map<StreamId, DiskRange> getDiskRanges(List<Stream> streams) {
    ImmutableMap.Builder<StreamId, DiskRange> rangesByStream = ImmutableMap.builder();
    long nextOffset = 0;
    for (Stream current : streams) {
        int length = toIntExact(current.getLength());
        long rangeStart = nextOffset;
        // the running offset advances for every stream, including skipped ones
        nextOffset += length;
        // zero byte streams occupy no range and are left out of the map
        if (length > 0) {
            rangesByStream.put(new StreamId(current), new DiskRange(rangeStart, length));
        }
    }
    return rangesByStream.build();
}
Also used : OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint)

Example 2 with Stream

use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.

From the class StripeReader, method readColumnIndexes.

/**
 * Reads the row group indexes of every {@code ROW_INDEX} stream and, where
 * bloom filters are available for the stream's column, attaches them to the
 * column statistics of each row group.
 *
 * <p>When the row index cache is enabled the indexes are obtained through the
 * cache; if the cache load fails, the failure is passed to
 * {@code handleCacheLoadException}, logged at debug level, and the indexes
 * are read directly instead.
 *
 * @param streams all streams, keyed by stream id
 * @param streamsData chunk loaders for the streams, keyed by stream id
 * @param bloomFilterIndexes bloom filters keyed by column id; a column may be absent
 * @param stripe the stripe whose offset is used as part of the cache key
 * @return row group indexes for each {@code ROW_INDEX} stream
 * @throws IOException declared for the underlying metadata reads
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, Map<OrcColumnId, List<HashableBloomFilter>> bloomFilterIndexes, StripeInformation stripe) throws IOException {
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> result = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        // only row index streams are of interest here
        if (streamEntry.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }
        StreamId streamId = streamEntry.getKey();
        OrcInputStream indexInput = new OrcInputStream(streamsData.get(streamId));
        List<HashableBloomFilter> columnBloomFilters = bloomFilterIndexes.get(streamId.getColumnId());

        List<RowGroupIndex> indexes;
        if (!orcCacheProperties.isRowIndexCacheEnabled()) {
            indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
        } else {
            // the cache key is built from data source id + last modified time, stripe offset and stream id
            OrcRowIndexCacheKey cacheKey = new OrcRowIndexCacheKey();
            cacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
            cacheKey.setStripeOffset(stripe.getOffset());
            cacheKey.setStreamId(streamId);
            try {
                indexes = orcCacheStore.getRowIndexCache().get(cacheKey, () -> metadataReader.readRowIndexes(hiveWriterVersion, indexInput));
            } catch (UncheckedExecutionException | ExecutionException loadFailure) {
                handleCacheLoadException(loadFailure);
                log.debug(loadFailure.getCause(), "Error while caching row group indexes. Falling back to default flow");
                indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
            }
        }

        if (columnBloomFilters != null && !columnBloomFilters.isEmpty()) {
            // pair the n-th bloom filter with the n-th row group index
            ImmutableList.Builder<RowGroupIndex> withBloomFilters = ImmutableList.builder();
            for (int group = 0; group < indexes.size(); group++) {
                RowGroupIndex original = indexes.get(group);
                ColumnStatistics enriched = original.getColumnStatistics().withBloomFilter(columnBloomFilters.get(group));
                withBloomFilters.add(new RowGroupIndex(original.getPositions(), enriched));
            }
            indexes = withBloomFilters.build();
        }
        result.put(streamId, indexes);
    }
    return result.build();
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ImmutableList(com.google.common.collect.ImmutableList) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ExecutionException(java.util.concurrent.ExecutionException)

Example 3 with Stream

use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.

From the class StripeReader, method readBloomFilterIndexes.

/**
 * Reads the bloom filters of the stripe's bloom filter streams, keyed by column id.
 *
 * <p>{@code BLOOM_FILTER_UTF8} streams are processed first. A {@code BLOOM_FILTER}
 * stream is only read when no bloom filters have been recorded for its column yet,
 * so the UTF8 variant takes precedence when both are present.
 *
 * @param streams all streams, keyed by stream id
 * @param streamsData chunk loaders for the streams, keyed by stream id
 * @param stripe the stripe whose offset is used as part of the cache key
 * @return an immutable map from column id to that column's bloom filters
 * @throws IOException declared for the underlying metadata reads
 */
private Map<OrcColumnId, List<HashableBloomFilter>> readBloomFilterIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, StripeInformation stripe) throws IOException {
    HashMap<OrcColumnId, List<HashableBloomFilter>> bloomFilters = new HashMap<>();
    // first pass: UTF8 bloom filters
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        Stream stream = entry.getValue();
        if (stream.getStreamKind() == BLOOM_FILTER_UTF8) {
            bloomFilters.put(stream.getColumnId(), readBloomFiltersForStream(entry.getKey(), streamsData.get(entry.getKey()), stripe));
        }
    }
    // second pass: plain bloom filters, only for columns not already covered above
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        Stream stream = entry.getValue();
        if (stream.getStreamKind() == BLOOM_FILTER && !bloomFilters.containsKey(stream.getColumnId())) {
            bloomFilters.put(entry.getKey().getColumnId(), readBloomFiltersForStream(entry.getKey(), streamsData.get(entry.getKey()), stripe));
        }
    }
    return ImmutableMap.copyOf(bloomFilters);
}

/**
 * Reads the bloom filters of a single stream, going through the bloom filter
 * cache when it is enabled.
 *
 * <p>If the cache load fails, the failure is passed to
 * {@code handleCacheLoadException}, logged at debug level, and the bloom
 * filters are read directly instead.
 */
private List<HashableBloomFilter> readBloomFiltersForStream(StreamId streamId, OrcChunkLoader chunkLoader, StripeInformation stripe) throws IOException {
    OrcInputStream inputStream = new OrcInputStream(chunkLoader);
    if (!orcCacheProperties.isBloomFilterCacheEnabled()) {
        return metadataReader.readBloomFilterIndexes(inputStream);
    }
    // the cache key is built from data source id + last modified time, stripe offset and stream id
    OrcBloomFilterCacheKey bloomFilterCacheKey = new OrcBloomFilterCacheKey();
    bloomFilterCacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
    bloomFilterCacheKey.setStripeOffset(stripe.getOffset());
    bloomFilterCacheKey.setStreamId(streamId);
    try {
        return orcCacheStore.getBloomFiltersCache().get(bloomFilterCacheKey, () -> metadataReader.readBloomFilterIndexes(inputStream));
    } catch (UncheckedExecutionException | ExecutionException executionException) {
        handleCacheLoadException(executionException);
        log.debug(executionException.getCause(), "Error while caching bloom filters. Falling back to default flow");
        return metadataReader.readBloomFilterIndexes(inputStream);
    }
}
Also used : OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) HashMap(java.util.HashMap) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ExecutionException(java.util.concurrent.ExecutionException)

Example 4 with Stream

use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.

From the class StripeReader, method getRowGroupStatistics.

/**
 * Collects the column statistics of one row group for every column in {@code types}.
 *
 * @param types the column types; one result slot is produced per type
 * @param columnIndexes row group indexes keyed by stream id
 * @param rowGroup zero-based row group whose statistics are wanted
 * @return per-column statistics for the row group, with {@code null} in the
 *         slot of any column that has no row group indexes
 * @throws NullPointerException if {@code columnIndexes} is null
 * @throws IllegalArgumentException if {@code rowGroup} is negative
 */
private static ColumnMetadata<ColumnStatistics> getRowGroupStatistics(ColumnMetadata<OrcType> types, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) {
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    // re-key the indexes by the numeric column id so they can be looked up by position
    Map<Integer, List<RowGroupIndex>> indexesByColumnId = columnIndexes.entrySet().stream()
            .collect(toImmutableMap(indexEntry -> indexEntry.getKey().getColumnId().getId(), Entry::getValue));

    int columnCount = types.size();
    List<ColumnStatistics> result = new ArrayList<>(columnCount);
    for (int column = 0; column < columnCount; column++) {
        List<RowGroupIndex> indexes = indexesByColumnId.get(column);
        // columns without indexes keep their slot, filled with null
        result.add(indexes == null ? null : indexes.get(rowGroup).getColumnStatistics());
    }
    return new ColumnMetadata<>(result);
}
Also used : CheckpointInputStreamSource.createCheckpointStreamSource(io.prestosql.orc.stream.CheckpointInputStreamSource.createCheckpointStreamSource) OrcDataReader(io.prestosql.orc.stream.OrcDataReader) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ValueInputStreamSource(io.prestosql.orc.stream.ValueInputStreamSource) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Map(java.util.Map) AggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ImmutableSet(com.google.common.collect.ImmutableSet) OrcTypeKind(io.prestosql.orc.metadata.OrcType.OrcTypeKind) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) HiveWriterVersion(io.prestosql.orc.metadata.PostScript.HiveWriterVersion) Set(java.util.Set) DICTIONARY_DATA(io.prestosql.orc.metadata.Stream.StreamKind.DICTIONARY_DATA) Checkpoints.getStreamCheckpoints(io.prestosql.orc.checkpoint.Checkpoints.getStreamCheckpoints) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) MetadataReader(io.prestosql.orc.metadata.MetadataReader) StripeInformation(io.prestosql.orc.metadata.StripeInformation) InputStreamSource(io.prestosql.orc.stream.InputStreamSource) DICTIONARY(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) BLOOM_FILTER_UTF8(io.prestosql.orc.metadata.Stream.StreamKind.BLOOM_FILTER_UTF8) Entry(java.util.Map.Entry) Optional(java.util.Optional) InvalidCheckpointException(io.prestosql.orc.checkpoint.InvalidCheckpointException) DICTIONARY_V2(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2) Slice(io.airlift.slice.Slice) 
OrcChunkLoader(io.prestosql.orc.stream.OrcChunkLoader) Logger(io.airlift.log.Logger) ColumnEncodingKind(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind) DICTIONARY_COUNT(io.prestosql.orc.metadata.Stream.StreamKind.DICTIONARY_COUNT) HashMap(java.util.HashMap) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicates(com.google.common.base.Predicates) Math.toIntExact(java.lang.Math.toIntExact) LinkedHashSet(java.util.LinkedHashSet) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) ROW_INDEX(io.prestosql.orc.metadata.Stream.StreamKind.ROW_INDEX) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint) IOException(java.io.IOException) Maps(com.google.common.collect.Maps) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Stream(io.prestosql.orc.metadata.Stream) BLOOM_FILTER(io.prestosql.orc.metadata.Stream.StreamKind.BLOOM_FILTER) ExecutionException(java.util.concurrent.ExecutionException) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ValueStreams(io.prestosql.orc.stream.ValueStreams) OrcReader.handleCacheLoadException(io.prestosql.orc.OrcReader.handleCacheLoadException) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) InputStream(java.io.InputStream) LENGTH(io.prestosql.orc.metadata.Stream.StreamKind.LENGTH) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) ArrayList(java.util.ArrayList) List(java.util.List) 
ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint)

Example 5 with Stream

use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.

From the class FloatColumnWriter, method getIndexStreams.

/**
 * Builds the row index stream for this column: one {@link RowGroupIndex} per
 * entry in {@code rowGroupColumnStatistics}, combining that row group's
 * statistics with the positions derived from its data checkpoint and, when
 * present, its present-stream checkpoint.
 *
 * @param metadataWriter writer used to serialize the row group indexes
 * @return a single-element list holding the serialized {@code ROW_INDEX} stream
 * @throws IOException declared for the serialization of the indexes
 * @throws IllegalStateException if the {@code closed} flag is false
 */
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter) throws IOException {
    checkState(closed);

    List<FloatStreamCheckpoint> dataStreamCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentStreamCheckpoints = presentStream.getCheckpoints();

    ImmutableList.Builder<RowGroupIndex> indexes = ImmutableList.builder();
    for (int rowGroup = 0; rowGroup < rowGroupColumnStatistics.size(); rowGroup++) {
        // effectively-final copy so the lambda below can capture the loop position
        final int current = rowGroup;
        ColumnStatistics groupStatistics = rowGroupColumnStatistics.get(current);
        FloatStreamCheckpoint dataPosition = dataStreamCheckpoints.get(current);
        Optional<BooleanStreamCheckpoint> presentPosition = presentStreamCheckpoints.map(all -> all.get(current));
        indexes.add(new RowGroupIndex(createFloatColumnPositionList(compressed, dataPosition, presentPosition), groupStatistics));
    }

    Slice serializedIndexes = metadataWriter.writeRowIndexes(indexes.build());
    return ImmutableList.of(new StreamDataOutput(serializedIndexes, new Stream(columnId, StreamKind.ROW_INDEX, serializedIndexes.length(), false)));
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) BooleanStreamCheckpoint(io.prestosql.orc.checkpoint.BooleanStreamCheckpoint) ImmutableList(com.google.common.collect.ImmutableList) FloatStreamCheckpoint(io.prestosql.orc.checkpoint.FloatStreamCheckpoint) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) BooleanStreamCheckpoint(io.prestosql.orc.checkpoint.BooleanStreamCheckpoint) FloatStreamCheckpoint(io.prestosql.orc.checkpoint.FloatStreamCheckpoint) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) Slice(io.airlift.slice.Slice) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PresentOutputStream(io.prestosql.orc.stream.PresentOutputStream) Stream(io.prestosql.orc.metadata.Stream) FloatOutputStream(io.prestosql.orc.stream.FloatOutputStream)

Aggregations

Stream (io.prestosql.orc.metadata.Stream)27 ArrayList (java.util.ArrayList)20 List (java.util.List)20 ImmutableList (com.google.common.collect.ImmutableList)19 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)18 Slice (io.airlift.slice.Slice)17 RowGroupIndex (io.prestosql.orc.metadata.RowGroupIndex)16 BooleanStreamCheckpoint (io.prestosql.orc.checkpoint.BooleanStreamCheckpoint)14 StreamDataOutput (io.prestosql.orc.stream.StreamDataOutput)14 PresentOutputStream (io.prestosql.orc.stream.PresentOutputStream)12 OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)11 OrcInputStream (io.prestosql.orc.stream.OrcInputStream)9 InputStream (java.io.InputStream)9 ImmutableMap (com.google.common.collect.ImmutableMap)8 ValueInputStream (io.prestosql.orc.stream.ValueInputStream)8 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)7 LongStreamCheckpoint (io.prestosql.orc.checkpoint.LongStreamCheckpoint)7 LongOutputStream (io.prestosql.orc.stream.LongOutputStream)7 StreamCheckpoint (io.prestosql.orc.checkpoint.StreamCheckpoint)6 HashMap (java.util.HashMap)6