Search in sources :

Example 1 with ColumnStatistics

use of io.prestosql.orc.metadata.statistics.ColumnStatistics in project hetu-core by openlookeng.

In the class StripeReader, the method readColumnIndexes:

/**
 * Reads the row group indexes for every {@code ROW_INDEX} stream in {@code streams}.
 *
 * <p>Streams of any other kind are skipped. When the row index cache is enabled, the
 * indexes are obtained through {@code orcCacheStore.getRowIndexCache()} using a key built
 * from the data source id, its last-modified time, the stripe offset and the stream id;
 * otherwise they are read directly with {@code metadataReader}. If bloom filters are
 * available for the stream's column, each row group index is rebuilt with its column
 * statistics augmented by the bloom filter at the same position.
 *
 * @param streams all streams to inspect, keyed by stream id
 * @param streamsData chunk loaders supplying the raw data of each stream
 * @param bloomFilterIndexes bloom filters per column; a column may be absent or map to an empty list
 * @param stripe the stripe whose offset is used as part of the cache key
 * @return the row group indexes of each {@code ROW_INDEX} stream, keyed by stream id
 * @throws IOException if reading the row indexes fails
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, Map<OrcColumnId, List<HashableBloomFilter>> bloomFilterIndexes, StripeInformation stripe) throws IOException {
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> indexesByStream = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        // Only row index streams carry row group indexes.
        if (streamEntry.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }
        StreamId streamId = streamEntry.getKey();
        OrcInputStream indexInput = new OrcInputStream(streamsData.get(streamId));
        List<HashableBloomFilter> columnBloomFilters = bloomFilterIndexes.get(streamId.getColumnId());

        List<RowGroupIndex> indexes;
        if (!orcCacheProperties.isRowIndexCacheEnabled()) {
            indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
        } else {
            OrcRowIndexCacheKey cacheKey = new OrcRowIndexCacheKey();
            cacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
            cacheKey.setStripeOffset(stripe.getOffset());
            cacheKey.setStreamId(streamId);
            try {
                indexes = orcCacheStore.getRowIndexCache().get(cacheKey, () -> metadataReader.readRowIndexes(hiveWriterVersion, indexInput));
            } catch (UncheckedExecutionException | ExecutionException cacheFailure) {
                // handleCacheLoadException runs first; if it returns normally, the indexes
                // are read directly, bypassing the cache.
                handleCacheLoadException(cacheFailure);
                log.debug(cacheFailure.getCause(), "Error while caching row group indexes. Falling back to default flow");
                indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
            }
        }

        boolean hasBloomFilters = columnBloomFilters != null && !columnBloomFilters.isEmpty();
        if (hasBloomFilters) {
            // Pair the n-th row group index with the n-th bloom filter of the column.
            ImmutableList.Builder<RowGroupIndex> withBloomFilters = ImmutableList.builder();
            int position = 0;
            for (RowGroupIndex index : indexes) {
                ColumnStatistics augmented = index.getColumnStatistics().withBloomFilter(columnBloomFilters.get(position));
                withBloomFilters.add(new RowGroupIndex(index.getPositions(), augmented));
                position++;
            }
            indexes = withBloomFilters.build();
        }
        indexesByStream.put(streamId, indexes);
    }
    return indexesByStream.build();
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ImmutableList(com.google.common.collect.ImmutableList) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ExecutionException(java.util.concurrent.ExecutionException)

Example 2 with ColumnStatistics

use of io.prestosql.orc.metadata.statistics.ColumnStatistics in project hetu-core by openlookeng.

In the class StripeReader, the method getRowGroupStatistics:

/**
 * Collects the column statistics of a single row group across all columns.
 *
 * <p>The result has one entry per column in {@code types}, in column order. A column
 * that has no row group indexes in {@code columnIndexes} gets a {@code null} entry.
 *
 * @param types the column types; only their count is used here
 * @param columnIndexes row group indexes keyed by stream id; must not be null
 * @param rowGroup zero-based row group ordinal; must not be negative
 * @return the per-column statistics for {@code rowGroup}
 * @throws NullPointerException if {@code columnIndexes} is null
 * @throws IllegalArgumentException if {@code rowGroup} is negative
 */
private static ColumnMetadata<ColumnStatistics> getRowGroupStatistics(ColumnMetadata<OrcType> types, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) {
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    // Re-key the indexes by the numeric column id so they can be looked up by ordinal.
    Map<Integer, List<RowGroupIndex>> indexesByColumnId = columnIndexes.entrySet().stream()
            .collect(toImmutableMap(entry -> entry.getKey().getColumnId().getId(), Entry::getValue));

    int columnCount = types.size();
    List<ColumnStatistics> perColumn = new ArrayList<>(columnCount);
    for (int columnId = 0; columnId < columnCount; columnId++) {
        List<RowGroupIndex> indexes = indexesByColumnId.get(columnId);
        perColumn.add(indexes == null ? null : indexes.get(rowGroup).getColumnStatistics());
    }
    return new ColumnMetadata<>(perColumn);
}
Also used : CheckpointInputStreamSource.createCheckpointStreamSource(io.prestosql.orc.stream.CheckpointInputStreamSource.createCheckpointStreamSource) OrcDataReader(io.prestosql.orc.stream.OrcDataReader) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ValueInputStreamSource(io.prestosql.orc.stream.ValueInputStreamSource) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Map(java.util.Map) AggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ImmutableSet(com.google.common.collect.ImmutableSet) OrcTypeKind(io.prestosql.orc.metadata.OrcType.OrcTypeKind) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) HiveWriterVersion(io.prestosql.orc.metadata.PostScript.HiveWriterVersion) Set(java.util.Set) DICTIONARY_DATA(io.prestosql.orc.metadata.Stream.StreamKind.DICTIONARY_DATA) Checkpoints.getStreamCheckpoints(io.prestosql.orc.checkpoint.Checkpoints.getStreamCheckpoints) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) MetadataReader(io.prestosql.orc.metadata.MetadataReader) StripeInformation(io.prestosql.orc.metadata.StripeInformation) InputStreamSource(io.prestosql.orc.stream.InputStreamSource) DICTIONARY(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) BLOOM_FILTER_UTF8(io.prestosql.orc.metadata.Stream.StreamKind.BLOOM_FILTER_UTF8) Entry(java.util.Map.Entry) Optional(java.util.Optional) InvalidCheckpointException(io.prestosql.orc.checkpoint.InvalidCheckpointException) DICTIONARY_V2(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2) Slice(io.airlift.slice.Slice) 
OrcChunkLoader(io.prestosql.orc.stream.OrcChunkLoader) Logger(io.airlift.log.Logger) ColumnEncodingKind(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind) DICTIONARY_COUNT(io.prestosql.orc.metadata.Stream.StreamKind.DICTIONARY_COUNT) HashMap(java.util.HashMap) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicates(com.google.common.base.Predicates) Math.toIntExact(java.lang.Math.toIntExact) LinkedHashSet(java.util.LinkedHashSet) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) ROW_INDEX(io.prestosql.orc.metadata.Stream.StreamKind.ROW_INDEX) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint) IOException(java.io.IOException) Maps(com.google.common.collect.Maps) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Stream(io.prestosql.orc.metadata.Stream) BLOOM_FILTER(io.prestosql.orc.metadata.Stream.StreamKind.BLOOM_FILTER) ExecutionException(java.util.concurrent.ExecutionException) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ValueStreams(io.prestosql.orc.stream.ValueStreams) OrcReader.handleCacheLoadException(io.prestosql.orc.OrcReader.handleCacheLoadException) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) InputStream(java.io.InputStream) LENGTH(io.prestosql.orc.metadata.Stream.StreamKind.LENGTH) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) ArrayList(java.util.ArrayList) List(java.util.List) 
ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint)

Example 3 with ColumnStatistics

use of io.prestosql.orc.metadata.statistics.ColumnStatistics in project hetu-core by openlookeng.

In the class FloatColumnWriter, the method getIndexStreams:

/**
 * Builds the {@code ROW_INDEX} stream for this column.
 *
 * <p>One {@link RowGroupIndex} is produced per entry in {@code rowGroupColumnStatistics},
 * combining that row group's statistics with the position list derived from the data
 * checkpoint and, when present, the present-stream checkpoint at the same ordinal.
 *
 * @param metadataWriter the writer used to serialize the row group indexes
 * @return a single-element list holding the serialized row index stream
 * @throws IOException if serializing the row indexes fails
 * @throws IllegalStateException if this writer has not been closed
 */
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter) throws IOException {
    checkState(closed);

    ImmutableList.Builder<RowGroupIndex> indexBuilder = ImmutableList.builder();
    List<FloatStreamCheckpoint> floatCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> allPresentCheckpoints = presentStream.getCheckpoints();

    for (int rowGroup = 0; rowGroup < rowGroupColumnStatistics.size(); rowGroup++) {
        // Effectively-final copy so the lambda below can capture the ordinal.
        int ordinal = rowGroup;
        ColumnStatistics groupStatistics = rowGroupColumnStatistics.get(ordinal);
        FloatStreamCheckpoint floatCheckpoint = floatCheckpoints.get(ordinal);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = allPresentCheckpoints.map(perGroup -> perGroup.get(ordinal));
        List<Integer> positionList = createFloatColumnPositionList(compressed, floatCheckpoint, presentCheckpoint);
        indexBuilder.add(new RowGroupIndex(positionList, groupStatistics));
    }

    Slice serializedIndexes = metadataWriter.writeRowIndexes(indexBuilder.build());
    StreamDataOutput rowIndexOutput = new StreamDataOutput(
            serializedIndexes,
            new Stream(columnId, StreamKind.ROW_INDEX, serializedIndexes.length(), false));
    return ImmutableList.of(rowIndexOutput);
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) BooleanStreamCheckpoint(io.prestosql.orc.checkpoint.BooleanStreamCheckpoint) ImmutableList(com.google.common.collect.ImmutableList) FloatStreamCheckpoint(io.prestosql.orc.checkpoint.FloatStreamCheckpoint) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) BooleanStreamCheckpoint(io.prestosql.orc.checkpoint.BooleanStreamCheckpoint) FloatStreamCheckpoint(io.prestosql.orc.checkpoint.FloatStreamCheckpoint) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) Slice(io.airlift.slice.Slice) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PresentOutputStream(io.prestosql.orc.stream.PresentOutputStream) Stream(io.prestosql.orc.metadata.Stream) FloatOutputStream(io.prestosql.orc.stream.FloatOutputStream)

Example 4 with ColumnStatistics

use of io.prestosql.orc.metadata.statistics.ColumnStatistics in project hetu-core by openlookeng.

In the class FloatColumnWriter, the method finishRowGroup:

/**
 * Finalizes the statistics of the current row group.
 *
 * <p>The statistics built so far are appended to {@code rowGroupColumnStatistics} and
 * the statistics builder is replaced with a fresh one for the next row group.
 *
 * @return a single-entry map from this writer's column id to the finished statistics
 * @throws IllegalStateException if this writer has already been closed
 */
@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    ColumnStatistics finishedGroupStatistics = statisticsBuilder.buildColumnStatistics();
    // Remember the statistics for this row group so they can be used later.
    rowGroupColumnStatistics.add(finishedGroupStatistics);

    // Start the next row group with an empty builder.
    statisticsBuilder = new DoubleStatisticsBuilder();

    return ImmutableMap.of(columnId, finishedGroupStatistics);
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) DoubleStatisticsBuilder(io.prestosql.orc.metadata.statistics.DoubleStatisticsBuilder)

Example 5 with ColumnStatistics

use of io.prestosql.orc.metadata.statistics.ColumnStatistics in project hetu-core by openlookeng.

In the class ListColumnWriter, the method getIndexStreams:

/**
 * Builds the index streams for this list column and its element column.
 *
 * <p>One {@link RowGroupIndex} is produced per entry in {@code rowGroupColumnStatistics},
 * combining that row group's statistics with the position list derived from the length
 * checkpoint and, when present, the present-stream checkpoint at the same ordinal. The
 * resulting {@code ROW_INDEX} stream comes first in the returned list, followed by the
 * index streams reported by {@code elementWriter}.
 *
 * @param metadataWriter the writer used to serialize the row group indexes
 * @return this column's row index stream followed by the element writer's index streams
 * @throws IOException if serializing the row indexes fails
 * @throws IllegalStateException if this writer has not been closed
 */
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter) throws IOException {
    checkState(closed);

    ImmutableList.Builder<RowGroupIndex> indexBuilder = ImmutableList.builder();
    List<LongStreamCheckpoint> allLengthCheckpoints = lengthStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> allPresentCheckpoints = presentStream.getCheckpoints();

    for (int rowGroup = 0; rowGroup < rowGroupColumnStatistics.size(); rowGroup++) {
        // Effectively-final copy so the lambda below can capture the ordinal.
        int ordinal = rowGroup;
        ColumnStatistics groupStatistics = rowGroupColumnStatistics.get(ordinal);
        LongStreamCheckpoint lengthCheckpoint = allLengthCheckpoints.get(ordinal);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = allPresentCheckpoints.map(perGroup -> perGroup.get(ordinal));
        List<Integer> positionList = createArrayColumnPositionList(compressed, lengthCheckpoint, presentCheckpoint);
        indexBuilder.add(new RowGroupIndex(positionList, groupStatistics));
    }

    Slice serializedIndexes = metadataWriter.writeRowIndexes(indexBuilder.build());
    Stream rowIndexStream = new Stream(columnId, StreamKind.ROW_INDEX, serializedIndexes.length(), false);

    // This column's own row index goes first, then whatever the element writer contributes.
    return ImmutableList.<StreamDataOutput>builder()
            .add(new StreamDataOutput(serializedIndexes, rowIndexStream))
            .addAll(elementWriter.getIndexStreams(metadataWriter))
            .build();
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) BooleanStreamCheckpoint(io.prestosql.orc.checkpoint.BooleanStreamCheckpoint) ImmutableList(com.google.common.collect.ImmutableList) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) LongStreamCheckpoint(io.prestosql.orc.checkpoint.LongStreamCheckpoint) BooleanStreamCheckpoint(io.prestosql.orc.checkpoint.BooleanStreamCheckpoint) LongStreamCheckpoint(io.prestosql.orc.checkpoint.LongStreamCheckpoint) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) Slice(io.airlift.slice.Slice) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) LongOutputStream(io.prestosql.orc.stream.LongOutputStream) LongOutputStream.createLengthOutputStream(io.prestosql.orc.stream.LongOutputStream.createLengthOutputStream) PresentOutputStream(io.prestosql.orc.stream.PresentOutputStream) Stream(io.prestosql.orc.metadata.Stream)

Aggregations

ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)37 ArrayList (java.util.ArrayList)20 ImmutableList (com.google.common.collect.ImmutableList)19 List (java.util.List)19 Slice (io.airlift.slice.Slice)18 Stream (io.prestosql.orc.metadata.Stream)18 RowGroupIndex (io.prestosql.orc.metadata.RowGroupIndex)15 StreamDataOutput (io.prestosql.orc.stream.StreamDataOutput)15 BooleanStreamCheckpoint (io.prestosql.orc.checkpoint.BooleanStreamCheckpoint)12 PresentOutputStream (io.prestosql.orc.stream.PresentOutputStream)12 OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)11 LongStreamCheckpoint (io.prestosql.orc.checkpoint.LongStreamCheckpoint)7 LongOutputStream (io.prestosql.orc.stream.LongOutputStream)7 ImmutableMap (com.google.common.collect.ImmutableMap)6 ColumnMetadata (io.prestosql.orc.metadata.ColumnMetadata)5 ImmutableSet (com.google.common.collect.ImmutableSet)4 Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice)4 ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding)4 StripeFooter (io.prestosql.orc.metadata.StripeFooter)4 HashMap (java.util.HashMap)4