Search in sources :

Example 1 with ColumnMetadata

use of io.prestosql.orc.metadata.ColumnMetadata in project hetu-core by openlookeng.

the class StripeReader method getRowGroupStatistics.

/**
 * Builds the per-column statistics for a single row group.
 *
 * <p>The result has one slot per entry in {@code types}. A slot holds the statistics taken
 * from the column's row group index at position {@code rowGroup}, or {@code null} when
 * {@code columnIndexes} has no entry for that column id.
 *
 * @param types the column types; only its size is used, to decide how many slots to produce
 * @param columnIndexes row group indexes keyed by stream id; must not be null
 * @param rowGroup zero-based position of the row group; must not be negative
 * @return statistics indexed by column id, with {@code null} for columns that have no index
 */
private static ColumnMetadata<ColumnStatistics> getRowGroupStatistics(ColumnMetadata<OrcType> types, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) {
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    // Re-key the indexes by the plain integer column id so they can be looked up by loop position.
    Map<Integer, List<RowGroupIndex>> indexesByColumnId = columnIndexes.entrySet().stream()
            .collect(toImmutableMap(entry -> entry.getKey().getColumnId().getId(), Entry::getValue));

    int columnCount = types.size();
    List<ColumnStatistics> perColumn = new ArrayList<>(columnCount);
    for (int columnId = 0; columnId < columnCount; columnId++) {
        List<RowGroupIndex> indexes = indexesByColumnId.get(columnId);
        // Columns without an index keep a null placeholder so positions stay aligned with column ids.
        perColumn.add(indexes == null ? null : indexes.get(rowGroup).getColumnStatistics());
    }
    return new ColumnMetadata<>(perColumn);
}
Also used : CheckpointInputStreamSource.createCheckpointStreamSource(io.prestosql.orc.stream.CheckpointInputStreamSource.createCheckpointStreamSource) OrcDataReader(io.prestosql.orc.stream.OrcDataReader) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ValueInputStreamSource(io.prestosql.orc.stream.ValueInputStreamSource) InputStreamSources(io.prestosql.orc.stream.InputStreamSources) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Map(java.util.Map) AggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ImmutableSet(com.google.common.collect.ImmutableSet) OrcTypeKind(io.prestosql.orc.metadata.OrcType.OrcTypeKind) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) HiveWriterVersion(io.prestosql.orc.metadata.PostScript.HiveWriterVersion) Set(java.util.Set) DICTIONARY_DATA(io.prestosql.orc.metadata.Stream.StreamKind.DICTIONARY_DATA) Checkpoints.getStreamCheckpoints(io.prestosql.orc.checkpoint.Checkpoints.getStreamCheckpoints) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) MetadataReader(io.prestosql.orc.metadata.MetadataReader) StripeInformation(io.prestosql.orc.metadata.StripeInformation) InputStreamSource(io.prestosql.orc.stream.InputStreamSource) DICTIONARY(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) BLOOM_FILTER_UTF8(io.prestosql.orc.metadata.Stream.StreamKind.BLOOM_FILTER_UTF8) Entry(java.util.Map.Entry) Optional(java.util.Optional) InvalidCheckpointException(io.prestosql.orc.checkpoint.InvalidCheckpointException) DICTIONARY_V2(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2) Slice(io.airlift.slice.Slice) 
OrcChunkLoader(io.prestosql.orc.stream.OrcChunkLoader) Logger(io.airlift.log.Logger) ColumnEncodingKind(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind) DICTIONARY_COUNT(io.prestosql.orc.metadata.Stream.StreamKind.DICTIONARY_COUNT) HashMap(java.util.HashMap) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicates(com.google.common.base.Predicates) Math.toIntExact(java.lang.Math.toIntExact) LinkedHashSet(java.util.LinkedHashSet) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) ROW_INDEX(io.prestosql.orc.metadata.Stream.StreamKind.ROW_INDEX) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint) IOException(java.io.IOException) Maps(com.google.common.collect.Maps) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Stream(io.prestosql.orc.metadata.Stream) BLOOM_FILTER(io.prestosql.orc.metadata.Stream.StreamKind.BLOOM_FILTER) ExecutionException(java.util.concurrent.ExecutionException) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ValueStreams(io.prestosql.orc.stream.ValueStreams) OrcReader.handleCacheLoadException(io.prestosql.orc.OrcReader.handleCacheLoadException) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) InputStream(java.io.InputStream) LENGTH(io.prestosql.orc.metadata.Stream.StreamKind.LENGTH) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) ArrayList(java.util.ArrayList) List(java.util.List) 
ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint)

Example 2 with ColumnMetadata

use of io.prestosql.orc.metadata.ColumnMetadata in project hetu-core by openlookeng.

the class AbstractOrcRecordReader method close.

/**
 * Closes the data source and all non-null column readers, then runs the optional
 * write-validation checks.
 *
 * <p>When a write checksum builder is present, the total row count, every per-column hash
 * and the stripe hash are each checked through {@code validateWrite}. When file statistics
 * validation is present, the collected statistics are handed to the write validation.
 *
 * @throws IOException declared by the signature; presumably raised when closing a resource
 *         or when a validation check fails
 */
@Override
public void close() throws IOException {
    // Everything registered here is closed when the try block exits.
    try (Closer resources = Closer.create()) {
        resources.register(orcDataSource);
        for (AbstractColumnReader reader : columnReaders) {
            if (reader == null) {
                continue;
            }
            resources.register(reader::close);
        }
    }

    if (writeChecksumBuilder.isPresent()) {
        OrcWriteValidation.WriteChecksum computed = writeChecksumBuilder.get().build();
        validateWrite(
                validation -> validation.getChecksum().getTotalRowCount() == computed.getTotalRowCount(),
                "Invalid row count");

        List<Long> computedHashes = computed.getColumnHashes();
        for (int position = 0; position < computedHashes.size(); position++) {
            // Lambdas can only capture effectively final locals, so copy the loop counter.
            int columnIndex = position;
            validateWrite(
                    validation -> validation.getChecksum().getColumnHashes().get(columnIndex).equals(computedHashes.get(columnIndex)),
                    "Invalid checksum for column %s",
                    columnIndex);
        }

        validateWrite(
                validation -> validation.getChecksum().getStripeHash() == computed.getStripeHash(),
                "Invalid stripes checksum");
    }

    if (fileStatisticsValidation.isPresent()) {
        Optional<ColumnMetadata<ColumnStatistics>> collectedStatistics = fileStatisticsValidation.get().build();
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), collectedStatistics);
    }
}
Also used : Closer(com.google.common.io.Closer) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Comparator.comparingLong(java.util.Comparator.comparingLong) AbstractColumnReader(io.prestosql.orc.reader.AbstractColumnReader)

Example 3 with ColumnMetadata

use of io.prestosql.orc.metadata.ColumnMetadata in project hetu-core by openlookeng.

the class TestOrcBloomFilters method testMatches.

// Checks predicate matching for a single-value predicate on ROOT_COLUMN against column
// statistics that carry a matching bloom filter, a non-matching bloom filter, and no
// bloom filter at all; also checks that an empty predicate matches.
@Test
public void testMatches() {
    TupleDomainOrcPredicate emptyPredicate = TupleDomainOrcPredicate.builder().build();
    TupleDomainOrcPredicate singleValuePredicate = TupleDomainOrcPredicate.builder()
            .setBloomFiltersEnabled(true)
            .addColumn(ROOT_COLUMN, Domain.singleValue(BIGINT, 1234L))
            .build();

    // Snapshot the filter while it is still empty, then add the value and snapshot again.
    HashableBloomFilter filter = new HashableBloomFilter(1000, 0.01);
    OrcProto.BloomFilter emptySnapshot = toOrcBloomFilter(filter);
    filter.addLong(1234);
    OrcProto.BloomFilter populatedSnapshot = toOrcBloomFilter(filter);

    ColumnMetadata<ColumnStatistics> matching = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, toBloomFilter(populatedSnapshot))));
    ColumnMetadata<ColumnStatistics> nonMatching = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, toBloomFilter(emptySnapshot))));
    ColumnMetadata<ColumnStatistics> withoutBloomFilter = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, null)));

    assertTrue(singleValuePredicate.matches(1L, matching));
    assertTrue(singleValuePredicate.matches(1L, withoutBloomFilter));
    assertFalse(singleValuePredicate.matches(1L, nonMatching));
    assertTrue(emptyPredicate.matches(1L, matching));
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) OrcProto(io.prestosql.orc.proto.OrcProto) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 4 with ColumnMetadata

use of io.prestosql.orc.metadata.ColumnMetadata in project hetu-core by openlookeng.

the class OrcWriter method toFileStats.

/**
 * Merges per-stripe column statistics into file-level column statistics.
 *
 * <p>For every column position, the statistics of that column are gathered from each stripe
 * and combined with {@code ColumnStatistics.mergeColumnStatistics}.
 *
 * @param stripes the column statistics of each stripe; all entries must have the same size
 * @return the merged statistics, or an empty {@code Optional} when {@code stripes} is empty
 * @throws IllegalArgumentException if the stripes do not all have the same column count
 */
private static Optional<ColumnMetadata<ColumnStatistics>> toFileStats(List<ColumnMetadata<ColumnStatistics>> stripes) {
    if (stripes.isEmpty()) {
        return Optional.empty();
    }

    // The first stripe sets the expected width; every other stripe must agree with it.
    int expectedColumns = stripes.get(0).size();
    boolean uniformWidth = stripes.stream().noneMatch(stripe -> stripe.size() != expectedColumns);
    checkArgument(uniformWidth);

    ImmutableList.Builder<ColumnStatistics> merged = ImmutableList.builder();
    for (int ordinal = 0; ordinal < expectedColumns; ordinal++) {
        OrcColumnId id = new OrcColumnId(ordinal);
        List<ColumnStatistics> acrossStripes = stripes.stream()
                .map(stripe -> stripe.get(id))
                .collect(toList());
        merged.add(ColumnStatistics.mergeColumnStatistics(acrossStripes));
    }
    return Optional.of(new ColumnMetadata<>(merged.build()));
}
Also used : Footer(io.prestosql.orc.metadata.Footer) CLOSED(io.prestosql.orc.OrcWriterStats.FlushReason.CLOSED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Slices(io.airlift.slice.Slices) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Map(java.util.Map) ColumnWriters.createColumnWriter(io.prestosql.orc.writer.ColumnWriters.createColumnWriter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Type(io.prestosql.spi.type.Type) Metadata(io.prestosql.orc.metadata.Metadata) ImmutableSet(com.google.common.collect.ImmutableSet) OrcMetadataWriter(io.prestosql.orc.metadata.OrcMetadataWriter) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) OrcWriteValidationMode(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode) MAX_BYTES(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_BYTES) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) StripeInformation(io.prestosql.orc.metadata.StripeInformation) List(java.util.List) ClassLayout(org.openjdk.jol.info.ClassLayout) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Entry(java.util.Map.Entry) Optional(java.util.Optional) FlushReason(io.prestosql.orc.OrcWriterStats.FlushReason) OrcWriteValidationBuilder(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationBuilder) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) MAX_ROWS(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_ROWS) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) 
Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnWriter(io.prestosql.orc.writer.ColumnWriter) ROOT_COLUMN(io.prestosql.orc.metadata.OrcColumnId.ROOT_COLUMN) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) Integer.min(java.lang.Integer.min) OrcReader.validateFile(io.prestosql.orc.OrcReader.validateFile) UnsignedBytes(com.google.common.primitives.UnsignedBytes) DICTIONARY_FULL(io.prestosql.orc.OrcWriterStats.FlushReason.DICTIONARY_FULL) SliceDictionaryColumnWriter(io.prestosql.orc.writer.SliceDictionaryColumnWriter) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) OrcDataOutput.createDataOutput(io.prestosql.orc.stream.OrcDataOutput.createDataOutput) Page(io.prestosql.spi.Page) IOException(java.io.IOException) DIRECT(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Stream(io.prestosql.orc.metadata.Stream) Consumer(java.util.function.Consumer) Collectors.toList(java.util.stream.Collectors.toList) Closeable(java.io.Closeable) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) CompressedMetadataWriter(io.prestosql.orc.metadata.CompressedMetadataWriter) Collections(java.util.Collections) MAGIC(io.prestosql.orc.metadata.PostScript.MAGIC) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList)

Example 5 with ColumnMetadata

use of io.prestosql.orc.metadata.ColumnMetadata in project hetu-core by openlookeng.

the class OrcWriter method bufferFileFooter.

/**
 * Collect the data for the file footer.  This is not the actual data, but
 * instead are functions that know how to write the data.
 *
 * <p>The returned list holds, in order: the stripe-statistics metadata, the footer, the
 * postscript, and a trailing buffer carrying the postscript length as an unsigned byte.
 *
 * <p>Side effects visible in this method: the optional pre-close callback is invoked
 * (failures are logged and otherwise ignored), file statistics and the metadata version are
 * passed to {@code recordValidation}, {@code closedStripes} is cleared and
 * {@code closedStripesRetainedBytes} is reset to zero.
 *
 * @return the outputs that make up the tail of the file, in write order
 * @throws IOException declared by the signature; presumably propagated from the metadata writer
 */
private List<OrcDataOutput> bufferFileFooter() throws IOException {
    if (preCloseCallback.isPresent()) {
        try {
            preCloseCallback.get().call();
        } catch (Exception e) {
            // The callback is best-effort: a failure must not stop the footer from being
            // written, but the cause is attached to the log entry so it can be diagnosed.
            log.debug(e, "Call pre close call back error");
        }
    }
    List<OrcDataOutput> outputData = new ArrayList<>();

    // Metadata section: one statistics entry per closed stripe.
    Metadata metadata = new Metadata(closedStripes.stream().map(ClosedStripe::getStatistics).map(Optional::of).collect(toList()));
    Slice metadataSlice = metadataWriter.writeMetadata(metadata);
    outputData.add(createDataOutput(metadataSlice));

    // Footer section: totals and file-level statistics derived from the closed stripes.
    long numberOfRows = closedStripes.stream().mapToLong(stripe -> stripe.getStripeInformation().getNumberOfRows()).sum();
    Optional<ColumnMetadata<ColumnStatistics>> fileStats = toFileStats(closedStripes.stream().map(ClosedStripe::getStatistics).map(StripeStatistics::getColumnStatistics).collect(toList()));
    recordValidation(validation -> validation.setFileStatistics(fileStats));
    Map<String, Slice> localUserMetadata = this.userMetadata.entrySet().stream().collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));
    Footer footer = new Footer(numberOfRows, rowGroupMaxRowCount, closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toImmutableList()), orcTypes, fileStats, localUserMetadata);

    // The stripes are captured in the footer above, so the retained bookkeeping is reset here.
    closedStripes.clear();
    closedStripesRetainedBytes = 0;
    Slice footerSlice = metadataWriter.writeFooter(footer);
    outputData.add(createDataOutput(footerSlice));
    recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));

    // Postscript section, followed by its own length; checkedCast rejects lengths that do not fit in an unsigned byte.
    Slice postscriptSlice = metadataWriter.writePostscript(footerSlice.length(), metadataSlice.length(), compression, maxCompressionBufferSize);
    outputData.add(createDataOutput(postscriptSlice));
    outputData.add(createDataOutput(Slices.wrappedBuffer(UnsignedBytes.checkedCast(postscriptSlice.length()))));
    return outputData;
}
Also used : Footer(io.prestosql.orc.metadata.Footer) CLOSED(io.prestosql.orc.OrcWriterStats.FlushReason.CLOSED) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Slices(io.airlift.slice.Slices) StripeFooter(io.prestosql.orc.metadata.StripeFooter) Map(java.util.Map) ColumnWriters.createColumnWriter(io.prestosql.orc.writer.ColumnWriters.createColumnWriter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Type(io.prestosql.spi.type.Type) Metadata(io.prestosql.orc.metadata.Metadata) ImmutableSet(com.google.common.collect.ImmutableSet) OrcMetadataWriter(io.prestosql.orc.metadata.OrcMetadataWriter) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) OrcWriteValidationMode(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode) MAX_BYTES(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_BYTES) StreamDataOutput(io.prestosql.orc.stream.StreamDataOutput) Collectors(java.util.stream.Collectors) ZoneId(java.time.ZoneId) Preconditions.checkState(com.google.common.base.Preconditions.checkState) StripeInformation(io.prestosql.orc.metadata.StripeInformation) List(java.util.List) ClassLayout(org.openjdk.jol.info.ClassLayout) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Entry(java.util.Map.Entry) Optional(java.util.Optional) FlushReason(io.prestosql.orc.OrcWriterStats.FlushReason) OrcWriteValidationBuilder(io.prestosql.orc.OrcWriteValidation.OrcWriteValidationBuilder) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) MAX_ROWS(io.prestosql.orc.OrcWriterStats.FlushReason.MAX_ROWS) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) 
Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnWriter(io.prestosql.orc.writer.ColumnWriter) ROOT_COLUMN(io.prestosql.orc.metadata.OrcColumnId.ROOT_COLUMN) Math.toIntExact(java.lang.Math.toIntExact) Nullable(javax.annotation.Nullable) Integer.min(java.lang.Integer.min) OrcReader.validateFile(io.prestosql.orc.OrcReader.validateFile) UnsignedBytes(com.google.common.primitives.UnsignedBytes) DICTIONARY_FULL(io.prestosql.orc.OrcWriterStats.FlushReason.DICTIONARY_FULL) SliceDictionaryColumnWriter(io.prestosql.orc.writer.SliceDictionaryColumnWriter) ColumnEncoding(io.prestosql.orc.metadata.ColumnEncoding) OrcType(io.prestosql.orc.metadata.OrcType) OrcDataOutput.createDataOutput(io.prestosql.orc.stream.OrcDataOutput.createDataOutput) Page(io.prestosql.spi.Page) IOException(java.io.IOException) DIRECT(io.prestosql.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Stream(io.prestosql.orc.metadata.Stream) Consumer(java.util.function.Consumer) Collectors.toList(java.util.stream.Collectors.toList) Closeable(java.io.Closeable) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) CompressedMetadataWriter(io.prestosql.orc.metadata.CompressedMetadataWriter) Collections(java.util.Collections) MAGIC(io.prestosql.orc.metadata.PostScript.MAGIC) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) Optional(java.util.Optional) ArrayList(java.util.ArrayList) Metadata(io.prestosql.orc.metadata.Metadata) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) OrcDataOutput(io.prestosql.orc.stream.OrcDataOutput) IOException(java.io.IOException) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) Footer(io.prestosql.orc.metadata.Footer) StripeFooter(io.prestosql.orc.metadata.StripeFooter)

Aggregations

ColumnMetadata (io.prestosql.orc.metadata.ColumnMetadata)7 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)6 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)5 Logger (io.airlift.log.Logger)5 Slice (io.airlift.slice.Slice)5 OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)5 OrcType (io.prestosql.orc.metadata.OrcType)5 StripeInformation (io.prestosql.orc.metadata.StripeInformation)5 Preconditions.checkState (com.google.common.base.Preconditions.checkState)4 ImmutableList (com.google.common.collect.ImmutableList)4 ImmutableSet (com.google.common.collect.ImmutableSet)4 ColumnEncoding (io.prestosql.orc.metadata.ColumnEncoding)4 Stream (io.prestosql.orc.metadata.Stream)4 StripeFooter (io.prestosql.orc.metadata.StripeFooter)4 IOException (java.io.IOException)4 Math.toIntExact (java.lang.Math.toIntExact)4 ArrayList (java.util.ArrayList)4 List (java.util.List)4 Map (java.util.Map)4 Objects.requireNonNull (java.util.Objects.requireNonNull)4