Search in sources :

Example 11 with ColumnEncoding

use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.

the class OrcWriter method bufferStripeData.

/**
 * Collect the data for the stripe.  This is not the actual data, but
 * instead are functions that know how to write the data.
 *
 * <p>The returned list holds, in order: every column writer's index streams,
 * the data streams in the order left by {@code streamLayout.reorder}, and
 * finally the serialized stripe footer.
 *
 * <p>Besides building the list, this method records the stripe: it appends a
 * {@code ClosedStripe} to {@code closedStripes}, grows
 * {@code closedStripesRetainedBytes}, feeds the write validation, hands the
 * index streams and footer to the stripe cache writer when one is present,
 * and reports the stripe to {@code stats}.
 *
 * @param stripeStartOffset start offset of the stripe; stored in the
 *        {@code StripeInformation} and passed to the validation record
 * @param flushReason reason for the flush; forwarded to {@code stats.recordStripeWritten}
 * @return the outputs for this stripe, or an empty list when {@code stripeRowCount} is 0
 * @throws IOException declared by the signature; which call raises it is not visible in this block
 */
private List<DataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    // nothing buffered: no streams, no footer, and no stripe is recorded
    if (stripeRowCount == 0) {
        return ImmutableList.of();
    }
    List<DataOutput> outputData = new ArrayList<>();
    List<Stream> unencryptedStreams = new ArrayList<>(columnWriters.size() * 3);
    // stream descriptors of encrypted columns, keyed by encryption group
    Multimap<Integer, Stream> encryptedStreams = ArrayListMultimap.create();
    // get index streams
    long indexLength = 0;
    // running byte position within the stripe; keeps accumulating across the index and data loops
    long offset = 0;
    // group of the most recently added stream; -1 stands for "unencrypted"
    int previousEncryptionGroup = -1;
    for (ColumnWriter columnWriter : columnWriters) {
        for (StreamDataOutput indexStream : columnWriter.getIndexStreams()) {
            // The ordering is critical because the stream only contains a length with no offset.
            // if the previous stream was part of a different encryption group, need to specify an offset so we know the column order
            outputData.add(indexStream);
            Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(indexStream.getStream().getColumn());
            if (encryptionGroup.isPresent()) {
                // same group as the previous stream: keep the descriptor as is; otherwise attach the current offset
                Stream stream = previousEncryptionGroup == encryptionGroup.get() ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
                encryptedStreams.put(encryptionGroup.get(), stream);
                previousEncryptionGroup = encryptionGroup.get();
            } else {
                // unencrypted stream: an offset is attached only when the previous stream was encrypted
                Stream stream = previousEncryptionGroup == -1 ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
                unencryptedStreams.add(stream);
                previousEncryptionGroup = -1;
            }
            offset += indexStream.size();
            indexLength += indexStream.size();
        }
    }
    // at this point outputData holds only index streams, so the cache writer receives exactly those
    if (dwrfStripeCacheWriter.isPresent()) {
        dwrfStripeCacheWriter.get().addIndexStreams(ImmutableList.copyOf(outputData), indexLength);
    }
    // data streams; the final ordering is decided by streamLayout.reorder below
    long dataLength = 0;
    List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter columnWriter : columnWriters) {
        List<StreamDataOutput> streams = columnWriter.getDataStreams();
        dataStreams.addAll(streams);
        dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
    }
    streamLayout.reorder(dataStreams);
    // add data streams
    for (StreamDataOutput dataStream : dataStreams) {
        // The ordering is critical because the stream only contains a length with no offset.
        // if the previous stream was part of a different encryption group, need to specify an offset so we know the column order
        outputData.add(dataStream);
        Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(dataStream.getStream().getColumn());
        if (encryptionGroup.isPresent()) {
            // previousEncryptionGroup carries over from the last index stream, so the first data stream is compared against it
            Stream stream = previousEncryptionGroup == encryptionGroup.get() ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
            encryptedStreams.put(encryptionGroup.get(), stream);
            previousEncryptionGroup = encryptionGroup.get();
        } else {
            Stream stream = previousEncryptionGroup == -1 ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
            unencryptedStreams.add(stream);
            previousEncryptionGroup = -1;
        }
        offset += dataStream.size();
    }
    // merge the per-writer encodings and stripe statistics into maps keyed by column (node) id
    Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
    Map<Integer, ColumnStatistics> columnStatistics = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
    // the 0th column is a struct column for the whole row
    columnEncodings.put(0, new ColumnEncoding(DIRECT, 0));
    columnStatistics.put(0, new ColumnStatistics((long) stripeRowCount, null));
    // split the encodings: columns without an encryption group go into the plain footer, the rest into encrypted groups
    Map<Integer, ColumnEncoding> unencryptedColumnEncodings = columnEncodings.entrySet().stream().filter(entry -> !dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent()).collect(toImmutableMap(Entry::getKey, Entry::getValue));
    Map<Integer, ColumnEncoding> encryptedColumnEncodings = columnEncodings.entrySet().stream().filter(entry -> dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent()).collect(toImmutableMap(Entry::getKey, Entry::getValue));
    List<Slice> encryptedGroups = createEncryptedGroups(encryptedStreams, encryptedColumnEncodings);
    StripeFooter stripeFooter = new StripeFooter(unencryptedStreams, unencryptedColumnEncodings, encryptedGroups);
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    // the footer is the last output of the stripe
    outputData.add(createDataOutput(footer));
    dwrfStripeCacheWriter.ifPresent(stripeCacheWriter -> stripeCacheWriter.addStripeFooter(createDataOutput(footer)));
    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toDenseList(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length(), OptionalLong.of(stripeRawSize), dwrfEncryptionInfo.getEncryptedKeyMetadatas());
    // remember the closed stripe and account for the memory it retains
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushPolicy.getStripeMinBytes(), flushPolicy.getStripeMaxBytes(), dictionaryMaxMemoryBytes, flushReason, dictionaryCompressionOptimizer.getDictionaryMemoryBytes(), stripeInformation);
    return outputData;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) Page(com.facebook.presto.common.Page) DateTimeZone(org.joda.time.DateTimeZone) DwrfMetadataWriter.toFileStatistics(com.facebook.presto.orc.metadata.DwrfMetadataWriter.toFileStatistics) StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) StreamLayout(com.facebook.presto.orc.writer.StreamLayout) ColumnWriter(com.facebook.presto.orc.writer.ColumnWriter) DwrfEncryption(com.facebook.presto.orc.metadata.DwrfEncryption) DataSink(com.facebook.presto.common.io.DataSink) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) DictionaryColumnWriter(com.facebook.presto.orc.writer.DictionaryColumnWriter) DIRECT(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) DwrfStripeCacheWriter(com.facebook.presto.orc.metadata.DwrfStripeCacheWriter) Slices(io.airlift.slice.Slices) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) CompressedMetadataWriter(com.facebook.presto.orc.metadata.CompressedMetadataWriter) Footer(com.facebook.presto.orc.metadata.Footer) UNENCRYPTED(com.facebook.presto.orc.DwrfEncryptionInfo.UNENCRYPTED) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Collectors(java.util.stream.Collectors) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Preconditions.checkState(com.google.common.base.Preconditions.checkState) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) DataSize(io.airlift.units.DataSize) List(java.util.List) DwrfMetadataWriter.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataWriter.toStripeEncryptionGroup) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) 
ClassLayout(org.openjdk.jol.info.ClassLayout) DWRF(com.facebook.presto.orc.OrcEncoding.DWRF) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Entry(java.util.Map.Entry) Optional(java.util.Optional) Metadata(com.facebook.presto.orc.metadata.Metadata) IntStream(java.util.stream.IntStream) Slice(io.airlift.slice.Slice) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) OrcWriteValidationMode(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode) HashMap(java.util.HashMap) CLOSED(com.facebook.presto.orc.FlushReason.CLOSED) Multimap(com.google.common.collect.Multimap) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) OptionalLong(java.util.OptionalLong) ImmutableList(com.google.common.collect.ImmutableList) MAGIC(com.facebook.presto.orc.metadata.PostScript.MAGIC) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) DataOutput.createDataOutput(com.facebook.presto.common.io.DataOutput.createDataOutput) LastUsedCompressionBufferPool(com.facebook.presto.orc.writer.CompressionBufferPool.LastUsedCompressionBufferPool) OrcType(com.facebook.presto.orc.metadata.OrcType) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) Math.toIntExact(java.lang.Math.toIntExact) Type(com.facebook.presto.common.type.Type) Nullable(javax.annotation.Nullable) Integer.min(java.lang.Integer.min) ColumnWriters.createColumnWriter(com.facebook.presto.orc.writer.ColumnWriters.createColumnWriter) StripeStatistics(com.facebook.presto.orc.metadata.statistics.StripeStatistics) OrcReader.validateFile(com.facebook.presto.orc.OrcReader.validateFile) OrcWriteValidationBuilder(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationBuilder) IOException(java.io.IOException) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) Stream(com.facebook.presto.orc.metadata.Stream) Consumer(java.util.function.Consumer) 
EncryptionGroup(com.facebook.presto.orc.metadata.EncryptionGroup) Collectors.toList(java.util.stream.Collectors.toList) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) CompressionBufferPool(com.facebook.presto.orc.writer.CompressionBufferPool) Closeable(java.io.Closeable) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) VisibleForTesting(com.google.common.annotations.VisibleForTesting) DataOutput(com.facebook.presto.common.io.DataOutput) DwrfEncryptionInfo.createNodeToGroupMap(com.facebook.presto.orc.DwrfEncryptionInfo.createNodeToGroupMap) DataOutput.createDataOutput(com.facebook.presto.common.io.DataOutput.createDataOutput) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) DataOutput(com.facebook.presto.common.io.DataOutput) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) IntStream(java.util.stream.IntStream) Stream(com.facebook.presto.orc.metadata.Stream) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) StripeStatistics(com.facebook.presto.orc.metadata.statistics.StripeStatistics) ColumnWriter(com.facebook.presto.orc.writer.ColumnWriter) DictionaryColumnWriter(com.facebook.presto.orc.writer.DictionaryColumnWriter) ColumnWriters.createColumnWriter(com.facebook.presto.orc.writer.ColumnWriters.createColumnWriter) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation)

Example 12 with ColumnEncoding

use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.

the class StripeReader method addIncludedStreams.

/**
 * Registers every stream whose column is listed in {@code includedOrcColumns}
 * in {@code includedStreams}, keyed by its {@code StreamId}.
 *
 * <p>While doing so, checks each registered {@code IN_DICTIONARY} stream: if
 * the encoding of its column, or the value encoding of any of its additional
 * sequence encodings, is {@code DICTIONARY}, a row group dictionary is
 * reported.
 *
 * @param columnEncodings encodings looked up by the column of each stream
 * @param streams candidate streams to filter
 * @param includedStreams output map that receives the selected streams
 * @return {@code true} if at least one row group dictionary was found
 */
private boolean addIncludedStreams(Map<Integer, ColumnEncoding> columnEncodings, List<Stream> streams, Map<StreamId, Stream> includedStreams) {
    boolean rowGroupDictionaryFound = false;
    for (Stream candidate : streams) {
        // streams of columns that were not requested are ignored entirely
        if (!includedOrcColumns.contains(candidate.getColumn())) {
            continue;
        }
        includedStreams.put(new StreamId(candidate), candidate);

        // only IN_DICTIONARY streams can indicate a row group dictionary
        if (candidate.getStreamKind() != StreamKind.IN_DICTIONARY) {
            continue;
        }

        ColumnEncoding encodingOfColumn = columnEncodings.get(candidate.getColumn());
        boolean columnUsesDictionary = encodingOfColumn.getColumnEncodingKind() == DICTIONARY;

        Optional<SortedMap<Integer, DwrfSequenceEncoding>> sequenceEncodings = encodingOfColumn.getAdditionalSequenceEncodings();
        boolean sequenceUsesDictionary = false;
        if (sequenceEncodings.isPresent()) {
            // stop at the first sequence whose value encoding is a dictionary
            for (DwrfSequenceEncoding sequenceEncoding : sequenceEncodings.get().values()) {
                if (sequenceEncoding.getValueEncoding().getColumnEncodingKind() == DICTIONARY) {
                    sequenceUsesDictionary = true;
                    break;
                }
            }
        }

        if (columnUsesDictionary || sequenceUsesDictionary) {
            rowGroupDictionaryFound = true;
        }
    }
    return rowGroupDictionaryFound;
}
Also used : ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) OrcTypeKind(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) INDEX(com.facebook.presto.orc.metadata.Stream.StreamArea.INDEX) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) RuntimeStats(com.facebook.presto.common.RuntimeStats) StreamKind(com.facebook.presto.orc.metadata.Stream.StreamKind) InvalidCheckpointException(com.facebook.presto.orc.checkpoint.InvalidCheckpointException) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) InputStreamSource(com.facebook.presto.orc.stream.InputStreamSource) Collection(java.util.Collection) Set(java.util.Set) CheckpointInputStreamSource.createCheckpointStreamSource(com.facebook.presto.orc.stream.CheckpointInputStreamSource.createCheckpointStreamSource) ValueInputStreamSource(com.facebook.presto.orc.stream.ValueInputStreamSource) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) List(java.util.List) Entry(java.util.Map.Entry) InputStreamSources(com.facebook.presto.orc.stream.InputStreamSources) Optional(java.util.Optional) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) DICTIONARY(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY) DwrfSequenceEncoding(com.facebook.presto.orc.metadata.DwrfSequenceEncoding) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) SortedMap(java.util.SortedMap) DICTIONARY_DATA(com.facebook.presto.orc.metadata.Stream.StreamKind.DICTIONARY_DATA) 
MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) ROW_INDEX(com.facebook.presto.orc.metadata.Stream.StreamKind.ROW_INDEX) Slice(io.airlift.slice.Slice) LENGTH(com.facebook.presto.orc.metadata.Stream.StreamKind.LENGTH) HashMap(java.util.HashMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) DwrfMetadataReader.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataReader.toStripeEncryptionGroup) Multimap(com.google.common.collect.Multimap) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) HiveWriterVersion(com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) Predicates(com.google.common.base.Predicates) OrcType(com.facebook.presto.orc.metadata.OrcType) Math.toIntExact(java.lang.Math.toIntExact) ImmutableMultimap(com.google.common.collect.ImmutableMultimap) ColumnStatistics.mergeColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.mergeColumnStatistics) BLOOM_FILTER(com.facebook.presto.orc.metadata.Stream.StreamKind.BLOOM_FILTER) NOOP_ORC_LOCAL_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcLocalMemoryContext.NOOP_ORC_LOCAL_MEMORY_CONTEXT) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind) DICTIONARY_V2(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2) STRUCT(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind.STRUCT) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) IOException(java.io.IOException) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) Maps(com.google.common.collect.Maps) Stream(com.facebook.presto.orc.metadata.Stream) Math.multiplyExact(java.lang.Math.multiplyExact) 
StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) Checkpoints.getStreamCheckpoints(com.facebook.presto.orc.checkpoint.Checkpoints.getStreamCheckpoints) ValueStreams(com.facebook.presto.orc.stream.ValueStreams) VisibleForTesting(com.google.common.annotations.VisibleForTesting) MetadataReader(com.facebook.presto.orc.metadata.MetadataReader) InputStream(java.io.InputStream) SortedMap(java.util.SortedMap) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream) DwrfSequenceEncoding(com.facebook.presto.orc.metadata.DwrfSequenceEncoding)

Example 13 with ColumnEncoding

use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.

the class MapFlatSelectiveStreamReader method startStripe.

/**
 * Resets this reader for a new stripe.
 *
 * <p>Clears the per-stripe stream sources, descriptors and value readers, then
 * walks the additional sequence encodings of the base value stream. For every
 * sequence accepted by {@code isRequiredKey} it records the sequence's position
 * in {@code keyIndices}, registers a missing in-map stream source, and creates
 * and starts a value stream reader. Finally it rebuilds {@code keyBlock} and
 * resets the read position and row group state.
 *
 * @param stripe the stripe about to be read
 * @throws IOException declared by the signature; the nested readers' {@code startStripe} is called here
 */
@Override
public void startStripe(Stripe stripe) throws IOException {
    presentStreamSource = missingStreamSource(BooleanInputStream.class);
    inMapStreamSources.clear();
    valueStreamDescriptors.clear();
    valueStreamReaders.clear();

    ColumnEncoding encoding = stripe.getColumnEncodings().get(baseValueStreamDescriptor.getStreamId());
    // encoding or encoding.getAdditionalSequenceEncodings() may not be present when every map is empty or null
    SortedMap<Integer, DwrfSequenceEncoding> sequenceEncodings;
    if (encoding == null || !encoding.getAdditionalSequenceEncodings().isPresent()) {
        sequenceEncodings = Collections.emptySortedMap();
    } else {
        sequenceEncodings = encoding.getAdditionalSequenceEncodings().get();
    }

    keyIndices = ensureCapacity(keyIndices, sequenceEncodings.size());
    keyCount = 0;

    // The ColumnEncoding with sequence ID 0 doesn't have any data associated with it
    int position = 0;
    for (Map.Entry<Integer, DwrfSequenceEncoding> sequenceEntry : sequenceEncodings.entrySet()) {
        // every entry consumes one position, whether or not its key is required
        int currentPosition = position;
        position++;
        if (!isRequiredKey(sequenceEntry.getValue())) {
            continue;
        }

        keyIndices[keyCount] = currentPosition;
        keyCount++;

        int sequenceId = sequenceEntry.getKey();
        inMapStreamSources.add(missingStreamSource(BooleanInputStream.class));

        StreamDescriptor sequenceDescriptor = copyStreamDescriptorWithSequence(baseValueStreamDescriptor, sequenceId);
        valueStreamDescriptors.add(sequenceDescriptor);

        SelectiveStreamReader sequenceReader = SelectiveStreamReaders.createStreamReader(
                sequenceDescriptor,
                ImmutableBiMap.of(),
                Optional.ofNullable(outputType).map(MapType::getValueType),
                ImmutableList.of(),
                hiveStorageTimeZone,
                options,
                legacyMapSubscript,
                systemMemoryContext.newOrcAggregatedMemoryContext());
        sequenceReader.startStripe(stripe);
        valueStreamReaders.add(sequenceReader);
    }

    keyBlock = getKeysBlock(ImmutableList.copyOf(sequenceEncodings.values()));
    readOffset = 0;
    presentStream = null;
    rowGroupOpen = false;
}
Also used : ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) BooleanInputStream(com.facebook.presto.orc.stream.BooleanInputStream) Map(java.util.Map) SortedMap(java.util.SortedMap) ImmutableBiMap(com.google.common.collect.ImmutableBiMap) StreamDescriptor(com.facebook.presto.orc.StreamDescriptor) DwrfSequenceEncoding(com.facebook.presto.orc.metadata.DwrfSequenceEncoding)

Aggregations

ColumnEncoding (com.facebook.presto.orc.metadata.ColumnEncoding)13 StripeFooter (com.facebook.presto.orc.metadata.StripeFooter)6 Stream (com.facebook.presto.orc.metadata.Stream)5 ImmutableList (com.google.common.collect.ImmutableList)5 ImmutableMap (com.google.common.collect.ImmutableMap)5 HashMap (java.util.HashMap)5 List (java.util.List)5 StripeEncryptionGroup (com.facebook.presto.orc.metadata.StripeEncryptionGroup)4 StripeInformation (com.facebook.presto.orc.metadata.StripeInformation)4 ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics)4 Slice (io.airlift.slice.Slice)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 InvalidCheckpointException (com.facebook.presto.orc.checkpoint.InvalidCheckpointException)3 ColumnEncodingKind (com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind)3 OrcType (com.facebook.presto.orc.metadata.OrcType)3 VisibleForTesting (com.google.common.annotations.VisibleForTesting)3 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)3 Preconditions.checkState (com.google.common.base.Preconditions.checkState)3 ImmutableSet (com.google.common.collect.ImmutableSet)3