Search in sources :

Example 16 with StreamDataOutput

use of com.facebook.presto.orc.stream.StreamDataOutput in project presto by prestodb.

From the class OrcWriter, method bufferStripeData:

/**
 * Collect the data for the stripe.  This is not the actual data, but
 * rather a list of outputs that know how to write the data.
 *
 * <p>The returned list contains, in order: the index streams of every column writer,
 * the data streams in the order produced by {@code streamLayout.reorder}, and finally
 * the serialized stripe footer.
 *
 * <p>Side effects visible in this method: the index streams and the footer are handed
 * to {@code dwrfStripeCacheWriter} when it is present; the stripe statistics and row
 * count are reported through {@code recordValidation}; a new {@code ClosedStripe} is
 * appended to {@code closedStripes} and {@code closedStripesRetainedBytes} is increased
 * accordingly; and the stripe is reported to {@code stats}.
 *
 * @param stripeStartOffset start offset of this stripe (presumably its position in the
 *        output file -- confirm against the caller); stored in the
 *        {@code StripeInformation} and passed to the validation statistics
 * @param flushReason why the stripe is being flushed; only forwarded to
 *        {@code stats.recordStripeWritten}
 * @return the outputs to write for this stripe, or an empty list when
 *         {@code stripeRowCount} is zero
 * @throws IOException declared by the signature; which callee raises it cannot be
 *         determined from this method alone
 */
private List<DataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    // a stripe without rows produces no output and records nothing
    if (stripeRowCount == 0) {
        return ImmutableList.of();
    }
    List<DataOutput> outputData = new ArrayList<>();
    // stream descriptors for the footer, split into unencrypted ones and ones keyed by encryption group
    List<Stream> unencryptedStreams = new ArrayList<>(columnWriters.size() * 3);
    Multimap<Integer, Stream> encryptedStreams = ArrayListMultimap.create();
    // get index streams
    long indexLength = 0;
    // running sum of the sizes of all streams added so far; keeps growing through the data stream loop below
    long offset = 0;
    // encryption group of the most recently added stream; -1 means that stream was unencrypted
    int previousEncryptionGroup = -1;
    for (ColumnWriter columnWriter : columnWriters) {
        for (StreamDataOutput indexStream : columnWriter.getIndexStreams()) {
            // The ordering is critical because the stream only contains a length with no offset.
            // if the previous stream was part of a different encryption group, need to specify an offset so we know the column order
            outputData.add(indexStream);
            Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(indexStream.getStream().getColumn());
            if (encryptionGroup.isPresent()) {
                // same group as the previous stream: keep the descriptor as is; otherwise attach the current offset
                Stream stream = previousEncryptionGroup == encryptionGroup.get() ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
                encryptedStreams.put(encryptionGroup.get(), stream);
                previousEncryptionGroup = encryptionGroup.get();
            } else {
                // unencrypted stream: an offset is attached only when the previous stream was encrypted
                Stream stream = previousEncryptionGroup == -1 ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
                unencryptedStreams.add(stream);
                previousEncryptionGroup = -1;
            }
            offset += indexStream.size();
            // indexLength counts index streams only, unlike offset
            indexLength += indexStream.size();
        }
    }
    // at this point outputData holds only index streams; the cache writer receives an immutable snapshot of them
    if (dwrfStripeCacheWriter.isPresent()) {
        dwrfStripeCacheWriter.get().addIndexStreams(ImmutableList.copyOf(outputData), indexLength);
    }
    // data streams (their final order is decided by streamLayout.reorder below)
    long dataLength = 0;
    List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter columnWriter : columnWriters) {
        List<StreamDataOutput> streams = columnWriter.getDataStreams();
        dataStreams.addAll(streams);
        // dataLength is the sum of the sizes of all data streams
        dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
    }
    // the return value is not used, so the list is presumably reordered in place -- confirm in StreamLayout
    streamLayout.reorder(dataStreams);
    // add data streams
    for (StreamDataOutput dataStream : dataStreams) {
        // The ordering is critical because the stream only contains a length with no offset.
        // if the previous stream was part of a different encryption group, need to specify an offset so we know the column order
        outputData.add(dataStream);
        Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(dataStream.getStream().getColumn());
        if (encryptionGroup.isPresent()) {
            // same rule as for index streams; previousEncryptionGroup and offset carry over from the index loop
            Stream stream = previousEncryptionGroup == encryptionGroup.get() ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
            encryptedStreams.put(encryptionGroup.get(), stream);
            previousEncryptionGroup = encryptionGroup.get();
        } else {
            Stream stream = previousEncryptionGroup == -1 ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
            unencryptedStreams.add(stream);
            previousEncryptionGroup = -1;
        }
        offset += dataStream.size();
    }
    // merge the per-writer column encodings and stripe statistics into single maps keyed by column id
    Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
    Map<Integer, ColumnStatistics> columnStatistics = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
    // the 0th column is a struct column for the whole row
    columnEncodings.put(0, new ColumnEncoding(DIRECT, 0));
    columnStatistics.put(0, new ColumnStatistics((long) stripeRowCount, null));
    // partition the encodings by whether the column belongs to an encryption group
    Map<Integer, ColumnEncoding> unencryptedColumnEncodings = columnEncodings.entrySet().stream().filter(entry -> !dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent()).collect(toImmutableMap(Entry::getKey, Entry::getValue));
    Map<Integer, ColumnEncoding> encryptedColumnEncodings = columnEncodings.entrySet().stream().filter(entry -> dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent()).collect(toImmutableMap(Entry::getKey, Entry::getValue));
    List<Slice> encryptedGroups = createEncryptedGroups(encryptedStreams, encryptedColumnEncodings);
    StripeFooter stripeFooter = new StripeFooter(unencryptedStreams, unencryptedColumnEncodings, encryptedGroups);
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    // the serialized footer is the last output of the stripe
    outputData.add(createDataOutput(footer));
    // the stripe cache writer, when present, also receives its own output wrapping the same footer slice
    dwrfStripeCacheWriter.ifPresent(stripeCacheWriter -> stripeCacheWriter.addStripeFooter(createDataOutput(footer)));
    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toDenseList(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length(), OptionalLong.of(stripeRawSize), dwrfEncryptionInfo.getEncryptedKeyMetadatas());
    // remember the closed stripe and account for the memory it retains
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushPolicy.getStripeMinBytes(), flushPolicy.getStripeMaxBytes(), dictionaryMaxMemoryBytes, flushReason, dictionaryCompressionOptimizer.getDictionaryMemoryBytes(), stripeInformation);
    return outputData;
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) Page(com.facebook.presto.common.Page) DateTimeZone(org.joda.time.DateTimeZone) DwrfMetadataWriter.toFileStatistics(com.facebook.presto.orc.metadata.DwrfMetadataWriter.toFileStatistics) StripeEncryptionGroup(com.facebook.presto.orc.metadata.StripeEncryptionGroup) StreamLayout(com.facebook.presto.orc.writer.StreamLayout) ColumnWriter(com.facebook.presto.orc.writer.ColumnWriter) DwrfEncryption(com.facebook.presto.orc.metadata.DwrfEncryption) DataSink(com.facebook.presto.common.io.DataSink) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) DictionaryColumnWriter(com.facebook.presto.orc.writer.DictionaryColumnWriter) DIRECT(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind.DIRECT) DwrfStripeCacheWriter(com.facebook.presto.orc.metadata.DwrfStripeCacheWriter) Slices(io.airlift.slice.Slices) Map(java.util.Map) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) CompressedMetadataWriter(com.facebook.presto.orc.metadata.CompressedMetadataWriter) Footer(com.facebook.presto.orc.metadata.Footer) UNENCRYPTED(com.facebook.presto.orc.DwrfEncryptionInfo.UNENCRYPTED) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Collectors(java.util.stream.Collectors) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Preconditions.checkState(com.google.common.base.Preconditions.checkState) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) DataSize(io.airlift.units.DataSize) List(java.util.List) DwrfMetadataWriter.toStripeEncryptionGroup(com.facebook.presto.orc.metadata.DwrfMetadataWriter.toStripeEncryptionGroup) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) 
ClassLayout(org.openjdk.jol.info.ClassLayout) DWRF(com.facebook.presto.orc.OrcEncoding.DWRF) CompressionKind(com.facebook.presto.orc.metadata.CompressionKind) Entry(java.util.Map.Entry) Optional(java.util.Optional) Metadata(com.facebook.presto.orc.metadata.Metadata) IntStream(java.util.stream.IntStream) Slice(io.airlift.slice.Slice) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) OrcWriteValidationMode(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationMode) HashMap(java.util.HashMap) CLOSED(com.facebook.presto.orc.FlushReason.CLOSED) Multimap(com.google.common.collect.Multimap) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) DynamicSliceOutput(io.airlift.slice.DynamicSliceOutput) OptionalLong(java.util.OptionalLong) ImmutableList(com.google.common.collect.ImmutableList) MAGIC(com.facebook.presto.orc.metadata.PostScript.MAGIC) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) DataOutput.createDataOutput(com.facebook.presto.common.io.DataOutput.createDataOutput) LastUsedCompressionBufferPool(com.facebook.presto.orc.writer.CompressionBufferPool.LastUsedCompressionBufferPool) OrcType(com.facebook.presto.orc.metadata.OrcType) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) Math.toIntExact(java.lang.Math.toIntExact) Type(com.facebook.presto.common.type.Type) Nullable(javax.annotation.Nullable) Integer.min(java.lang.Integer.min) ColumnWriters.createColumnWriter(com.facebook.presto.orc.writer.ColumnWriters.createColumnWriter) StripeStatistics(com.facebook.presto.orc.metadata.statistics.StripeStatistics) OrcReader.validateFile(com.facebook.presto.orc.OrcReader.validateFile) OrcWriteValidationBuilder(com.facebook.presto.orc.OrcWriteValidation.OrcWriteValidationBuilder) IOException(java.io.IOException) DwrfStripeCacheData(com.facebook.presto.orc.metadata.DwrfStripeCacheData) Stream(com.facebook.presto.orc.metadata.Stream) Consumer(java.util.function.Consumer) 
EncryptionGroup(com.facebook.presto.orc.metadata.EncryptionGroup) Collectors.toList(java.util.stream.Collectors.toList) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) CompressionBufferPool(com.facebook.presto.orc.writer.CompressionBufferPool) Closeable(java.io.Closeable) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) VisibleForTesting(com.google.common.annotations.VisibleForTesting) DataOutput(com.facebook.presto.common.io.DataOutput) DwrfEncryptionInfo.createNodeToGroupMap(com.facebook.presto.orc.DwrfEncryptionInfo.createNodeToGroupMap) DataOutput.createDataOutput(com.facebook.presto.common.io.DataOutput.createDataOutput) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) DataOutput(com.facebook.presto.common.io.DataOutput) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) IntStream(java.util.stream.IntStream) Stream(com.facebook.presto.orc.metadata.Stream) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) StripeStatistics(com.facebook.presto.orc.metadata.statistics.StripeStatistics) ColumnWriter(com.facebook.presto.orc.writer.ColumnWriter) DictionaryColumnWriter(com.facebook.presto.orc.writer.DictionaryColumnWriter) ColumnWriters.createColumnWriter(com.facebook.presto.orc.writer.ColumnWriters.createColumnWriter) ColumnEncoding(com.facebook.presto.orc.metadata.ColumnEncoding) StripeFooter(com.facebook.presto.orc.metadata.StripeFooter) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) Slice(io.airlift.slice.Slice) StripeInformation(com.facebook.presto.orc.metadata.StripeInformation)

Example 17 with StreamDataOutput

use of com.facebook.presto.orc.stream.StreamDataOutput in project presto by prestodb.

From the class TestStreamLayout, method testByColumnSize:

@Test
public void testByColumnSize() {
    // The input models three columns (total size, then the size of each stream):
    //   column 1 (1010):  DATA(1000), PRESENT(10)
    //   column 2 (1010):  DICTIONARY_DATA(300), PRESENT(10), DATA(600), LENGTH(100)
    //   column 3 (> 2GB): DATA and PRESENT, each Integer.MAX_VALUE long
    final int oversized = Integer.MAX_VALUE;

    List<StreamDataOutput> layoutInput = new ArrayList<>();
    layoutInput.add(createStream(1, StreamKind.DATA, 1_000));
    layoutInput.add(createStream(1, StreamKind.PRESENT, 10));
    layoutInput.add(createStream(2, StreamKind.DICTIONARY_DATA, 300));
    layoutInput.add(createStream(2, StreamKind.PRESENT, 10));
    layoutInput.add(createStream(2, StreamKind.DATA, 600));
    layoutInput.add(createStream(2, StreamKind.LENGTH, 100));
    layoutInput.add(createStream(3, StreamKind.DATA, oversized));
    layoutInput.add(createStream(3, StreamKind.PRESENT, oversized));

    // randomize the starting order so the result cannot depend on insertion order
    Collections.shuffle(layoutInput);

    ByColumnSize layout = new ByColumnSize();
    layout.reorder(layoutInput);

    // asserted order: column 1, column 2, column 3; within columns 1 and 2 the streams
    // appear by ascending size, and in column 3 PRESENT precedes the equally sized DATA
    Iterator<StreamDataOutput> ordered = layoutInput.iterator();
    verifyStream(ordered.next().getStream(), 1, StreamKind.PRESENT, 10);
    verifyStream(ordered.next().getStream(), 1, StreamKind.DATA, 1000);

    verifyStream(ordered.next().getStream(), 2, StreamKind.PRESENT, 10);
    verifyStream(ordered.next().getStream(), 2, StreamKind.LENGTH, 100);
    verifyStream(ordered.next().getStream(), 2, StreamKind.DICTIONARY_DATA, 300);
    verifyStream(ordered.next().getStream(), 2, StreamKind.DATA, 600);

    verifyStream(ordered.next().getStream(), 3, StreamKind.PRESENT, oversized);
    verifyStream(ordered.next().getStream(), 3, StreamKind.DATA, oversized);

    // all eight streams must have been consumed
    assertFalse(ordered.hasNext());
}
Also used : ArrayList(java.util.ArrayList) ByColumnSize(com.facebook.presto.orc.writer.StreamLayout.ByColumnSize) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput) Test(org.testng.annotations.Test)

Example 18 with StreamDataOutput

use of com.facebook.presto.orc.stream.StreamDataOutput in project presto by prestodb.

From the class TestSliceDictionaryColumnWriter, method getOrcInputStream:

/**
 * Chains three sibling helpers: {@code getStreamKind} selects a stream from
 * {@code streams} for the requested {@code streamKind}, {@code convertStreamToSlice}
 * turns it into a {@code Slice}, and {@code convertSliceToInputStream} wraps that slice.
 *
 * @param streams candidate streams to select from
 * @param streamKind kind of stream to select
 * @return the input stream produced by {@code convertSliceToInputStream}
 * @throws OrcCorruptionException declared by the signature; presumably raised by one of
 *         the helpers -- confirm against their definitions
 */
private OrcInputStream getOrcInputStream(List<StreamDataOutput> streams, StreamKind streamKind) throws OrcCorruptionException {
    return convertSliceToInputStream(convertStreamToSlice(getStreamKind(streams, streamKind)));
}
Also used : Slice(io.airlift.slice.Slice) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) StreamDataOutput(com.facebook.presto.orc.stream.StreamDataOutput)

Aggregations

StreamDataOutput (com.facebook.presto.orc.stream.StreamDataOutput)18 ArrayList (java.util.ArrayList)16 Slice (io.airlift.slice.Slice)14 Stream (com.facebook.presto.orc.metadata.Stream)13 ColumnStatistics (com.facebook.presto.orc.metadata.statistics.ColumnStatistics)13 ImmutableList (com.google.common.collect.ImmutableList)13 List (java.util.List)13 BooleanStreamCheckpoint (com.facebook.presto.orc.checkpoint.BooleanStreamCheckpoint)12 RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex)12 PresentOutputStream (com.facebook.presto.orc.stream.PresentOutputStream)12 LongStreamCheckpoint (com.facebook.presto.orc.checkpoint.LongStreamCheckpoint)7 LongOutputStream (com.facebook.presto.orc.stream.LongOutputStream)7 LongOutputStream.createLengthOutputStream (com.facebook.presto.orc.stream.LongOutputStream.createLengthOutputStream)3 Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice)2 Collectors.toList (java.util.stream.Collectors.toList)2 Page (com.facebook.presto.common.Page)1 BlockBuilder (com.facebook.presto.common.block.BlockBuilder)1 DataOutput (com.facebook.presto.common.io.DataOutput)1 DataOutput.createDataOutput (com.facebook.presto.common.io.DataOutput.createDataOutput)1 DataSink (com.facebook.presto.common.io.DataSink)1