use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class OrcWriter method bufferStripeData.
/**
 * Collects the outputs that make up the current stripe. The returned entries are
 * not the stripe bytes themselves, but objects that know how to write them:
 * index streams first, then data streams (in the order chosen by
 * {@code streamLayout}), and finally the stripe footer.
 * <p>
 * As a side effect the finished stripe is appended to {@code closedStripes},
 * its statistics are handed to validation, and the flush is reported to {@code stats}.
 *
 * @param stripeStartOffset offset stored in the stripe's {@code StripeInformation} and reported to validation
 * @param flushReason reason forwarded to {@code stats.recordStripeWritten}
 * @return the stripe outputs in write order, or an empty list when the stripe has no rows
 * @throws IOException propagated from the calls made here that declare it
 */
private List<DataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    if (stripeRowCount == 0) {
        return ImmutableList.of();
    }

    List<DataOutput> stripeOutputs = new ArrayList<>();
    List<Stream> plainStreams = new ArrayList<>(columnWriters.size() * 3);
    Multimap<Integer, Stream> streamsByEncryptionGroup = ArrayListMultimap.create();

    // Index streams.
    // Ordering is critical: a stream records only its length, not its offset. Whenever the
    // previous stream belonged to a different encryption group (or -1 for "unencrypted"),
    // the stream is given an explicit offset so the column order can be recovered.
    long indexLength = 0;
    long runningOffset = 0;
    int lastGroup = -1;
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput indexOutput : writer.getIndexStreams()) {
            stripeOutputs.add(indexOutput);
            Optional<Integer> group = dwrfEncryptionInfo.getGroupByNodeId(indexOutput.getStream().getColumn());
            if (group.isPresent()) {
                int groupId = group.get();
                Stream stream;
                if (lastGroup == groupId) {
                    stream = indexOutput.getStream();
                } else {
                    stream = indexOutput.getStream().withOffset(runningOffset);
                }
                streamsByEncryptionGroup.put(groupId, stream);
                lastGroup = groupId;
            } else {
                Stream stream;
                if (lastGroup == -1) {
                    stream = indexOutput.getStream();
                } else {
                    stream = indexOutput.getStream().withOffset(runningOffset);
                }
                plainStreams.add(stream);
                lastGroup = -1;
            }
            runningOffset += indexOutput.size();
            indexLength += indexOutput.size();
        }
    }

    // At this point stripeOutputs holds only index streams; hand a snapshot to the stripe cache.
    if (dwrfStripeCacheWriter.isPresent()) {
        dwrfStripeCacheWriter.get().addIndexStreams(ImmutableList.copyOf(stripeOutputs), indexLength);
    }

    // Gather the data streams of every column and total up their sizes.
    long dataLength = 0;
    List<StreamDataOutput> orderedDataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter writer : columnWriters) {
        List<StreamDataOutput> writerStreams = writer.getDataStreams();
        orderedDataStreams.addAll(writerStreams);
        for (StreamDataOutput writerStream : writerStreams) {
            dataLength += writerStream.size();
        }
    }
    // The stream layout decides the physical order of the data streams.
    streamLayout.reorder(orderedDataStreams);

    // Data streams: same offset bookkeeping as for the index streams above,
    // continuing from the offset and encryption group the index streams ended on.
    for (StreamDataOutput dataOutput : orderedDataStreams) {
        stripeOutputs.add(dataOutput);
        Optional<Integer> group = dwrfEncryptionInfo.getGroupByNodeId(dataOutput.getStream().getColumn());
        if (group.isPresent()) {
            int groupId = group.get();
            Stream stream;
            if (lastGroup == groupId) {
                stream = dataOutput.getStream();
            } else {
                stream = dataOutput.getStream().withOffset(runningOffset);
            }
            streamsByEncryptionGroup.put(groupId, stream);
            lastGroup = groupId;
        } else {
            Stream stream;
            if (lastGroup == -1) {
                stream = dataOutput.getStream();
            } else {
                stream = dataOutput.getStream().withOffset(runningOffset);
            }
            plainStreams.add(stream);
            lastGroup = -1;
        }
        runningOffset += dataOutput.size();
    }

    // Merge the per-column encodings and stripe statistics of all writers.
    Map<Integer, ColumnEncoding> allEncodings = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        allEncodings.putAll(writer.getColumnEncodings());
    }
    Map<Integer, ColumnStatistics> allStatistics = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        allStatistics.putAll(writer.getColumnStripeStatistics());
    }

    // the 0th column is a struct column for the whole row
    allEncodings.put(0, new ColumnEncoding(DIRECT, 0));
    allStatistics.put(0, new ColumnStatistics((long) stripeRowCount, null));

    // Split the encodings by whether their node belongs to an encryption group.
    Map<Integer, ColumnEncoding> unencryptedColumnEncodings = allEncodings.entrySet().stream()
            .filter(entry -> !dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent())
            .collect(toImmutableMap(Entry::getKey, Entry::getValue));
    Map<Integer, ColumnEncoding> encryptedColumnEncodings = allEncodings.entrySet().stream()
            .filter(entry -> dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent())
            .collect(toImmutableMap(Entry::getKey, Entry::getValue));
    List<Slice> encryptedGroups = createEncryptedGroups(streamsByEncryptionGroup, encryptedColumnEncodings);

    // Serialize the footer; it is the last output of the stripe and is also offered to the stripe cache.
    StripeFooter stripeFooter = new StripeFooter(plainStreams, unencryptedColumnEncodings, encryptedGroups);
    Slice footerSlice = metadataWriter.writeStripeFooter(stripeFooter);
    stripeOutputs.add(createDataOutput(footerSlice));
    dwrfStripeCacheWriter.ifPresent(cacheWriter -> cacheWriter.addStripeFooter(createDataOutput(footerSlice)));

    // create final stripe statistics
    StripeStatistics stripeStatistics = new StripeStatistics(toDenseList(allStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, stripeStatistics));

    StripeInformation stripeInfo = new StripeInformation(
            stripeRowCount,
            stripeStartOffset,
            indexLength,
            dataLength,
            footerSlice.length(),
            OptionalLong.of(stripeRawSize),
            dwrfEncryptionInfo.getEncryptedKeyMetadatas());
    ClosedStripe finishedStripe = new ClosedStripe(stripeInfo, stripeStatistics);
    closedStripes.add(finishedStripe);
    closedStripesRetainedBytes += finishedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInfo.getNumberOfRows()));
    stats.recordStripeWritten(
            flushPolicy.getStripeMinBytes(),
            flushPolicy.getStripeMaxBytes(),
            dictionaryMaxMemoryBytes,
            flushReason,
            dictionaryCompressionOptimizer.getDictionaryMemoryBytes(),
            stripeInfo);
    return stripeOutputs;
}
use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class StripeReader method addIncludedStreams.
/**
 * Copies every stream whose column is in {@code includedOrcColumns} into
 * {@code includedStreams}, keyed by its {@code StreamId}.
 *
 * @param columnEncodings encodings looked up by column for each included IN_DICTIONARY stream
 * @param streams candidate streams to inspect
 * @param includedStreams output map that receives the included streams
 * @return {@code true} if any included IN_DICTIONARY stream belongs to a column whose own
 *         encoding, or one of its additional sequence encodings, is DICTIONARY
 */
private boolean addIncludedStreams(Map<Integer, ColumnEncoding> columnEncodings, List<Stream> streams, Map<StreamId, Stream> includedStreams) {
    boolean foundRowGroupDictionary = false;
    for (Stream candidate : streams) {
        if (!includedOrcColumns.contains(candidate.getColumn())) {
            continue;
        }
        includedStreams.put(new StreamId(candidate), candidate);

        // Only IN_DICTIONARY streams can indicate a row group dictionary.
        if (candidate.getStreamKind() != StreamKind.IN_DICTIONARY) {
            continue;
        }

        ColumnEncoding encoding = columnEncodings.get(candidate.getColumn());
        if (encoding.getColumnEncodingKind() == DICTIONARY) {
            foundRowGroupDictionary = true;
        }

        // The per-sequence value encodings may be dictionary encoded as well.
        Optional<SortedMap<Integer, DwrfSequenceEncoding>> sequenceEncodings = encoding.getAdditionalSequenceEncodings();
        if (sequenceEncodings.isPresent()) {
            for (DwrfSequenceEncoding sequenceEncoding : sequenceEncodings.get().values()) {
                if (sequenceEncoding.getValueEncoding().getColumnEncodingKind() == DICTIONARY) {
                    foundRowGroupDictionary = true;
                    break;
                }
            }
        }
    }
    return foundRowGroupDictionary;
}
use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class MapFlatSelectiveStreamReader method startStripe.
/**
 * Resets the per-stripe state of this reader and creates one value stream reader
 * for every required key among the stripe's additional sequence encodings.
 *
 * @param stripe the stripe about to be read; also passed to each value stream reader
 * @throws IOException propagated from the calls made here that declare it
 */
@Override
public void startStripe(Stripe stripe) throws IOException {
    presentStreamSource = missingStreamSource(BooleanInputStream.class);
    inMapStreamSources.clear();
    valueStreamDescriptors.clear();
    valueStreamReaders.clear();

    ColumnEncoding baseEncoding = stripe.getColumnEncodings().get(baseValueStreamDescriptor.getStreamId());
    // encoding or encoding.getAdditionalSequenceEncodings() may not be present when every map is empty or null
    SortedMap<Integer, DwrfSequenceEncoding> sequenceEncodings;
    if (baseEncoding == null || !baseEncoding.getAdditionalSequenceEncodings().isPresent()) {
        sequenceEncodings = Collections.emptySortedMap();
    } else {
        sequenceEncodings = baseEncoding.getAdditionalSequenceEncodings().get();
    }

    keyIndices = ensureCapacity(keyIndices, sequenceEncodings.size());
    keyCount = 0;

    // The ColumnEncoding with sequence ID 0 doesn't have any data associated with it
    int nextPosition = 0;
    for (Map.Entry<Integer, DwrfSequenceEncoding> sequenceEntry : sequenceEncodings.entrySet()) {
        // Position of this entry among all sequence encodings, counted whether or not the key is required.
        int position = nextPosition;
        nextPosition++;

        if (isRequiredKey(sequenceEntry.getValue())) {
            keyIndices[keyCount] = position;
            keyCount++;

            int sequenceId = sequenceEntry.getKey();
            inMapStreamSources.add(missingStreamSource(BooleanInputStream.class));

            StreamDescriptor sequenceDescriptor = copyStreamDescriptorWithSequence(baseValueStreamDescriptor, sequenceId);
            valueStreamDescriptors.add(sequenceDescriptor);

            SelectiveStreamReader sequenceReader = SelectiveStreamReaders.createStreamReader(
                    sequenceDescriptor,
                    ImmutableBiMap.of(),
                    Optional.ofNullable(outputType).map(MapType::getValueType),
                    ImmutableList.of(),
                    hiveStorageTimeZone,
                    options,
                    legacyMapSubscript,
                    systemMemoryContext.newOrcAggregatedMemoryContext());
            sequenceReader.startStripe(stripe);
            valueStreamReaders.add(sequenceReader);
        }
    }

    // The keys block is built from all sequence encodings, not only the required ones.
    keyBlock = getKeysBlock(ImmutableList.copyOf(sequenceEncodings.values()));
    readOffset = 0;
    presentStream = null;
    rowGroupOpen = false;
}
Aggregations