use of com.facebook.presto.orc.metadata.statistics.StripeStatistics in project presto by prestodb.
the class OrcWriter method bufferFileFooter.
/**
* Collect the data for for the file footer. This is not the actual data, but
* instead are functions that know how to write the data.
*/
private List<DataOutput> bufferFileFooter() throws IOException {
List<DataOutput> outputData = new ArrayList<>();
Metadata metadata = new Metadata(closedStripes.stream().map(ClosedStripe::getStatistics).collect(toList()));
Slice metadataSlice = metadataWriter.writeMetadata(metadata);
outputData.add(createDataOutput(metadataSlice));
numberOfRows = closedStripes.stream().mapToLong(stripe -> stripe.getStripeInformation().getNumberOfRows()).sum();
List<ColumnStatistics> fileStats = toFileStats(closedStripes.stream().map(ClosedStripe::getStatistics).map(StripeStatistics::getColumnStatistics).collect(toList()));
recordValidation(validation -> validation.setFileStatistics(fileStats));
Map<String, Slice> userMetadata = this.userMetadata.entrySet().stream().collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));
unencryptedStats = new ArrayList<>();
Map<Integer, Map<Integer, Slice>> encryptedStats = new HashMap<>();
addStatsRecursive(fileStats, 0, new HashMap<>(), unencryptedStats, encryptedStats);
Optional<DwrfEncryption> dwrfEncryption;
if (dwrfWriterEncryption.isPresent()) {
ImmutableList.Builder<EncryptionGroup> encryptionGroupBuilder = ImmutableList.builder();
List<WriterEncryptionGroup> writerEncryptionGroups = dwrfWriterEncryption.get().getWriterEncryptionGroups();
for (int i = 0; i < writerEncryptionGroups.size(); i++) {
WriterEncryptionGroup group = writerEncryptionGroups.get(i);
Map<Integer, Slice> groupStats = encryptedStats.get(i);
encryptionGroupBuilder.add(new EncryptionGroup(group.getNodes(), // reader will just use key metadata from the stripe
Optional.empty(), group.getNodes().stream().map(groupStats::get).collect(toList())));
}
dwrfEncryption = Optional.of(new DwrfEncryption(dwrfWriterEncryption.get().getKeyProvider(), encryptionGroupBuilder.build()));
} else {
dwrfEncryption = Optional.empty();
}
Optional<DwrfStripeCacheData> dwrfStripeCacheData = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getDwrfStripeCacheData);
Slice dwrfStripeCacheSlice = metadataWriter.writeDwrfStripeCache(dwrfStripeCacheData);
outputData.add(createDataOutput(dwrfStripeCacheSlice));
Optional<List<Integer>> dwrfStripeCacheOffsets = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getOffsets);
Footer footer = new Footer(numberOfRows, rowGroupMaxRowCount, OptionalLong.of(rawSize), closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toList()), orcTypes, ImmutableList.copyOf(unencryptedStats), userMetadata, dwrfEncryption, dwrfStripeCacheOffsets);
closedStripes.clear();
closedStripesRetainedBytes = 0;
Slice footerSlice = metadataWriter.writeFooter(footer);
outputData.add(createDataOutput(footerSlice));
recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));
Slice postscriptSlice = metadataWriter.writePostscript(footerSlice.length(), metadataSlice.length(), columnWriterOptions.getCompressionKind(), columnWriterOptions.getCompressionMaxBufferSize(), dwrfStripeCacheData);
outputData.add(createDataOutput(postscriptSlice));
outputData.add(createDataOutput(Slices.wrappedBuffer((byte) postscriptSlice.length())));
return outputData;
}
use of com.facebook.presto.orc.metadata.statistics.StripeStatistics in project presto by prestodb.
the class OrcWriteValidation method validateStripeStatistics.
public void validateStripeStatistics(OrcDataSourceId orcDataSourceId, List<StripeInformation> actualStripes, List<StripeStatistics> actualStripeStatistics) throws OrcCorruptionException {
requireNonNull(actualStripes, "actualStripes is null");
requireNonNull(actualStripeStatistics, "actualStripeStatistics is null");
if (actualStripeStatistics.isEmpty()) {
// DWRF does not have stripe statistics
return;
}
if (actualStripeStatistics.size() != stripeStatistics.size()) {
throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of columns in stripe statistics");
}
for (int stripeIndex = 0; stripeIndex < actualStripes.size(); stripeIndex++) {
long stripeOffset = actualStripes.get(stripeIndex).getOffset();
StripeStatistics actual = actualStripeStatistics.get(stripeIndex);
validateStripeStatistics(orcDataSourceId, stripeOffset, actual.getColumnStatistics());
}
}
use of com.facebook.presto.orc.metadata.statistics.StripeStatistics in project presto by prestodb.
the class OrcWriteValidation method validateStripeStatistics.
public void validateStripeStatistics(OrcDataSourceId orcDataSourceId, long stripeOffset, List<ColumnStatistics> actual) throws OrcCorruptionException {
StripeStatistics expected = stripeStatistics.get(stripeOffset);
if (expected == null) {
throw new OrcCorruptionException(orcDataSourceId, "Unexpected stripe at offset %s", stripeOffset);
}
validateColumnStatisticsEquivalent(orcDataSourceId, "Stripe at " + stripeOffset, actual, expected.getColumnStatistics());
}
use of com.facebook.presto.orc.metadata.statistics.StripeStatistics in project presto by prestodb.
the class OrcWriter method bufferStripeData.
/**
* Collect the data for for the stripe. This is not the actual data, but
* instead are functions that know how to write the data.
*/
private List<DataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
if (stripeRowCount == 0) {
return ImmutableList.of();
}
List<DataOutput> outputData = new ArrayList<>();
List<Stream> unencryptedStreams = new ArrayList<>(columnWriters.size() * 3);
Multimap<Integer, Stream> encryptedStreams = ArrayListMultimap.create();
// get index streams
long indexLength = 0;
long offset = 0;
int previousEncryptionGroup = -1;
for (ColumnWriter columnWriter : columnWriters) {
for (StreamDataOutput indexStream : columnWriter.getIndexStreams()) {
// The ordering is critical because the stream only contain a length with no offset.
// if the previous stream was part of a different encryption group, need to specify an offset so we know the column order
outputData.add(indexStream);
Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(indexStream.getStream().getColumn());
if (encryptionGroup.isPresent()) {
Stream stream = previousEncryptionGroup == encryptionGroup.get() ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
encryptedStreams.put(encryptionGroup.get(), stream);
previousEncryptionGroup = encryptionGroup.get();
} else {
Stream stream = previousEncryptionGroup == -1 ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
unencryptedStreams.add(stream);
previousEncryptionGroup = -1;
}
offset += indexStream.size();
indexLength += indexStream.size();
}
}
if (dwrfStripeCacheWriter.isPresent()) {
dwrfStripeCacheWriter.get().addIndexStreams(ImmutableList.copyOf(outputData), indexLength);
}
// data streams (sorted by size)
long dataLength = 0;
List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
for (ColumnWriter columnWriter : columnWriters) {
List<StreamDataOutput> streams = columnWriter.getDataStreams();
dataStreams.addAll(streams);
dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
}
streamLayout.reorder(dataStreams);
// add data streams
for (StreamDataOutput dataStream : dataStreams) {
// The ordering is critical because the stream only contains a length with no offset.
// if the previous stream was part of a different encryption group, need to specify an offset so we know the column order
outputData.add(dataStream);
Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(dataStream.getStream().getColumn());
if (encryptionGroup.isPresent()) {
Stream stream = previousEncryptionGroup == encryptionGroup.get() ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
encryptedStreams.put(encryptionGroup.get(), stream);
previousEncryptionGroup = encryptionGroup.get();
} else {
Stream stream = previousEncryptionGroup == -1 ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
unencryptedStreams.add(stream);
previousEncryptionGroup = -1;
}
offset += dataStream.size();
}
Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
Map<Integer, ColumnStatistics> columnStatistics = new HashMap<>();
columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
// the 0th column is a struct column for the whole row
columnEncodings.put(0, new ColumnEncoding(DIRECT, 0));
columnStatistics.put(0, new ColumnStatistics((long) stripeRowCount, null));
Map<Integer, ColumnEncoding> unencryptedColumnEncodings = columnEncodings.entrySet().stream().filter(entry -> !dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent()).collect(toImmutableMap(Entry::getKey, Entry::getValue));
Map<Integer, ColumnEncoding> encryptedColumnEncodings = columnEncodings.entrySet().stream().filter(entry -> dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent()).collect(toImmutableMap(Entry::getKey, Entry::getValue));
List<Slice> encryptedGroups = createEncryptedGroups(encryptedStreams, encryptedColumnEncodings);
StripeFooter stripeFooter = new StripeFooter(unencryptedStreams, unencryptedColumnEncodings, encryptedGroups);
Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
outputData.add(createDataOutput(footer));
dwrfStripeCacheWriter.ifPresent(stripeCacheWriter -> stripeCacheWriter.addStripeFooter(createDataOutput(footer)));
// create final stripe statistics
StripeStatistics statistics = new StripeStatistics(toDenseList(columnStatistics, orcTypes.size()));
recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length(), OptionalLong.of(stripeRawSize), dwrfEncryptionInfo.getEncryptedKeyMetadatas());
ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
closedStripes.add(closedStripe);
closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
stats.recordStripeWritten(flushPolicy.getStripeMinBytes(), flushPolicy.getStripeMaxBytes(), dictionaryMaxMemoryBytes, flushReason, dictionaryCompressionOptimizer.getDictionaryMemoryBytes(), stripeInformation);
return outputData;
}
Aggregations