Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
The class OrcWriteValidation, method validateColumnStatisticsEquivalent:
private static void validateColumnStatisticsEquivalent(OrcDataSourceId orcDataSourceId, String name, List<ColumnStatistics> actualColumnStatistics, List<ColumnStatistics> expectedColumnStatistics)
        throws OrcCorruptionException
{
    requireNonNull(name, "name is null");
    requireNonNull(actualColumnStatistics, "actualColumnStatistics is null");
    requireNonNull(expectedColumnStatistics, "expectedColumnStatistics is null");
    if (actualColumnStatistics.size() != expectedColumnStatistics.size()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of columns in %s statistics", name);
    }
    for (int i = 0; i < actualColumnStatistics.size(); i++) {
        ColumnStatistics actual = actualColumnStatistics.get(i);
        ColumnStatistics expected = expectedColumnStatistics.get(i);
        validateColumnStatisticsEquivalent(orcDataSourceId, name + " column " + i, actual, expected);
    }
}
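The fail-fast pattern here (check the sizes first, then compare pairwise) can be exercised in isolation. The sketch below is a self-contained illustration of that pattern, not Presto's API: the ValidationException class and the plain Long values are hypothetical stand-ins for OrcCorruptionException and ColumnStatistics.

import java.util.List;

public class PairwiseValidationSketch
{
    // hypothetical stand-in for OrcCorruptionException
    static class ValidationException
            extends RuntimeException
    {
        ValidationException(String message)
        {
            super(message);
        }
    }

    static void validateEquivalent(String name, List<Long> actual, List<Long> expected)
    {
        // fail fast on a size mismatch before comparing any elements
        if (actual.size() != expected.size()) {
            throw new ValidationException("unexpected number of columns in " + name + " statistics");
        }
        for (int i = 0; i < actual.size(); i++) {
            if (!actual.get(i).equals(expected.get(i))) {
                throw new ValidationException(name + " column " + i + " does not match");
            }
        }
    }

    public static void main(String[] args)
    {
        validateEquivalent("file", List.of(1L, 2L), List.of(1L, 2L)); // passes
        validateEquivalent("file", List.of(1L), List.of(1L, 2L)); // throws
    }
}

Checking the sizes first means a corrupted file is reported as a structural mismatch rather than surfacing as an out-of-bounds error partway through the comparison.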
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
The class OrcWriter, method addStatsRecursive:
private void addStatsRecursive(List<ColumnStatistics> allStats, int index, Map<Integer, List<ColumnStatistics>> nodeAndSubNodeStats, List<ColumnStatistics> unencryptedStats, Map<Integer, Map<Integer, Slice>> encryptedStats)
        throws IOException
{
    if (allStats.isEmpty()) {
        return;
    }
    ColumnStatistics columnStatistics = allStats.get(index);
    if (dwrfEncryptionInfo.getGroupByNodeId(index).isPresent()) {
        int group = dwrfEncryptionInfo.getGroupByNodeId(index).get();
        boolean isRootNode = dwrfWriterEncryption.get().getWriterEncryptionGroups().get(group).getNodes().contains(index);
        verify((isRootNode && nodeAndSubNodeStats.isEmpty()) || (nodeAndSubNodeStats.size() == 1 && nodeAndSubNodeStats.get(group) != null), "nodeAndSubNodeStats should only be present for subnodes of a group");
        nodeAndSubNodeStats.computeIfAbsent(group, x -> new ArrayList<>()).add(columnStatistics);
        // the unencrypted footer only sees a redacted version: the value count with no min/max/sum details
        unencryptedStats.add(new ColumnStatistics(columnStatistics.getNumberOfValues(), null));
        for (Integer fieldIndex : orcTypes.get(index).getFieldTypeIndexes()) {
            addStatsRecursive(allStats, fieldIndex, nodeAndSubNodeStats, unencryptedStats, encryptedStats);
        }
        if (isRootNode) {
            // the group's root node serializes and encrypts the stats gathered for itself and all of its sub-nodes
            Slice encryptedFileStatistics = toEncryptedFileStatistics(nodeAndSubNodeStats.get(group), group);
            encryptedStats.computeIfAbsent(group, x -> new HashMap<>()).put(index, encryptedFileStatistics);
        }
    }
    else {
        unencryptedStats.add(columnStatistics);
        for (Integer fieldIndex : orcTypes.get(index).getFieldTypeIndexes()) {
            addStatsRecursive(allStats, fieldIndex, new HashMap<>(), unencryptedStats, encryptedStats);
        }
    }
}
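The traversal relies on ORC's flat type encoding: orcTypes is a flat list in which each node stores the indexes of its children, and the recursion visits them in pre-order so statistics line up with node ids. A minimal, runnable sketch of that walk, using a hypothetical Node record in place of OrcType:

import java.util.List;

public class FlatTreeWalkSketch
{
    // hypothetical stand-in for OrcType: a node knows only the indexes of its children
    record Node(String name, List<Integer> fieldTypeIndexes) {}

    static void walk(List<Node> nodes, int index, int depth)
    {
        Node node = nodes.get(index);
        System.out.println("  ".repeat(depth) + index + ": " + node.name());
        // recurse into sub-nodes exactly as addStatsRecursive does
        for (int fieldIndex : node.fieldTypeIndexes()) {
            walk(nodes, fieldIndex, depth + 1);
        }
    }

    public static void main(String[] args)
    {
        // index 0 is the root struct for the whole row, mirroring ORC's layout
        List<Node> nodes = List.of(
                new Node("row", List.of(1, 2)),
                new Node("id", List.of()),
                new Node("address", List.of(3)),
                new Node("zip", List.of()));
        walk(nodes, 0, 0);
    }
}

Because the walk is pre-order, a group's root node is entered before any of its sub-nodes, which is what lets addStatsRecursive collect the whole group into nodeAndSubNodeStats and encrypt it once on the way back out.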
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
The class OrcWriter, method bufferStripeData:
/**
 * Collect the data for the stripe. This is not the actual data, but
 * rather functions that know how to write the data.
 */
private List<DataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason)
        throws IOException
{
    if (stripeRowCount == 0) {
        return ImmutableList.of();
    }
    List<DataOutput> outputData = new ArrayList<>();
    List<Stream> unencryptedStreams = new ArrayList<>(columnWriters.size() * 3);
    Multimap<Integer, Stream> encryptedStreams = ArrayListMultimap.create();

    // get index streams
    long indexLength = 0;
    long offset = 0;
    int previousEncryptionGroup = -1;
    for (ColumnWriter columnWriter : columnWriters) {
        for (StreamDataOutput indexStream : columnWriter.getIndexStreams(Optional.empty())) {
            // The ordering is critical because the stream only contains a length with no offset.
            // If the previous stream was part of a different encryption group, we need to specify an offset so we know the column order.
            outputData.add(indexStream);
            Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(indexStream.getStream().getColumn());
            if (encryptionGroup.isPresent()) {
                Stream stream = previousEncryptionGroup == encryptionGroup.get() ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
                encryptedStreams.put(encryptionGroup.get(), stream);
                previousEncryptionGroup = encryptionGroup.get();
            }
            else {
                Stream stream = previousEncryptionGroup == -1 ? indexStream.getStream() : indexStream.getStream().withOffset(offset);
                unencryptedStreams.add(stream);
                previousEncryptionGroup = -1;
            }
            offset += indexStream.size();
            indexLength += indexStream.size();
        }
    }
    if (dwrfStripeCacheWriter.isPresent()) {
        dwrfStripeCacheWriter.get().addIndexStreams(ImmutableList.copyOf(outputData), indexLength);
    }

    // data streams (sorted by size)
    long dataLength = 0;
    List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter columnWriter : columnWriters) {
        List<StreamDataOutput> streams = columnWriter.getDataStreams();
        dataStreams.addAll(streams);
        dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
    }
    streamLayout.reorder(dataStreams);

    // add data streams
    for (StreamDataOutput dataStream : dataStreams) {
        // The ordering is critical because the stream only contains a length with no offset.
        // If the previous stream was part of a different encryption group, we need to specify an offset so we know the column order.
        outputData.add(dataStream);
        Optional<Integer> encryptionGroup = dwrfEncryptionInfo.getGroupByNodeId(dataStream.getStream().getColumn());
        if (encryptionGroup.isPresent()) {
            Stream stream = previousEncryptionGroup == encryptionGroup.get() ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
            encryptedStreams.put(encryptionGroup.get(), stream);
            previousEncryptionGroup = encryptionGroup.get();
        }
        else {
            Stream stream = previousEncryptionGroup == -1 ? dataStream.getStream() : dataStream.getStream().withOffset(offset);
            unencryptedStreams.add(stream);
            previousEncryptionGroup = -1;
        }
        offset += dataStream.size();
    }

    Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
    Map<Integer, ColumnStatistics> columnStatistics = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));

    // the 0th column is a struct column for the whole row
    columnEncodings.put(0, new ColumnEncoding(DIRECT, 0));
    columnStatistics.put(0, new ColumnStatistics((long) stripeRowCount, null));

    Map<Integer, ColumnEncoding> unencryptedColumnEncodings = columnEncodings.entrySet().stream()
            .filter(entry -> !dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent())
            .collect(toImmutableMap(Entry::getKey, Entry::getValue));
    Map<Integer, ColumnEncoding> encryptedColumnEncodings = columnEncodings.entrySet().stream()
            .filter(entry -> dwrfEncryptionInfo.getGroupByNodeId(entry.getKey()).isPresent())
            .collect(toImmutableMap(Entry::getKey, Entry::getValue));
    List<Slice> encryptedGroups = createEncryptedGroups(encryptedStreams, encryptedColumnEncodings);

    StripeFooter stripeFooter = new StripeFooter(unencryptedStreams, unencryptedColumnEncodings, encryptedGroups);
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    outputData.add(createDataOutput(footer));
    dwrfStripeCacheWriter.ifPresent(stripeCacheWriter -> stripeCacheWriter.addStripeFooter(createDataOutput(footer)));

    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toDenseList(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));

    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length(), OptionalLong.of(stripeRawSize), dwrfEncryptionInfo.getEncryptedKeyMetadatas());
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushPolicy.getStripeMinBytes(), flushPolicy.getStripeMaxBytes(), dictionaryMaxMemoryBytes, flushReason, dictionaryCompressionOptimizer.getDictionaryMemoryBytes(), stripeInformation);
    return outputData;
}
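Most of the subtlety above is in the offset bookkeeping: a Stream carries only its length, so readers recover positions by running addition, and an explicit withOffset is needed only when consecutive streams belong to different encryption groups (with -1 standing for "unencrypted"). A condensed, self-contained sketch of just that rule, using a hypothetical StreamInfo record rather than the real Stream types:

import java.util.List;
import java.util.Optional;

public class StreamOffsetSketch
{
    // hypothetical stand-in for the real stream metadata: a size and an optional encryption group
    record StreamInfo(int size, Optional<Integer> encryptionGroup) {}

    public static void main(String[] args)
    {
        List<StreamInfo> streams = List.of(
                new StreamInfo(100, Optional.empty()),
                new StreamInfo(50, Optional.of(1)), // group change: explicit offset required
                new StreamInfo(25, Optional.of(1)), // same group: length-only is enough
                new StreamInfo(75, Optional.empty())); // group change: explicit offset required

        long offset = 0;
        int previousGroup = -1; // -1 marks "unencrypted", as in bufferStripeData
        for (StreamInfo stream : streams) {
            int group = stream.encryptionGroup().orElse(-1);
            boolean needsOffset = group != previousGroup;
            System.out.printf("size=%d group=%d offset=%s%n", stream.size(), group, needsOffset ? offset : "implicit");
            previousGroup = group;
            offset += stream.size();
        }
    }
}

Running it shows which streams need an explicit offset; the real method applies the same comparison twice, once while appending index streams and again while appending the size-reordered data streams, with offset accumulating across both loops.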
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
The class StripeReader, method getRowGroupStatistics:
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == STRUCT);
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    Map<Integer, List<ColumnStatistics>> groupedColumnStatistics = new HashMap<>();
    for (Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) {
        if (!entry.getValue().isEmpty() && entry.getValue().get(rowGroup) != null) {
            groupedColumnStatistics.computeIfAbsent(entry.getKey().getColumn(), key -> new ArrayList<>())
                    .add(entry.getValue().get(rowGroup).getColumnStatistics());
        }
    }

    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        List<ColumnStatistics> columnStatistics = groupedColumnStatistics.get(rootStructType.getFieldTypeIndex(ordinal));
        if (columnStatistics != null) {
            if (columnStatistics.size() == 1) {
                statistics.put(ordinal, getOnlyElement(columnStatistics));
            }
            else {
                // merge statistics from different streams
                // this can happen if a map is represented as a struct (DWRF only)
                statistics.put(ordinal, mergeColumnStatistics(columnStatistics));
            }
        }
    }
    return statistics.build();
}
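The method's shape is group-then-merge: statistics from several physical streams can feed one logical column, so entries are first bucketed by column index and merged only when more than one exists. A reduced sketch of that shape, using plain integer counts as stand-in statistics and a hypothetical sum as the merge step (the real mergeColumnStatistics combines value counts, min/max, and type-specific details):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupAndMergeSketch
{
    public static void main(String[] args)
    {
        // (column, count) pairs; column 2 appears twice, as when several
        // streams back one logical column (e.g. a DWRF map stored as a struct)
        int[][] streamStats = {{1, 10}, {2, 4}, {2, 6}, {3, 10}};

        // bucket by column index, mirroring groupedColumnStatistics
        Map<Integer, List<Integer>> grouped = new HashMap<>();
        for (int[] stat : streamStats) {
            grouped.computeIfAbsent(stat[0], key -> new ArrayList<>()).add(stat[1]);
        }

        // a single entry passes through; multiple entries are merged (summed here)
        Map<Integer, Integer> merged = new HashMap<>();
        for (Map.Entry<Integer, List<Integer>> entry : grouped.entrySet()) {
            merged.put(entry.getKey(), entry.getValue().stream().mapToInt(Integer::intValue).sum());
        }
        System.out.println(merged); // {1=10, 2=10, 3=10}
    }
}

Splitting the single-entry case out, as the real method does with getOnlyElement, avoids allocating a merged statistics object in the common case where each column has exactly one stream.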
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
The class AbstractOrcRecordReader, method close:
@Override
public void close()
        throws IOException
{
    try (Closer closer = Closer.create()) {
        closer.register(orcDataSource);
        for (StreamReader column : streamReaders) {
            if (column != null) {
                closer.register(column::close);
            }
        }
    }
    rowGroups = null;

    if (writeChecksumBuilder.isPresent()) {
        OrcWriteValidation.WriteChecksum actualChecksum = writeChecksumBuilder.get().build();
        validateWrite(validation -> validation.getChecksum().getTotalRowCount() == actualChecksum.getTotalRowCount(), "Invalid row count");
        List<Long> columnHashes = actualChecksum.getColumnHashes();
        for (int i = 0; i < columnHashes.size(); i++) {
            int columnIndex = i; // effectively final copy for capture by the lambda below
            validateWrite(validation -> validation.getChecksum().getColumnHashes().get(columnIndex).equals(columnHashes.get(columnIndex)), "Invalid checksum for column %s", columnIndex);
        }
        validateWrite(validation -> validation.getChecksum().getStripeHash() == actualChecksum.getStripeHash(), "Invalid stripes checksum");
    }
    if (fileStatisticsValidation.isPresent()) {
        List<ColumnStatistics> columnStatistics = fileStatisticsValidation.get().build();
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), columnStatistics);
    }
}
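The validation in close() is a plain expected-versus-actual comparison driven by a predicate helper. A minimal sketch of the per-column loop, with a hypothetical validate helper standing in for validateWrite (which throws OrcCorruptionException in the real reader):

import java.util.List;

public class ChecksumValidationSketch
{
    // stand-in for validateWrite: fail with a formatted message when the condition is false
    static void validate(boolean condition, String messageFormat, Object... args)
    {
        if (!condition) {
            throw new IllegalStateException(String.format(messageFormat, args));
        }
    }

    public static void main(String[] args)
    {
        // hypothetical per-column hashes recorded at write time vs. recomputed at read time
        List<Long> expectedHashes = List.of(0xCAFEL, 0xBABEL, 0xF00DL);
        List<Long> actualHashes = List.of(0xCAFEL, 0xBABEL, 0xF00DL);

        validate(expectedHashes.size() == actualHashes.size(), "Invalid column count");
        for (int i = 0; i < actualHashes.size(); i++) {
            validate(expectedHashes.get(i).equals(actualHashes.get(i)), "Invalid checksum for column %s", i);
        }
        System.out.println("all column checksums match");
    }
}

Running the validation in close() rather than per batch means the reader pays the comparison cost once, after every row has contributed to the recomputed hashes.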