use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
the class OrcWriteValidation method validateRowGroupStatistics.
/**
 * Checks the row group statistics read back for one stripe against the
 * statistics recorded for that stripe offset.
 *
 * <p>Row groups whose validation mode is not {@code HASHED} have their
 * per-column statistics compared individually; row groups whose validation
 * mode is not {@code DETAILED} have their hash compared against a hash built
 * from the actual statistics.
 *
 * @param orcDataSourceId identifier reported in any corruption exception
 * @param stripeOffset offset used to look up the expected statistics
 * @param actualRowGroupStatistics row group indexes per stream; must not be null
 * @throws OrcCorruptionException if no statistics were recorded for the offset,
 *         a stream has a sequence ID greater than zero, a stream has a different
 *         number of row groups than expected, the set of columns differs, or a
 *         row group hash differs (the per-column comparison helper presumably
 *         throws on a mismatch as well)
 */
public void validateRowGroupStatistics(OrcDataSourceId orcDataSourceId, long stripeOffset, Map<StreamId, List<RowGroupIndex>> actualRowGroupStatistics) throws OrcCorruptionException {
    requireNonNull(actualRowGroupStatistics, "actualRowGroupStatistics is null");
    List<RowGroupStatistics> expectedRowGroupStatistics = rowGroupStatistics.get(stripeOffset);
    if (expectedRowGroupStatistics == null) {
        throw new OrcCorruptionException(orcDataSourceId, "Unexpected stripe at offset %s", stripeOffset);
    }

    int rowGroupCount = expectedRowGroupStatistics.size();
    for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
        // TODO: Remove once the Presto writer supports flat map
        if (entry.getKey().getSequence() > 0) {
            throw new OrcCorruptionException(orcDataSourceId, "Unexpected sequence ID for column %s at offset %s", entry.getKey().getColumn(), stripeOffset);
        }
        if (entry.getValue().size() != rowGroupCount) {
            throw new OrcCorruptionException(orcDataSourceId, "Unexpected row group count in stripe at offset %s", stripeOffset);
        }
    }

    // The set of actual columns is the same for every row group, so build it once.
    Set<Integer> actualColumns = actualRowGroupStatistics.keySet().stream()
            .map(StreamId::getColumn)
            .collect(Collectors.toSet());

    for (int rowGroupIndex = 0; rowGroupIndex < rowGroupCount; rowGroupIndex++) {
        RowGroupStatistics expectedRowGroup = expectedRowGroupStatistics.get(rowGroupIndex);

        // Any mode other than HASHED carries per-column statistics to compare.
        if (expectedRowGroup.getValidationMode() != HASHED) {
            Map<Integer, ColumnStatistics> expectedStatistics = expectedRowGroup.getColumnStatistics();
            if (!expectedStatistics.keySet().equals(actualColumns)) {
                throw new OrcCorruptionException(orcDataSourceId, "Unexpected column in row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
            for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
                ColumnStatistics actual = entry.getValue().get(rowGroupIndex).getColumnStatistics();
                ColumnStatistics expected = expectedStatistics.get(entry.getKey().getColumn());
                validateColumnStatisticsEquivalent(orcDataSourceId, "Row group " + rowGroupIndex + " in stripe at offset " + stripeOffset, actual, expected);
            }
        }

        // Any mode other than DETAILED carries a hash to compare.
        if (expectedRowGroup.getValidationMode() != DETAILED) {
            RowGroupStatistics actualRowGroup = buildActualRowGroupStatistics(rowGroupIndex, actualRowGroupStatistics);
            if (expectedRowGroup.getHash() != actualRowGroup.getHash()) {
                throw new OrcCorruptionException(orcDataSourceId, "Checksum mismatch for row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
        }
    }
}
use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
the class OrcWriter method bufferFileFooter.
/**
 * Collects the outputs that make up the end of the file. The returned items
 * are not the written bytes themselves but {@link DataOutput} objects that
 * know how to write them, in this order: metadata, DWRF stripe cache, footer,
 * postscript, and a final single byte holding the postscript length.
 *
 * <p>As a side effect this updates {@code numberOfRows} and
 * {@code unencryptedStats}, records the file statistics and version for
 * validation, and clears {@code closedStripes}.
 */
private List<DataOutput> bufferFileFooter() throws IOException {
    List<DataOutput> fileTail = new ArrayList<>();

    // The per-stripe statistics feed both the metadata section and the file-level statistics.
    List<StripeStatistics> stripeStatistics = closedStripes.stream()
            .map(ClosedStripe::getStatistics)
            .collect(toList());
    Slice metadataSlice = metadataWriter.writeMetadata(new Metadata(stripeStatistics));
    fileTail.add(createDataOutput(metadataSlice));

    numberOfRows = closedStripes.stream()
            .map(ClosedStripe::getStripeInformation)
            .mapToLong(StripeInformation::getNumberOfRows)
            .sum();

    List<ColumnStatistics> fileStats = toFileStats(stripeStatistics.stream()
            .map(StripeStatistics::getColumnStatistics)
            .collect(toList()));
    recordValidation(validation -> validation.setFileStatistics(fileStats));

    Map<String, Slice> userMetadataSlices = this.userMetadata.entrySet().stream()
            .collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));

    // Split the file statistics into the unencrypted list and the per-group encrypted map.
    unencryptedStats = new ArrayList<>();
    Map<Integer, Map<Integer, Slice>> encryptedStats = new HashMap<>();
    addStatsRecursive(fileStats, 0, new HashMap<>(), unencryptedStats, encryptedStats);

    Optional<DwrfEncryption> dwrfEncryption = Optional.empty();
    if (dwrfWriterEncryption.isPresent()) {
        List<WriterEncryptionGroup> writerGroups = dwrfWriterEncryption.get().getWriterEncryptionGroups();
        ImmutableList.Builder<EncryptionGroup> groups = ImmutableList.builder();
        int groupIndex = 0;
        for (WriterEncryptionGroup writerGroup : writerGroups) {
            Map<Integer, Slice> groupStats = encryptedStats.get(groupIndex);
            groupIndex++;
            groups.add(new EncryptionGroup(
                    writerGroup.getNodes(),
                    // reader will just use key metadata from the stripe
                    Optional.empty(),
                    writerGroup.getNodes().stream().map(groupStats::get).collect(toList())));
        }
        dwrfEncryption = Optional.of(new DwrfEncryption(dwrfWriterEncryption.get().getKeyProvider(), groups.build()));
    }

    Optional<DwrfStripeCacheData> dwrfStripeCacheData = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getDwrfStripeCacheData);
    fileTail.add(createDataOutput(metadataWriter.writeDwrfStripeCache(dwrfStripeCacheData)));
    Optional<List<Integer>> dwrfStripeCacheOffsets = dwrfStripeCacheWriter.map(DwrfStripeCacheWriter::getOffsets);

    // The stripe information must be captured before the closed stripes are cleared below.
    List<StripeInformation> stripeInformation = closedStripes.stream()
            .map(ClosedStripe::getStripeInformation)
            .collect(toList());
    Footer footer = new Footer(
            numberOfRows,
            rowGroupMaxRowCount,
            OptionalLong.of(rawSize),
            stripeInformation,
            orcTypes,
            ImmutableList.copyOf(unencryptedStats),
            userMetadataSlices,
            dwrfEncryption,
            dwrfStripeCacheOffsets);

    closedStripes.clear();
    closedStripesRetainedBytes = 0;

    Slice footerSlice = metadataWriter.writeFooter(footer);
    fileTail.add(createDataOutput(footerSlice));

    recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));

    Slice postscriptSlice = metadataWriter.writePostscript(
            footerSlice.length(),
            metadataSlice.length(),
            columnWriterOptions.getCompressionKind(),
            columnWriterOptions.getCompressionMaxBufferSize(),
            dwrfStripeCacheData);
    fileTail.add(createDataOutput(postscriptSlice));
    // NOTE(review): the cast keeps only the low byte of the postscript length; confirm the length is bounded.
    fileTail.add(createDataOutput(Slices.wrappedBuffer((byte) postscriptSlice.length())));
    return fileTail;
}
use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
the class TupleDomainOrcPredicate method matches.
/**
 * Decides whether a section with the given row count and column statistics
 * may contain rows satisfying the effective predicate.
 *
 * @param numberOfRows number of rows in the section, passed to the overlap check
 * @param statisticsByColumnIndex statistics keyed by the column reference ordinal
 * @return {@code false} when the effective predicate has no domains or some
 *         referenced column's statistics do not overlap its predicate domain;
 *         {@code true} otherwise
 */
@Override
public boolean matches(long numberOfRows, Map<Integer, ColumnStatistics> statisticsByColumnIndex) {
    Optional<Map<C, Domain>> domains = effectivePredicate.getDomains();
    if (!domains.isPresent()) {
        // the effective predicate is none, so nothing in this section can match
        return false;
    }

    Map<C, Domain> domainsByColumn = domains.get();
    for (ColumnReference<C> reference : columnReferences) {
        Domain domain = domainsByColumn.get(reference.getColumn());
        if (domain == null) {
            // unconstrained column: it cannot rule this section out
            continue;
        }

        ColumnStatistics statistics = statisticsByColumnIndex.get(reference.getOrdinal());
        // a column without statistics cannot rule this section out either
        if (statistics != null && !columnOverlaps(reference, domain, numberOfRows, statistics)) {
            return false;
        }
    }

    // no column excluded this section
    return true;
}
use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
the class StripeReader method selectRowGroups.
/**
 * Returns the indexes of the row groups in a stripe whose statistics match
 * the predicate.
 *
 * @param stripe stripe whose row count determines how many row groups to examine
 * @param columnIndexes row group indexes per stream, used to build each row group's statistics
 * @return the set of matching row group indexes
 */
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes) {
    long rowsInStripe = stripe.getNumberOfRows();
    int rowGroupCount = ceil(rowsInStripe, rowsInRowGroup);

    ImmutableSet.Builder<Integer> matching = ImmutableSet.builder();
    long rowsLeft = rowsInStripe;
    for (int index = 0; index < rowGroupCount; index++) {
        // every row group is full except possibly the last one
        int rowsInThisGroup = toIntExact(Math.min(rowsLeft, rowsInRowGroup));
        rowsLeft -= rowsInThisGroup;

        Map<Integer, ColumnStatistics> groupStatistics = getRowGroupStatistics(types.get(0), columnIndexes, index);
        if (predicate.matches(rowsInThisGroup, groupStatistics)) {
            matching.add(index);
        }
    }
    return matching.build();
}
use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project presto by prestodb.
the class FloatColumnWriter method getIndexStreams.
/**
 * Builds the row index stream for this column: one {@link RowGroupIndex} per
 * entry in {@code rowGroupColumnStatistics}, each pairing the checkpoint
 * positions of that row group with its column statistics.
 *
 * @return a single-element list holding the serialized row index and its stream descriptor
 * @throws IllegalStateException presumably, via {@code checkState}, when the writer is not closed
 */
@Override
public List<StreamDataOutput> getIndexStreams() throws IOException {
    checkState(closed);

    List<FloatStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();

    ImmutableList.Builder<RowGroupIndex> indexes = ImmutableList.builder();
    int rowGroupCount = rowGroupColumnStatistics.size();
    for (int rowGroup = 0; rowGroup < rowGroupCount; rowGroup++) {
        // the lambda below needs an effectively final copy of the loop index
        final int current = rowGroup;
        FloatStreamCheckpoint dataCheckpoint = dataCheckpoints.get(current);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(list -> list.get(current));
        List<Integer> positions = createFloatColumnPositionList(compressed, dataCheckpoint, presentCheckpoint);
        indexes.add(new RowGroupIndex(positions, rowGroupColumnStatistics.get(current)));
    }

    Slice indexSlice = metadataWriter.writeRowIndexes(indexes.build());
    return ImmutableList.of(
            new StreamDataOutput(indexSlice, new Stream(column, StreamKind.ROW_INDEX, indexSlice.length(), false)));
}
Aggregations