use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
the class OrcWriter method bufferStripeData.
/**
* Collect the data for the stripe. This is not the actual data, but
* instead are functions that know how to write the data.
*/
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
if (stripeRowCount == 0) {
verify(flushReason == CLOSED, "An empty stripe is not allowed");
// column writers must be closed or the reset call will fail
columnWriters.forEach(ColumnWriter::close);
return ImmutableList.of();
}
if (rowGroupRowCount > 0) {
finishRowGroup();
}
// convert any dictionary encoded column with a low compression ratio to direct
dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);
columnWriters.forEach(ColumnWriter::close);
List<OrcDataOutput> outputData = new ArrayList<>();
List<Stream> allStreams = new ArrayList<>(columnWriters.size() * 3);
// get index streams
long indexLength = 0;
for (ColumnWriter columnWriter : columnWriters) {
for (StreamDataOutput indexStream : columnWriter.getIndexStreams(metadataWriter)) {
// The ordering is critical because the stream only contain a length with no offset.
outputData.add(indexStream);
allStreams.add(indexStream.getStream());
indexLength += indexStream.size();
}
for (StreamDataOutput bloomFilter : columnWriter.getBloomFilters(metadataWriter)) {
outputData.add(bloomFilter);
allStreams.add(bloomFilter.getStream());
indexLength += bloomFilter.size();
}
}
// data streams (sorted by size)
long dataLength = 0;
List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
for (ColumnWriter columnWriter : columnWriters) {
List<StreamDataOutput> streams = columnWriter.getDataStreams();
dataStreams.addAll(streams);
dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
}
Collections.sort(dataStreams);
// add data streams
for (StreamDataOutput dataStream : dataStreams) {
// The ordering is critical because the stream only contain a length with no offset.
outputData.add(dataStream);
allStreams.add(dataStream.getStream());
}
Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
// the 0th column is a struct column for the whole row
columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null, null));
// add footer
StripeFooter stripeFooter = new StripeFooter(allStreams, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
outputData.add(createDataOutput(footer));
// create final stripe statistics
StripeStatistics statistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length());
ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
closedStripes.add(closedStripe);
closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
return outputData;
}
use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
the class OrcWriter method toFileStats.
private static Optional<ColumnMetadata<ColumnStatistics>> toFileStats(List<ColumnMetadata<ColumnStatistics>> stripes) {
if (stripes.isEmpty()) {
return Optional.empty();
}
int columnCount = stripes.get(0).size();
checkArgument(stripes.stream().allMatch(stripe -> columnCount == stripe.size()));
ImmutableList.Builder<ColumnStatistics> fileStats = ImmutableList.builder();
for (int i = 0; i < columnCount; i++) {
OrcColumnId columnId = new OrcColumnId(i);
fileStats.add(ColumnStatistics.mergeColumnStatistics(stripes.stream().map(stripe -> stripe.get(columnId)).collect(toList())));
}
return Optional.of(new ColumnMetadata<>(fileStats.build()));
}
use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
the class OrcWriter method bufferFileFooter.
/**
* Collect the data for the file footer. This is not the actual data, but
* instead are functions that know how to write the data.
*/
private List<OrcDataOutput> bufferFileFooter() throws IOException {
List<OrcDataOutput> outputData = new ArrayList<>();
Metadata metadata = new Metadata(closedStripes.stream().map(ClosedStripe::getStatistics).map(Optional::of).collect(toList()));
Slice metadataSlice = metadataWriter.writeMetadata(metadata);
outputData.add(createDataOutput(metadataSlice));
fileStats = toFileStats(closedStripes.stream().map(ClosedStripe::getStatistics).map(StripeStatistics::getColumnStatistics).collect(toList()));
fileStatsRetainedBytes = fileStats.map(stats -> stats.stream().mapToLong(ColumnStatistics::getRetainedSizeInBytes).sum()).orElse(0L);
recordValidation(validation -> validation.setFileStatistics(fileStats));
Map<String, Slice> userMetadata = this.userMetadata.entrySet().stream().collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));
Footer footer = new Footer(fileRowCount, rowGroupMaxRowCount == 0 ? OptionalInt.empty() : OptionalInt.of(rowGroupMaxRowCount), closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toImmutableList()), orcTypes, fileStats, userMetadata, // writer id will be set by MetadataWriter
Optional.empty());
closedStripes.clear();
closedStripesRetainedBytes = 0;
Slice footerSlice = metadataWriter.writeFooter(footer);
outputData.add(createDataOutput(footerSlice));
recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));
Slice postscriptSlice = metadataWriter.writePostscript(footerSlice.length(), metadataSlice.length(), compression, maxCompressionBufferSize);
outputData.add(createDataOutput(postscriptSlice));
outputData.add(createDataOutput(Slices.wrappedBuffer(UnsignedBytes.checkedCast(postscriptSlice.length()))));
return outputData;
}
use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
the class StripeReader method readColumnIndexes.
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, Map<OrcColumnId, List<BloomFilter>> bloomFilterIndexes) throws IOException {
ImmutableMap.Builder<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.builder();
for (Entry<StreamId, Stream> entry : streams.entrySet()) {
Stream stream = entry.getValue();
if (stream.getStreamKind() == ROW_INDEX) {
OrcInputStream inputStream = new OrcInputStream(streamsData.get(entry.getKey()));
List<BloomFilter> bloomFilters = bloomFilterIndexes.get(entry.getKey().getColumnId());
List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, inputStream);
if (bloomFilters != null && !bloomFilters.isEmpty()) {
ImmutableList.Builder<RowGroupIndex> newRowGroupIndexes = ImmutableList.builder();
for (int i = 0; i < rowGroupIndexes.size(); i++) {
RowGroupIndex rowGroupIndex = rowGroupIndexes.get(i);
ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics().withBloomFilter(bloomFilters.get(i));
newRowGroupIndexes.add(new RowGroupIndex(rowGroupIndex.getPositions(), columnStatistics));
}
rowGroupIndexes = newRowGroupIndexes.build();
}
columnIndexes.put(entry.getKey(), rowGroupIndexes);
}
}
return columnIndexes.buildOrThrow();
}
use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.
the class StripeReader method selectRowGroups.
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes) {
int rowsInRowGroup = this.rowsInRowGroup.orElseThrow(() -> new IllegalStateException("Cannot create row groups if row group info is missing"));
int rowsInStripe = stripe.getNumberOfRows();
int groupsInStripe = ceil(rowsInStripe, rowsInRowGroup);
ImmutableSet.Builder<Integer> selectedRowGroups = ImmutableSet.builder();
int remainingRows = rowsInStripe;
for (int rowGroup = 0; rowGroup < groupsInStripe; ++rowGroup) {
int rows = Math.min(remainingRows, rowsInRowGroup);
ColumnMetadata<ColumnStatistics> statistics = getRowGroupStatistics(types, columnIndexes, rowGroup);
if (predicate.matches(rows, statistics)) {
selectedRowGroups.add(rowGroup);
}
remainingRows -= rows;
}
return selectedRowGroups.build();
}
Aggregations