Use of io.trino.orc.writer.ColumnWriter in project trino by trinodb.
The class OrcWriter, method writeChunk.
/**
 * Writes one chunk of a page through the column writers and then updates the
 * row-group and stripe bookkeeping.
 *
 * <p>After the blocks are written the method, in order: closes the row group
 * when it has reached {@code rowGroupMaxRowCount}, lets the dictionary
 * compression optimizer run, recomputes the buffered byte count, and flushes
 * the stripe when the row limit ({@code MAX_ROWS}), byte limit
 * ({@code MAX_BYTES}) or dictionary capacity ({@code DICTIONARY_FULL}) is hit.
 *
 * @param chunk the page slice to write; block {@code i} goes to column writer {@code i}
 * @throws IOException propagated from the operations invoked here
 * @throws ArithmeticException if the total buffered byte count does not fit in an {@code int}
 * @throws IllegalStateException if the chunk pushes the row group past {@code rowGroupMaxRowCount}
 */
private void writeChunk(Page chunk) throws IOException {
    // no rows pending in the current row group means a new one has to be started
    if (rowGroupRowCount == 0) {
        columnWriters.forEach(ColumnWriter::beginRowGroup);
    }

    // write chunks
    // Sum in a long and narrow once with toIntExact, exactly as the recomputation
    // further down does. A compound "+=" directly onto the field carries an implicit
    // narrowing cast and would silently truncate an oversized total.
    long writtenBufferedBytes = 0;
    for (int channel = 0; channel < chunk.getChannelCount(); channel++) {
        ColumnWriter writer = columnWriters.get(channel);
        writer.writeBlock(chunk.getBlock(channel));
        writtenBufferedBytes += writer.getBufferedBytes();
    }
    bufferedBytes = toIntExact(writtenBufferedBytes);

    // update stats
    rowGroupRowCount += chunk.getPositionCount();
    checkState(
            rowGroupRowCount <= rowGroupMaxRowCount,
            "Row group row count %s exceeds the maximum of %s",
            rowGroupRowCount,
            rowGroupMaxRowCount);
    stripeRowCount += chunk.getPositionCount();

    // record checkpoint if necessary
    if (rowGroupRowCount == rowGroupMaxRowCount) {
        finishRowGroup();
    }

    // convert dictionary encoded columns to direct if dictionary memory usage exceeded
    dictionaryCompressionOptimizer.optimize(bufferedBytes, stripeRowCount);

    // flush stripe if necessary; the total is re-read from the writers after the optimizer has run
    bufferedBytes = toIntExact(columnWriters.stream().mapToLong(ColumnWriter::getBufferedBytes).sum());
    if (stripeRowCount == stripeMaxRowCount) {
        flushStripe(MAX_ROWS);
    } else if (bufferedBytes > stripeMaxBytes) {
        flushStripe(MAX_BYTES);
    } else if (dictionaryCompressionOptimizer.isFull(bufferedBytes)) {
        flushStripe(DICTIONARY_FULL);
    }

    columnWritersRetainedBytes = columnWriters.stream().mapToLong(ColumnWriter::getRetainedBytes).sum();
}
Use of io.trino.orc.writer.ColumnWriter in project trino by trinodb.
The class OrcWriter, method bufferStripeData.
/**
 * Gathers everything that makes up the current stripe, in write order, and
 * registers the stripe as closed. The returned entries are not the stripe
 * bytes themselves but objects that know how to write them.
 *
 * @param stripeStartOffset offset recorded for this stripe in its stripe information and validation data
 * @param flushReason why the stripe is being flushed; an empty stripe is only accepted for {@code CLOSED}
 * @return the ordered outputs: index streams and bloom filters, then data streams, then the stripe footer;
 *         empty when the stripe holds no rows
 * @throws IOException propagated from the operations invoked here
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    boolean stripeIsEmpty = stripeRowCount == 0;
    if (stripeIsEmpty) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // the writers still have to be closed, otherwise the later reset call will fail
        for (ColumnWriter writer : columnWriters) {
            writer.close();
        }
        return ImmutableList.of();
    }

    // a partially filled row group must be finished before the stripe is assembled
    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }

    // give dictionary columns with a poor compression ratio a last chance to go direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);

    for (ColumnWriter writer : columnWriters) {
        writer.close();
    }

    List<OrcDataOutput> stripeOutputs = new ArrayList<>();
    List<Stream> streamDescriptors = new ArrayList<>(columnWriters.size() * 3);

    // Index section. Streams carry only a length and no offset, so the order in
    // which they are appended here is the order they must be written in.
    long indexBytes = 0;
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput rowIndex : writer.getIndexStreams(metadataWriter)) {
            stripeOutputs.add(rowIndex);
            streamDescriptors.add(rowIndex.getStream());
            indexBytes += rowIndex.size();
        }
        for (StreamDataOutput bloom : writer.getBloomFilters(metadataWriter)) {
            stripeOutputs.add(bloom);
            streamDescriptors.add(bloom.getStream());
            indexBytes += bloom.size();
        }
    }

    // Data section: collect every writer's data streams, then order them by
    // their natural ordering (described as "by size" in the original notes).
    long dataBytes = 0;
    List<StreamDataOutput> orderedDataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput columnStream : writer.getDataStreams()) {
            orderedDataStreams.add(columnStream);
            dataBytes += columnStream.size();
        }
    }
    Collections.sort(orderedDataStreams);

    // same length-without-offset constraint as for the index section
    for (StreamDataOutput columnStream : orderedDataStreams) {
        stripeOutputs.add(columnStream);
        streamDescriptors.add(columnStream.getStream());
    }

    Map<OrcColumnId, ColumnEncoding> encodingsByColumn = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        encodingsByColumn.putAll(writer.getColumnEncodings());
    }

    Map<OrcColumnId, ColumnStatistics> statisticsByColumn = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        statisticsByColumn.putAll(writer.getColumnStripeStatistics());
    }

    // column 0 is the struct column that represents the whole row
    encodingsByColumn.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    statisticsByColumn.put(
            ROOT_COLUMN,
            new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null, null));

    // stripe footer goes last in the output
    StripeFooter stripeFooter = new StripeFooter(
            streamDescriptors,
            toColumnMetadata(encodingsByColumn, orcTypes.size()),
            ZoneId.of("UTC"));
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    stripeOutputs.add(createDataOutput(footer));

    // final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toColumnMetadata(statisticsByColumn, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));

    StripeInformation stripeInformation =
            new StripeInformation(stripeRowCount, stripeStartOffset, indexBytes, dataBytes, footer.length());
    ClosedStripe finishedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(finishedStripe);
    closedStripesRetainedBytes += finishedStripe.getRetainedSizeInBytes();

    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(
            flushReason,
            stripeInformation.getTotalLength(),
            stripeInformation.getNumberOfRows(),
            dictionaryCompressionOptimizer.getDictionaryMemoryBytes());

    return stripeOutputs;
}
Aggregations