
Example 1 with ColumnWriter

Use of io.trino.orc.writer.ColumnWriter in project trino by trinodb.

From the class OrcWriter, method writeChunk:

private void writeChunk(Page chunk) throws IOException {
    if (rowGroupRowCount == 0) {
        columnWriters.forEach(ColumnWriter::beginRowGroup);
    }
    // write chunks
    bufferedBytes = 0;
    for (int channel = 0; channel < chunk.getChannelCount(); channel++) {
        ColumnWriter writer = columnWriters.get(channel);
        writer.writeBlock(chunk.getBlock(channel));
        bufferedBytes += writer.getBufferedBytes();
    }
    // update stats
    rowGroupRowCount += chunk.getPositionCount();
    checkState(rowGroupRowCount <= rowGroupMaxRowCount);
    stripeRowCount += chunk.getPositionCount();
    // record checkpoint if necessary
    if (rowGroupRowCount == rowGroupMaxRowCount) {
        finishRowGroup();
    }
    // convert dictionary encoded columns to direct if dictionary memory usage exceeded
    dictionaryCompressionOptimizer.optimize(bufferedBytes, stripeRowCount);
    // flush stripe if necessary
    bufferedBytes = toIntExact(columnWriters.stream().mapToLong(ColumnWriter::getBufferedBytes).sum());
    if (stripeRowCount == stripeMaxRowCount) {
        flushStripe(MAX_ROWS);
    } else if (bufferedBytes > stripeMaxBytes) {
        flushStripe(MAX_BYTES);
    } else if (dictionaryCompressionOptimizer.isFull(bufferedBytes)) {
        flushStripe(DICTIONARY_FULL);
    }
    columnWritersRetainedBytes = columnWriters.stream().mapToLong(ColumnWriter::getRetainedBytes).sum();
}
Also used: ColumnWriters.createColumnWriter (io.trino.orc.writer.ColumnWriters.createColumnWriter), ColumnWriter (io.trino.orc.writer.ColumnWriter), SliceDictionaryColumnWriter (io.trino.orc.writer.SliceDictionaryColumnWriter)
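As a rough mental model of the accounting in writeChunk, the sketch below tracks rows and buffered bytes the same way: a row group is finished at a fixed row count, and a stripe flush is signaled when either the row limit or the byte limit is reached. This is a minimal, self-contained illustration, not Trino code; the class name, thresholds, and fields are invented for the example (Trino derives the real limits from its writer options).

// Minimal, self-contained sketch of the row-group / stripe bookkeeping shown
// above. Everything here (class name, thresholds, fields) is illustrative and
// is not part of Trino's API.
final class StripeAccountingSketch {
    private static final int ROW_GROUP_MAX_ROWS = 10_000;   // assumed limit
    private static final int STRIPE_MAX_ROWS = 5_000_000;   // assumed limit
    private static final long STRIPE_MAX_BYTES = 64L << 20; // assumed 64 MiB

    private int rowGroupRows;
    private int stripeRows;
    private long bufferedBytes;
    private int rowGroupsFinished;

    /** Accounts for one batch of rows; returns true when a stripe flush is due. */
    boolean addBatch(int rowCount, long batchBytes) {
        // when rowGroupRows is 0, this batch implicitly begins a new row group
        // (writeChunk calls columnWriters.forEach(ColumnWriter::beginRowGroup) here)
        rowGroupRows += rowCount;
        stripeRows += rowCount;
        bufferedBytes += batchBytes;

        if (rowGroupRows >= ROW_GROUP_MAX_ROWS) {
            // finishRowGroup(): per-row-group statistics would be recorded here
            rowGroupsFinished++;
            rowGroupRows = 0;
        }
        // flush the stripe when either the row limit or the byte limit is hit
        return stripeRows >= STRIPE_MAX_ROWS || bufferedBytes > STRIPE_MAX_BYTES;
    }

    void onStripeFlushed() {
        stripeRows = 0;
        bufferedBytes = 0;
    }

    public static void main(String[] args) {
        StripeAccountingSketch sketch = new StripeAccountingSketch();
        for (int batch = 0; batch < 100; batch++) {
            if (sketch.addBatch(8_192, 1_000_000)) {
                System.out.println("flush stripe after batch " + batch);
                sketch.onStripeFlushed();
            }
        }
        System.out.println("row groups finished: " + sketch.rowGroupsFinished);
    }
}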

Example 2 with ColumnWriter

Use of io.trino.orc.writer.ColumnWriter in project trino by trinodb.

From the class OrcWriter, method bufferStripeData:

/**
 * Collect the data for the stripe. This is not the actual data, but rather
 * functions that know how to write the data.
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // column writers must be closed or the reset call will fail
        columnWriters.forEach(ColumnWriter::close);
        return ImmutableList.of();
    }
    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }
    // convert any dictionary encoded column with a low compression ratio to direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);
    columnWriters.forEach(ColumnWriter::close);
    List<OrcDataOutput> outputData = new ArrayList<>();
    List<Stream> allStreams = new ArrayList<>(columnWriters.size() * 3);
    // get index streams
    long indexLength = 0;
    for (ColumnWriter columnWriter : columnWriters) {
        for (StreamDataOutput indexStream : columnWriter.getIndexStreams(metadataWriter)) {
            // The ordering is critical because the streams only contain a length with no offset.
            outputData.add(indexStream);
            allStreams.add(indexStream.getStream());
            indexLength += indexStream.size();
        }
        for (StreamDataOutput bloomFilter : columnWriter.getBloomFilters(metadataWriter)) {
            outputData.add(bloomFilter);
            allStreams.add(bloomFilter.getStream());
            indexLength += bloomFilter.size();
        }
    }
    // data streams (sorted by size)
    long dataLength = 0;
    List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter columnWriter : columnWriters) {
        List<StreamDataOutput> streams = columnWriter.getDataStreams();
        dataStreams.addAll(streams);
        dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
    }
    Collections.sort(dataStreams);
    // add data streams
    for (StreamDataOutput dataStream : dataStreams) {
        // The ordering is critical because the streams only contain a length with no offset.
        outputData.add(dataStream);
        allStreams.add(dataStream.getStream());
    }
    Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
    Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
    columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
    // the 0th column is a struct column for the whole row
    columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null, null));
    // add footer
    StripeFooter stripeFooter = new StripeFooter(allStreams, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
    Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
    outputData.add(createDataOutput(footer));
    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length());
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
    return outputData;
}
Also used: ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics), OrcColumnId (io.trino.orc.metadata.OrcColumnId), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), StripeStatistics (io.trino.orc.metadata.statistics.StripeStatistics), ColumnWriters.createColumnWriter (io.trino.orc.writer.ColumnWriters.createColumnWriter), ColumnWriter (io.trino.orc.writer.ColumnWriter), SliceDictionaryColumnWriter (io.trino.orc.writer.SliceDictionaryColumnWriter), StreamDataOutput (io.trino.orc.stream.StreamDataOutput), OrcDataOutput (io.trino.orc.stream.OrcDataOutput), ColumnEncoding (io.trino.orc.metadata.ColumnEncoding), StripeFooter (io.trino.orc.metadata.StripeFooter), Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice), Slice (io.airlift.slice.Slice), Stream (io.trino.orc.metadata.Stream), StripeInformation (io.trino.orc.metadata.StripeInformation)
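The Javadoc above notes that bufferStripeData collects "functions that know how to write the data" rather than the bytes themselves. The sketch below illustrates that deferred-write pattern in isolation; StripeOutput is a stand-in for io.trino.orc.stream.OrcDataOutput, and the stream names and payloads are made up for the example.

// Illustrative only: each output reports its size up front and writes its
// bytes later, so the stripe layout (index streams, then data streams sorted
// by size, then the footer) can be planned before any byte is produced.
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class DeferredStripeOutputSketch {
    interface StripeOutput {
        long size();                                          // declared length
        void writeData(OutputStream out) throws IOException;  // deferred write
    }

    static StripeOutput utf8(String text) {
        byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
        return new StripeOutput() {
            @Override public long size() { return bytes.length; }
            @Override public void writeData(OutputStream out) throws IOException { out.write(bytes); }
        };
    }

    public static void main(String[] args) throws IOException {
        // order matters: streams carry only lengths, so their offsets are
        // implied by the order in which they are written
        List<StripeOutput> outputData = new ArrayList<>();
        outputData.add(utf8("row-index"));                    // stands in for an index stream

        List<StripeOutput> dataStreams = new ArrayList<>(List.of(utf8("a-longer-data-stream"), utf8("short")));
        dataStreams.sort(Comparator.comparingLong(StripeOutput::size));  // data streams sorted by size
        outputData.addAll(dataStreams);

        outputData.add(utf8("stripe-footer"));                // footer goes last

        long declaredLength = outputData.stream().mapToLong(StripeOutput::size).sum();
        ByteArrayOutputStream target = new ByteArrayOutputStream();
        for (StripeOutput output : outputData) {
            output.writeData(target);                         // bytes are produced only now
        }
        System.out.println("declared " + declaredLength + " bytes, wrote " + target.size() + " bytes");
    }
}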

Aggregations

ColumnWriter (io.trino.orc.writer.ColumnWriter): 2
ColumnWriters.createColumnWriter (io.trino.orc.writer.ColumnWriters.createColumnWriter): 2
SliceDictionaryColumnWriter (io.trino.orc.writer.SliceDictionaryColumnWriter): 2
Slice (io.airlift.slice.Slice): 1
Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice): 1
ColumnEncoding (io.trino.orc.metadata.ColumnEncoding): 1
OrcColumnId (io.trino.orc.metadata.OrcColumnId): 1
Stream (io.trino.orc.metadata.Stream): 1
StripeFooter (io.trino.orc.metadata.StripeFooter): 1
StripeInformation (io.trino.orc.metadata.StripeInformation): 1
ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics): 1
StripeStatistics (io.trino.orc.metadata.statistics.StripeStatistics): 1
OrcDataOutput (io.trino.orc.stream.OrcDataOutput): 1
StreamDataOutput (io.trino.orc.stream.StreamDataOutput): 1
ArrayList (java.util.ArrayList): 1
HashMap (java.util.HashMap): 1