Use of io.trino.orc.stream.InputStreamSources in project trino by trinodb.
The class StripeReader, method readStripe.
public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext memoryUsage) throws IOException {
    // read the stripe footer
    StripeFooter stripeFooter = readStripeFooter(stripe, memoryUsage);
    ColumnMetadata<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
    if (writeValidation.isPresent()) {
        writeValidation.get().validateTimeZone(orcDataSource.getId(), stripeFooter.getTimeZone());
    }
    ZoneId fileTimeZone = stripeFooter.getTimeZone();

    // get streams for selected columns
    Map<StreamId, Stream> streams = new HashMap<>();
    for (Stream stream : stripeFooter.getStreams()) {
        if (includedOrcColumnIds.contains(stream.getColumnId()) && isSupportedStreamType(stream, types.get(stream.getColumnId()).getOrcTypeKind())) {
            streams.put(new StreamId(stream), stream);
        }
    }

    // handle stripes with more than one row group
    boolean invalidCheckPoint = false;
    if (rowsInRowGroup.isPresent() && stripe.getNumberOfRows() > rowsInRowGroup.getAsInt()) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));

        // read the file regions
        Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, memoryUsage);

        // read the bloom filter for each column
        Map<OrcColumnId, List<BloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);

        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }

        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);

        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            memoryUsage.close();
            return null;
        }

        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, rowGroups, dictionaryStreamSources);
        }
        catch (InvalidCheckpointException e) {
            // The ORC file contains a corrupt checkpoint stream; treat the stripe as a single row group.
            invalidCheckPoint = true;
        }
    }

    // stripe only has one row group
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
        StreamId streamId = entry.getKey();
        if (streams.containsKey(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.buildOrThrow();

    // read the file regions
    Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, memoryUsage);

    long minAverageRowBytes = 0;
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey())));
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
            long totalBytes = 0;
            long totalRows = 0;
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getMinAverageValueSizeInBytes() * columnStatistics.getNumberOfValues();
                    totalRows += columnStatistics.getNumberOfValues();
                }
            }
            if (totalRows > 0) {
                minAverageRowBytes += totalBytes / totalRows;
            }
        }
    }

    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

    // build the row group
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), minAverageRowBytes, new InputStreamSources(builder.buildOrThrow()));

    return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
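The null return is part of the method's contract: it signals that every row group in the stripe was pruned by the tuple domain. A minimal caller sketch (hypothetical, not the actual Trino call site; stripeInfo and memoryContext are assumed parameters), using only the Stripe and RowGroup accessors that appear in the snippets on this page:

private void processStripe(StripeInformation stripeInfo, AggregatedMemoryContext memoryContext) throws IOException {
    Stripe stripe = readStripe(stripeInfo, memoryContext);
    if (stripe == null) {
        return; // every row group was skipped by predicate pushdown
    }
    for (RowGroup rowGroup : stripe.getRowGroups()) {
        // each selected row group carries its own InputStreamSources
        InputStreamSources sources = rowGroup.getStreamSources();
        // hand sources to the column readers (see advanceToNextRowGroup below)
    }
}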
Use of io.trino.orc.stream.InputStreamSources in project trino by trinodb.
The class OrcRecordReader, method advanceToNextStripe.
private void advanceToNextStripe() throws IOException {
    currentStripeMemoryContext.close();
    currentStripeMemoryContext = memoryUsage.newAggregatedMemoryContext();
    rowGroups = ImmutableList.<RowGroup>of().iterator();

    if (currentStripe >= 0) {
        if (stripeStatisticsValidation.isPresent()) {
            StatisticsValidation statisticsValidation = stripeStatisticsValidation.get();
            long offset = stripes.get(currentStripe).getOffset();
            writeValidation.get().validateStripeStatistics(orcDataSource.getId(), offset, statisticsValidation.build().get());
            statisticsValidation.reset();
        }
    }

    currentStripe++;
    if (currentStripe >= stripes.size()) {
        return;
    }

    if (currentStripe > 0) {
        currentStripePosition += stripes.get(currentStripe - 1).getNumberOfRows();
    }

    StripeInformation stripeInformation = stripes.get(currentStripe);
    validateWriteStripe(stripeInformation.getNumberOfRows());

    Stripe stripe = stripeReader.readStripe(stripeInformation, currentStripeMemoryContext);
    if (stripe != null) {
        // give readers access to dictionary streams
        InputStreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources();
        ColumnMetadata<ColumnEncoding> columnEncodings = stripe.getColumnEncodings();
        ZoneId fileTimeZone = stripe.getFileTimeZone();
        for (ColumnReader column : columnReaders) {
            if (column != null) {
                column.startStripe(fileTimeZone, dictionaryStreamSources, columnEncodings);
            }
        }

        rowGroups = stripe.getRowGroups().iterator();
    }

    orcDataSourceMemoryUsage.setBytes(orcDataSource.getRetainedSize());
}
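Note the ordering contract this method establishes: startStripe runs once per stripe with the stripe-wide dictionary sources, and startRowGroup (in advanceToNextRowGroup below) then runs once per selected row group. A condensed sketch of that hand-off for a single reader, using only methods that appear in these snippets (the throws clause on startStripe is an assumption):

private static void startStripeAndGroups(Stripe stripe, ColumnReader column) throws IOException {
    // stripe-level setup: dictionaries and encodings are shared by all row groups
    column.startStripe(stripe.getFileTimeZone(), stripe.getDictionaryStreamSources(), stripe.getColumnEncodings());
    for (RowGroup rowGroup : stripe.getRowGroups()) {
        // row-group-level setup: sources positioned at this group's checkpoints
        column.startRowGroup(rowGroup.getStreamSources());
        // ... read the batches for this row group ...
    }
}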
Use of io.trino.orc.stream.InputStreamSources in project trino by trinodb.
The class OrcRecordReader, method advanceToNextRowGroup.
private boolean advanceToNextRowGroup() throws IOException {
    nextRowInGroup = 0;

    if (currentRowGroup >= 0) {
        if (rowGroupStatisticsValidation.isPresent()) {
            StatisticsValidation statisticsValidation = rowGroupStatisticsValidation.get();
            long offset = stripes.get(currentStripe).getOffset();
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), offset, currentRowGroup, statisticsValidation.build().get());
            statisticsValidation.reset();
        }
    }

    while (!rowGroups.hasNext() && currentStripe < stripes.size()) {
        advanceToNextStripe();
        currentRowGroup = -1;
    }

    if (!rowGroups.hasNext()) {
        currentGroupRowCount = 0;
        return false;
    }

    currentRowGroup++;
    RowGroup currentRowGroup = rowGroups.next(); // local RowGroup shadows the int field of the same name
    currentGroupRowCount = currentRowGroup.getRowCount();
    if (currentRowGroup.getMinAverageRowBytes() > 0) {
        maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxBlockBytes / currentRowGroup.getMinAverageRowBytes())));
    }
    currentPosition = currentStripePosition + currentRowGroup.getRowOffset();
    filePosition = stripeFilePositions.get(currentStripe) + currentRowGroup.getRowOffset();

    // give reader data streams from row group
    InputStreamSources rowGroupStreamSources = currentRowGroup.getStreamSources();
    for (ColumnReader column : columnReaders) {
        if (column != null) {
            column.startRowGroup(rowGroupStreamSources);
        }
    }

    return true;
}
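The maxBatchSize clamp bounds a batch's estimated memory footprint by maxBlockBytes. A worked example of that line with assumed constants (illustrative values, not Trino's defaults):

long maxBlockBytes = 16L << 20;      // assume a 16 MiB per-block budget
long minAverageRowBytes = 4L << 10;  // assume 4 KiB per row from the index statistics
int maxBatchSize = 8192;             // assume the current cap
maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxBlockBytes / minAverageRowBytes)));
// 16 MiB / 4 KiB = 4096, so the cap drops from 8192 to 4096 rows; max(1, ...)
// guarantees forward progress even when a single row exceeds the budget.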
Use of io.trino.orc.stream.InputStreamSources in project trino by trinodb.
The class StripeReader, method createDictionaryStreamSources.
private InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, ColumnMetadata<ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> dictionaryStreamBuilder = ImmutableMap.builder();
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        StreamId streamId = entry.getKey();
        Stream stream = entry.getValue();
        OrcColumnId column = stream.getColumnId();

        // only process dictionary streams
        ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind();
        if (!isDictionary(stream, columnEncoding)) {
            continue;
        }

        // skip streams without data
        ValueInputStream<?> valueStream = valueStreams.get(streamId);
        if (valueStream == null) {
            continue;
        }

        OrcTypeKind columnType = types.get(stream.getColumnId()).getOrcTypeKind();
        StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding);

        InputStreamSource<?> streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint);
        dictionaryStreamBuilder.put(streamId, streamSource);
    }
    return new InputStreamSources(dictionaryStreamBuilder.buildOrThrow());
}
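Because a dictionary is shared by every row group in a stripe, each dictionary stream gets a single checkpoint at the dictionary's start rather than one per row group. Consumption is then a matter of reopening the source; a minimal sketch, assuming only an openStream() accessor on InputStreamSource:

private static ValueInputStream<?> openDictionary(InputStreamSource<?> dictionarySource) throws IOException {
    // openStream() replays the value stream from the dictionary checkpoint
    return dictionarySource.openStream();
}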
Use of io.trino.orc.stream.InputStreamSources in project trino by trinodb.
The class StripeReader, method createRowGroup.
private static RowGroup createRowGroup(int groupId, int rowOffset, int rowCount, long minAverageRowBytes, Map<StreamId, ValueInputStream<?>> valueStreams, Map<StreamId, StreamCheckpoint> checkpoints) {
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, StreamCheckpoint> entry : checkpoints.entrySet()) {
        StreamId streamId = entry.getKey();
        StreamCheckpoint checkpoint = entry.getValue();

        // skip streams without data
        ValueInputStream<?> valueStream = valueStreams.get(streamId);
        if (valueStream == null) {
            continue;
        }

        builder.put(streamId, createCheckpointStreamSource(valueStream, checkpoint));
    }
    InputStreamSources rowGroupStreams = new InputStreamSources(builder.buildOrThrow());
    return new RowGroup(groupId, rowOffset, rowCount, minAverageRowBytes, rowGroupStreams);
}
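Contrast this with the single-row-group path at the end of readStripe above: there the raw value streams are wrapped directly in ValueInputStreamSource, while here each stream is rewound to the checkpoint recorded for the group in the column indexes. A hypothetical wrapper showing how the arguments relate (the fixed-size-group arithmetic is my assumption, not Trino's code):

private static RowGroup rowGroupFor(int groupId, int rowsInGroup, long minAverageRowBytes,
        Map<StreamId, ValueInputStream<?>> valueStreams, Map<StreamId, StreamCheckpoint> checkpointsForGroup) {
    // rowOffset = groupId * rowsInGroup assumes equally sized groups, which holds
    // for every group in a stripe except possibly the last
    return createRowGroup(groupId, groupId * rowsInGroup, rowsInGroup, minAverageRowBytes, valueStreams, checkpointsForGroup);
}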