use of com.facebook.presto.orc.stream.InputStreamSources in project presto by prestodb.
the class TestLongDictionaryProvider method createLongDictionaryStreamSources.
/**
 * Builds an {@link InputStreamSources} holding one dictionary data stream per entry of {@code streams}.
 *
 * <p>For every node the long values are written to a slice via {@code createSliceOutput}, wrapped in a
 * value stream, and registered under the node's dictionary data stream id together with the
 * checkpoint returned by {@code getDictionaryStreamCheckpoint}.
 *
 * @param streams dictionary values keyed by the node they belong to
 * @param aggregatedMemoryContext memory context handed to the value streams and used to create the
 *         local context backing the shared decompression buffer
 * @return stream sources keyed by dictionary data stream id
 */
private InputStreamSources createLongDictionaryStreamSources(Map<NodeId, long[]> streams, OrcAggregatedMemoryContext aggregatedMemoryContext) {
    // a single buffer is shared by every value stream created below
    SharedBuffer sharedBuffer = new SharedBuffer(aggregatedMemoryContext.newOrcLocalMemoryContext("sharedDecompressionBuffer"));
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> sourcesByStream = ImmutableMap.builder();

    streams.forEach((nodeId, dictionaryValues) -> {
        StreamId dictionaryStreamId = nodeId.toDictionaryDataStreamId();
        ValueInputStream<?> dictionaryData = createValueStream(
                createSliceOutput(dictionaryStreamId, dictionaryValues).slice(),
                aggregatedMemoryContext,
                sharedBuffer);
        StreamCheckpoint dictionaryCheckpoint = getDictionaryStreamCheckpoint(dictionaryStreamId, LONG, ColumnEncoding.ColumnEncodingKind.DICTIONARY);
        sourcesByStream.put(dictionaryStreamId, createCheckpointStreamSource(dictionaryData, dictionaryCheckpoint));
    });

    return new InputStreamSources(sourcesByStream.build());
}
use of com.facebook.presto.orc.stream.InputStreamSources in project presto by prestodb.
the class StripeReader method readStripe.
/**
 * Reads one stripe and returns its row groups and dictionary stream sources.
 *
 * <p>The stripe footer is read first and the streams of the included columns are collected. When
 * {@code decryptors} is present, the streams and column encodings of every encryption group it
 * lists are merged in as well.
 *
 * <p>Two read paths follow:
 * <ul>
 * <li>If the stripe has more rows than {@code rowsInRowGroup}, or any included column uses a row
 * group dictionary, the row indexes are read, row groups are selected, and one {@code RowGroup}
 * is created per selected group.</li>
 * <li>Otherwise, or when the first path hit an {@code InvalidCheckpointException} on a stripe
 * without a row group dictionary, the whole stripe is exposed as a single row group.</li>
 * </ul>
 *
 * @param stripe location and row count of the stripe to read
 * @param systemMemoryUsage memory context passed to the footer, encryption-group and disk-range
 *         reads; it is closed when no row group is selected
 * @param decryptors encryption info for the stripe, if any column may be encrypted
 * @param sharedDecompressionBuffer buffer forwarded to {@code readDiskRanges}
 * @return the stripe, or {@code null} when no row group is selected
 * @throws OrcCorruptionException if checkpoints are invalid and the stripe has a row group dictionary
 * @throws IOException declared by this method; SOURCE does not show which callee raises it
 */
public Stripe readStripe(StripeInformation stripe, OrcAggregatedMemoryContext systemMemoryUsage, Optional<DwrfEncryptionInfo> decryptors, SharedBuffer sharedDecompressionBuffer) throws IOException {
StripeId stripeId = new StripeId(orcDataSource.getId(), stripe.getOffset());
// read the stripe footer
StripeFooter stripeFooter = readStripeFooter(stripeId, stripe, systemMemoryUsage);
// get streams for selected columns
// allStreams keeps one list per source (footer first, then each encryption group); it is only used to compute disk ranges
List<List<Stream>> allStreams = new ArrayList<>();
allStreams.add(stripeFooter.getStreams());
Map<StreamId, Stream> includedStreams = new HashMap<>();
boolean hasRowGroupDictionary = addIncludedStreams(stripeFooter.getColumnEncodings(), stripeFooter.getStreams(), includedStreams);
// start from the footer encodings; encryption groups processed below may add to or overwrite them
Map<Integer, ColumnEncoding> columnEncodings = new HashMap<>();
Map<Integer, ColumnEncoding> stripeFooterEncodings = stripeFooter.getColumnEncodings();
columnEncodings.putAll(stripeFooterEncodings);
// included columns may be encrypted
if (decryptors.isPresent()) {
List<Slice> encryptedEncryptionGroups = stripeFooter.getStripeEncryptionGroups();
for (Integer groupId : decryptors.get().getEncryptorGroupIds()) {
StripeEncryptionGroup stripeEncryptionGroup = getStripeEncryptionGroup(decryptors.get().getEncryptorByGroupId(groupId), encryptedEncryptionGroups.get(groupId), dwrfEncryptionGroupColumns.get(groupId), systemMemoryUsage);
allStreams.add(stripeEncryptionGroup.getStreams());
columnEncodings.putAll(stripeEncryptionGroup.getColumnEncodings());
boolean encryptedHasRowGroupDictionary = addIncludedStreams(stripeEncryptionGroup.getColumnEncodings(), stripeEncryptionGroup.getStreams(), includedStreams);
hasRowGroupDictionary = encryptedHasRowGroupDictionary || hasRowGroupDictionary;
}
}
// handle stripes with more than one row group or a dictionary
// set when row group creation fails on a bad checkpoint and the single-row-group path below is used instead
boolean invalidCheckPoint = false;
if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
// determine ranges of the stripe to read
Map<StreamId, DiskRange> diskRanges = getDiskRanges(allStreams);
diskRanges = Maps.filterKeys(diskRanges, Predicates.in(includedStreams.keySet()));
// read the file regions
Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);
// read the row index for each column
Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(includedStreams, streamsData, stripeId);
if (writeValidation.isPresent()) {
writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
}
// select the row groups matching the tuple domain
Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
// if all row groups are skipped, return null
if (selectedRowGroups.isEmpty()) {
// set accounted memory usage to zero
systemMemoryUsage.close();
return null;
}
// value streams
Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);
// build the dictionary streams
InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);
// build the row groups
try {
List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), includedStreams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
} catch (InvalidCheckpointException e) {
// we must fail because the length of the row group dictionary is contained in the checkpoint stream.
if (hasRowGroupDictionary) {
throw new OrcCorruptionException(e, orcDataSource.getId(), "Checkpoints are corrupt");
}
// no row group dictionary: fall through and expose the whole stripe as one row group
invalidCheckPoint = true;
}
}
// stripe only has one row group and no dictionary
// (also reached after an invalid checkpoint; in that case the disk ranges are read a second time here)
ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
for (Entry<StreamId, DiskRange> entry : getDiskRanges(allStreams).entrySet()) {
StreamId streamId = entry.getKey();
if (includedStreams.keySet().contains(streamId)) {
diskRangesBuilder.put(entry);
}
}
ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();
// read the file regions
Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripeId, diskRanges, systemMemoryUsage, decryptors, sharedDecompressionBuffer);
// sum the value sizes reported by the row index streams of the included columns
long totalBytes = 0;
for (Entry<StreamId, Stream> entry : includedStreams.entrySet()) {
if (entry.getKey().getStreamKind() == ROW_INDEX) {
List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, streamsData.get(entry.getKey()), null);
checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");
for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
// NOTE(review): the guard checks the min-average size but the value added is the total size — confirm this pairing is intended
if (columnStatistics.hasMinAverageValueSizeInBytes()) {
totalBytes += columnStatistics.getTotalValueSizeInBytes();
}
}
}
}
// value streams
Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(includedStreams, streamsData, columnEncodings);
// build the dictionary streams
InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(includedStreams, valueStreams, columnEncodings);
// build the row group
// no checkpoints are used on this path: each value stream is wrapped directly in a ValueInputStreamSource
ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
}
// single row group with id 0 and row offset 0, spanning every row of the stripe
RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), totalBytes, new InputStreamSources(builder.build()));
return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
use of com.facebook.presto.orc.stream.InputStreamSources in project presto by prestodb.
the class AbstractOrcRecordReader method advanceToNextRowGroup.
/**
 * Positions the reader on the next row group, advancing through stripes while the current
 * row group iterator is exhausted.
 *
 * <p>If a row group was already active and row group statistics validation is enabled, the
 * collected statistics are validated against the current stripe's offset and then reset
 * before moving on.
 *
 * @return {@code true} if a row group was started and handed to the stream readers,
 *         {@code false} if no row group remains
 * @throws IOException declared by this method; SOURCE does not show which callee raises it
 */
private boolean advanceToNextRowGroup() throws IOException {
    nextRowInGroup = 0;

    // validate and clear the statistics gathered for the row group that just finished
    if (currentRowGroup >= 0 && rowGroupStatisticsValidation.isPresent()) {
        OrcWriteValidation.StatisticsValidation finishedGroupStatistics = rowGroupStatisticsValidation.get();
        long stripeOffset = stripes.get(currentStripe).getOffset();
        writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripeOffset, currentRowGroup, finishedGroupStatistics.build());
        finishedGroupStatistics.reset();
    }

    // keep loading stripes until one yields a row group or the stripes run out
    while (!rowGroups.hasNext() && currentStripe < stripes.size()) {
        advanceToNextStripe();
        currentRowGroup = -1;
    }

    if (!rowGroups.hasNext()) {
        currentGroupRowCount = 0;
        return false;
    }

    // the field tracks the index of the active row group; the local below holds the group itself
    currentRowGroup++;
    RowGroup rowGroup = rowGroups.next();
    currentGroupRowCount = toIntExact(rowGroup.getRowCount());

    if (rowGroup.getMinAverageRowBytes() > 0) {
        maxBatchSize = adjustMaxBatchSize(maxBatchSize, maxBlockBytes, rowGroup.getMinAverageRowBytes());
    }

    currentPosition = currentStripePosition + rowGroup.getRowOffset();
    filePosition = stripeFilePositions.get(currentStripe) + rowGroup.getRowOffset();

    // give reader data streams from row group
    InputStreamSources streamSources = rowGroup.getStreamSources();
    for (StreamReader reader : streamReaders) {
        if (reader == null) {
            continue;
        }
        reader.startRowGroup(streamSources);
    }
    return true;
}
use of com.facebook.presto.orc.stream.InputStreamSources in project presto by prestodb.
the class StripeReader method createDictionaryStreamSources.
/**
 * Collects the dictionary streams among {@code streams} into an {@link InputStreamSources}.
 *
 * <p>A stream is included only when {@code isDictionary} accepts it for its column encoding kind
 * and a value stream exists for its id. Each included stream is paired with the checkpoint
 * returned by {@code getDictionaryStreamCheckpoint}.
 *
 * @param streams candidate streams keyed by stream id
 * @param valueStreams value streams keyed by stream id; ids without an entry are skipped
 * @param columnEncodings column encodings keyed by column; must contain every column in {@code streams}
 * @return stream sources for the dictionary streams, keyed by stream id
 */
public InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, Map<Integer, ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> dictionarySources = ImmutableMap.builder();

    for (Entry<StreamId, Stream> candidate : streams.entrySet()) {
        Stream stream = candidate.getValue();
        int columnId = stream.getColumn();

        ColumnEncodingKind encodingKind = columnEncodings.get(columnId)
                .getColumnEncoding(stream.getSequence())
                .getColumnEncodingKind();

        // only dictionary streams are of interest here
        if (!isDictionary(stream, encodingKind)) {
            continue;
        }

        StreamId streamId = candidate.getKey();
        ValueInputStream<?> dictionaryData = valueStreams.get(streamId);

        // streams without data contribute nothing
        if (dictionaryData != null) {
            OrcTypeKind typeKind = types.get(columnId).getOrcTypeKind();
            StreamCheckpoint dictionaryCheckpoint = getDictionaryStreamCheckpoint(streamId, typeKind, encodingKind);
            dictionarySources.put(streamId, createCheckpointStreamSource(dictionaryData, dictionaryCheckpoint));
        }
    }

    return new InputStreamSources(dictionarySources.build());
}
use of com.facebook.presto.orc.stream.InputStreamSources in project presto by prestodb.
the class StripeReader method createRowGroup.
/**
 * Creates the {@link RowGroup} with index {@code groupId} within a stripe.
 *
 * <p>The row offset is {@code groupId * rowsInRowGroup}. The row count is the smaller of
 * {@code rowsInRowGroup} and the rows remaining in the stripe after that offset. The byte size
 * is the sum, over all entries of {@code columnIndexes}, of the total value size reported by
 * that entry's statistics for this group.
 *
 * @param groupId index of the row group within the stripe
 * @param rowsInStripe number of rows in the stripe
 * @param rowsInRowGroup nominal number of rows per row group
 * @param columnIndexes row group indexes per stream; each list must have an entry at {@code groupId}
 * @param valueStreams value streams keyed by stream id; checkpoints without a value stream are skipped
 * @param checkpoints checkpoint of each stream for this row group
 * @return the row group with its checkpointed stream sources
 */
@VisibleForTesting
static RowGroup createRowGroup(int groupId, long rowsInStripe, long rowsInRowGroup, Map<StreamId, List<RowGroupIndex>> columnIndexes, Map<StreamId, ValueInputStream<?>> valueStreams, Map<StreamId, StreamCheckpoint> checkpoints) {
    // add up the value sizes recorded for this group across all indexed streams
    long totalRowGroupBytes = 0;
    for (List<RowGroupIndex> indexesOfStream : columnIndexes.values()) {
        totalRowGroupBytes += indexesOfStream.get(groupId).getColumnStatistics().getTotalValueSizeInBytes();
    }

    long rowOffset = multiplyExact(groupId, rowsInRowGroup);
    // the last group of a stripe may hold fewer rows than the nominal group size
    long rowsAfterOffset = rowsInStripe - rowOffset;
    int rowCount = toIntExact(Math.min(rowsAfterOffset, rowsInRowGroup));

    ImmutableMap.Builder<StreamId, InputStreamSource<?>> sourcesByStream = ImmutableMap.builder();
    for (Entry<StreamId, StreamCheckpoint> checkpointEntry : checkpoints.entrySet()) {
        StreamId streamId = checkpointEntry.getKey();
        ValueInputStream<?> data = valueStreams.get(streamId);
        // streams without data are left out of the row group
        if (data != null) {
            sourcesByStream.put(streamId, createCheckpointStreamSource(data, checkpointEntry.getValue()));
        }
    }

    return new RowGroup(groupId, rowOffset, rowCount, totalRowGroupBytes, new InputStreamSources(sourcesByStream.build()));
}
Aggregations