Use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.
Class OrcRecordReader, method filterRows.
/**
 * Restricts {@code block} to the positions recorded as matching for the current stripe.
 *
 * <p>When {@code matchingRowsInBatchArray} has not been computed yet, the stripe has an entry in
 * {@code stripeMatchingRows}, and the block is non-empty, the stripe's matching-row iterator is
 * drained up to the end of the current batch and the hits are stored (as batch-relative offsets)
 * in {@code matchingRowsInBatchArray}. The iterator is shared state: rows consumed here are not
 * seen again by later batches.
 *
 * @param block the block read for the current batch
 * @return a copy of {@code block} containing only the matching positions, or {@code block}
 *         itself when no matching-row array is available
 */
private Block filterRows(Block block) {
    StripeInformation stripe = stripes.get(currentStripe);

    boolean needsMatchingRows = matchingRowsInBatchArray == null
            && stripeMatchingRows.containsKey(stripe)
            && block.getPositionCount() != 0;

    if (needsMatchingRows) {
        // The batch covers stripe-relative rows [batchStart, batchEnd).
        long batchStart = currentPosition - currentStripePosition;
        long batchEnd = batchStart + currentBatchSize;

        PeekingIterator<Integer> cursor = stripeMatchingRows.get(stripe);
        List<Integer> offsetsInBatch = new ArrayList<>();

        while (cursor.hasNext()) {
            int row = cursor.peek();
            if (row >= batchStart) {
                if (row >= batchEnd) {
                    // The cursor is past this batch; leave the row for the next batch.
                    break;
                }
                // The row falls inside this batch: record it relative to the batch start.
                offsetsInBatch.add(toIntExact(row - batchStart));
            }
            // Reaching here with row < batchStart means the cursor is lagging behind the reader.
            // That happens when the row group holding this matching row was skipped for another
            // reason: e.g. the matching rows were computed for column1, but the row group's
            // min/max statistics ruled it out on a column2 predicate. The cursor may then still
            // point at row 10 of row group 1 (rows 1..10000) while the reader is already at row
            // 10002 in row group 2 (rows 10001..20000). Such stale entries are simply dropped.
            cursor.next();
        }

        matchingRowsInBatchArray = offsetsInBatch.stream().mapToInt(Integer::intValue).toArray();
    }

    if (matchingRowsInBatchArray == null) {
        return block;
    }
    return block.copyPositions(matchingRowsInBatchArray, 0, matchingRowsInBatchArray.length);
}
Use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.
Class OrcSelectiveRecordReader, method initializePositions.
/**
 * Produces the batch-relative positions that should be read for the upcoming batch.
 *
 * <p>When {@code matchingRowsInBatchArray} has not been computed yet and the current stripe has
 * an entry in {@code stripeMatchingRows}, the stripe's matching-row iterator is drained up to the
 * end of the batch and the hits are stored in {@code matchingRowsInBatchArray}. The iterator is
 * shared state: rows consumed here are not seen again by later batches.
 *
 * @param batchSize number of rows in the upcoming batch
 * @return {@code matchingRowsInBatchArray} when it is set; otherwise a fresh array holding
 *         {@code 0 .. batchSize - 1}
 */
private int[] initializePositions(int batchSize) {
    StripeInformation stripe = stripes.get(currentStripe);

    if (matchingRowsInBatchArray == null && stripeMatchingRows.containsKey(stripe)) {
        // The batch covers stripe-relative rows [batchStart, batchEnd).
        long batchStart = currentPosition - currentStripePosition;
        long batchEnd = batchStart + batchSize;

        PeekingIterator<Integer> cursor = stripeMatchingRows.get(stripe);
        List<Integer> offsetsInBatch = new ArrayList<>();

        while (cursor.hasNext()) {
            int row = cursor.peek();
            if (row >= batchStart) {
                if (row >= batchEnd) {
                    // The cursor is past this batch; leave the row for the next batch.
                    break;
                }
                // The row falls inside this batch: record it relative to the batch start.
                offsetsInBatch.add(toIntExact(row - batchStart));
            }
            // Reaching here with row < batchStart means the cursor is lagging behind the reader.
            // That happens when the row group holding this matching row was skipped for another
            // reason: e.g. the matching rows were computed for column1, but the row group's
            // min/max statistics ruled it out on a column2 predicate. The cursor may then still
            // point at row 10 of row group 1 (rows 1..10000) while the reader is already at row
            // 10002 in row group 2 (rows 10001..20000). Such stale entries are simply dropped.
            cursor.next();
        }

        matchingRowsInBatchArray = offsetsInBatch.stream().mapToInt(Integer::intValue).toArray();
    }

    if (matchingRowsInBatchArray != null) {
        return matchingRowsInBatchArray;
    }

    // No matching-row information is available: select every position in the batch.
    int[] allPositions = new int[batchSize];
    int position = 0;
    while (position < batchSize) {
        allPositions[position] = position;
        position++;
    }
    return allPositions;
}
Use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.
Class AbstractOrcRecordReader, method advanceToNextStripe.
/**
 * Moves the reader on to the next stripe.
 *
 * <p>In order: replaces the per-stripe memory context, clears {@code rowGroups}, validates the
 * statistics collected for the stripe being left (when statistics validation is present),
 * increments {@code currentStripe}, accumulates the previous stripe's row count into
 * {@code currentStripePosition}, reads the new stripe, and starts every non-null column reader
 * on it. If there are no more stripes, or the stripe reader returns {@code null},
 * {@code rowGroups} is left as an empty iterator.
 *
 * @throws IOException declared by the signature; presumably raised while reading the stripe or
 *         starting the column readers
 */
private void advanceToNextStripe() throws IOException {
    // Start the new stripe with a fresh memory context and no pending row groups.
    currentStripeSystemMemoryContext.close();
    currentStripeSystemMemoryContext = systemMemoryUsage.newAggregatedMemoryContext();
    rowGroups = ImmutableList.<RowGroup>of().iterator();

    // Validate the statistics gathered for the stripe we are leaving, if there was one.
    if (currentStripe >= 0 && stripeStatisticsValidation.isPresent()) {
        OrcWriteValidation.StatisticsValidation finishedStatistics = stripeStatisticsValidation.get();
        long finishedStripeOffset = stripes.get(currentStripe).getOffset();
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), finishedStripeOffset, finishedStatistics.build().get());
        finishedStatistics.reset();
    }

    currentStripe++;
    if (currentStripe >= stripes.size()) {
        // No stripes left; rowGroups stays empty.
        return;
    }

    if (currentStripe > 0) {
        // Advance the absolute position of the stripe start by the rows of the previous stripe.
        currentStripePosition += stripes.get(currentStripe - 1).getNumberOfRows();
    }

    StripeInformation nextStripeInformation = stripes.get(currentStripe);
    validateWriteStripe(nextStripeInformation.getNumberOfRows());

    Stripe stripe = stripeReader.readStripe(nextStripeInformation, currentStripeSystemMemoryContext);
    if (stripe == null) {
        return;
    }

    // Give readers access to dictionary streams
    InputStreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources();
    ColumnMetadata<ColumnEncoding> columnEncodings = stripe.getColumnEncodings();
    for (AbstractColumnReader columnReader : columnReaders) {
        if (columnReader == null) {
            continue;
        }
        columnReader.startStripe(stripe.getFileTimeZone(), dictionaryStreamSources, columnEncodings);
    }

    rowGroups = stripe.getRowGroups().iterator();
}
Use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.
Class TestCachingOrcDataSource, method doIntegration.
/**
 * Reads the whole ORC file through a record reader and checks the read count after opening,
 * the stripe count, the caching wrapper, and the total number of positions returned.
 *
 * @param orcDataSource data source under test; its read count is asserted after the reader is created
 * @param maxMergeDistance passed to the reader and to {@code wrapWithCacheIfTinyStripes}
 * @param tinyStripeThreshold passed to the reader and to {@code wrapWithCacheIfTinyStripes}
 * @throws IOException declared by the signature; presumably raised by reader creation or page reads
 */
private void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize tinyStripeThreshold) throws IOException {
    OrcReader reader = new OrcReader(orcDataSource, maxMergeDistance, tinyStripeThreshold, new DataSize(1, Unit.MEGABYTE));

    // Opening the reader costs exactly one read: the file footer.
    assertEquals(orcDataSource.getReadCount(), 1);

    // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode.
    List<StripeInformation> stripes = reader.getFooter().getStripes();
    assertGreaterThanOrEqual(stripes.size(), 3);

    // The data source must end up wrapped by the caching implementation.
    assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold), CachingOrcDataSource.class);

    OrcRecordReader recordReader = reader.createRecordReader(
            reader.getRootColumn().getNestedColumns(),
            ImmutableList.of(VARCHAR),
            (numberOfRows, statisticsByColumnIndex) -> true,
            HIVE_STORAGE_TIME_ZONE,
            newSimpleAggregatedMemoryContext(),
            INITIAL_BATCH_SIZE,
            RuntimeException::new);

    // Drain every page and total the positions of the first (only requested) column.
    int totalPositions = 0;
    for (Page page = recordReader.nextPage(); page != null; page = recordReader.nextPage()) {
        Block firstColumn = page.getLoadedPage().getBlock(0);
        totalPositions += firstColumn.getPositionCount();
    }
    assertEquals(totalPositions, POSITION_COUNT);
}
Use of io.prestosql.orc.metadata.StripeInformation in project hetu-core by openlookeng.
Class OrcWriter, method bufferStripeData.
/**
 * Collects the data for the stripe. This is not the actual data, but
 * instead functions that know how to write the data.
 *
 * <p>The returned list holds the index streams (in column-writer order), then the data streams
 * (sorted by their natural ordering), then the stripe footer. As a side effect the column
 * writers are closed, the finished stripe is appended to {@code closedStripes}, and the write
 * validation and writer statistics are updated.
 *
 * @param stripeStartOffset file offset at which this stripe starts
 * @param flushReason why the stripe is being flushed; must be {@code CLOSED} if the stripe is empty
 * @return the outputs to write for this stripe, in write order; empty when the stripe has no rows
 * @throws IOException declared by the signature; presumably raised while finishing the row group
 *         or producing the streams
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // column writers must be closed or the reset call will fail
        for (ColumnWriter writer : columnWriters) {
            writer.close();
        }
        return ImmutableList.of();
    }

    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }

    // convert any dictionary encoded column with a low compression ratio to direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);

    for (ColumnWriter writer : columnWriters) {
        writer.close();
    }

    List<OrcDataOutput> stripeOutputs = new ArrayList<>();
    List<Stream> streamDescriptors = new ArrayList<>(columnWriters.size() * 3);

    // Index streams come first.
    // The ordering is critical because the stream only contain a length with no offset.
    long indexLength = 0;
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput indexStream : writer.getIndexStreams(metadataWriter)) {
            stripeOutputs.add(indexStream);
            streamDescriptors.add(indexStream.getStream());
            indexLength += indexStream.size();
        }
    }

    // Gather the data streams of every column and total their size.
    long dataLength = 0;
    List<StreamDataOutput> sortedDataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter writer : columnWriters) {
        List<StreamDataOutput> writerStreams = writer.getDataStreams();
        sortedDataStreams.addAll(writerStreams);
        for (StreamDataOutput writerStream : writerStreams) {
            dataLength += writerStream.size();
        }
    }
    // data streams (sorted by size)
    Collections.sort(sortedDataStreams);

    // Data streams follow the index streams; again the order of outputs and descriptors must agree.
    for (StreamDataOutput dataStream : sortedDataStreams) {
        stripeOutputs.add(dataStream);
        streamDescriptors.add(dataStream.getStream());
    }

    Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        columnEncodings.putAll(writer.getColumnEncodings());
    }

    Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        columnStatistics.putAll(writer.getColumnStripeStatistics());
    }

    // the 0th column is a struct column for the whole row
    columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null));

    // The serialized stripe footer is the last output of the stripe.
    StripeFooter stripeFooter = new StripeFooter(streamDescriptors, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
    Slice serializedFooter = metadataWriter.writeStripeFooter(stripeFooter);
    stripeOutputs.add(createDataOutput(serializedFooter));

    // create final stripe statistics
    StripeStatistics statistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));

    StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, serializedFooter.length());
    ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
    closedStripes.add(closedStripe);
    closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();

    recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
    stats.recordStripeWritten(
            flushReason,
            stripeInformation.getTotalLength(),
            stripeInformation.getNumberOfRows(),
            dictionaryCompressionOptimizer.getDictionaryMemoryBytes());

    return stripeOutputs;
}
Aggregations