Use of io.prestosql.orc.reader.SelectiveColumnReader in project hetu-core by openlookeng.
The class OrcSelectiveRecordReader, method getNextPage:
/* Perform the ORC read by eliminating non-matching records before forming the data blocks.
 * This is carried out in 3 stages:
 *   I)   Filter using conjuncts (AND'ed operators)
 *   II)  Filter using disjuncts (OR'ed operators)
 *   III) Read fields without filters
 * Finally, compose the block/page with the matching records.
 * (A standalone sketch of this pipeline appears after the method.)
 */
public Page getNextPage() throws IOException {
    int batchSize = prepareNextBatch();
    if (batchSize < 0) {
        return null;
    }
    // constantFilterIsFalse means no record in the current split can qualify the filter;
    // if there are also no OR'ed filters, consume the batch and return an empty page.
    if (constantFilterIsFalse && colReaderWithORFilter.isEmpty()) {
        batchRead(batchSize);
        return new Page(0);
    }
    matchingRowsInBatchArray = null;
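    // initializePositions yields the candidate row positions for this batch;
    // each filter stage below can only narrow this list.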
    int[] positionsToRead = initializePositions(batchSize);
    int positionCount = positionsToRead.length;

    /* Stage I: evaluate columns with AND'ed filter conditions. */
    SelectiveColumnReader[] columnReaders = getColumnReaders();
    if (positionCount != 0) {
        for (Integer columnIdx : colReaderWithFilter) {
            if (columnIdx < 0) {
                if (!matchConstantWithPredicate(includedColumns.get(columnIdx), constantValues.get(columnIdx), filters.get(columnIdx))) {
                    positionCount = 0;
                    break;
                }
            } else if (missingColumns.contains(columnIdx)) {
                if (!filters.get(columnIdx).testNull()) {
                    positionCount = 0;
                    break;
                }
            } else if (columnReaders[columnIdx] != null) {
                positionCount = columnReaders[columnIdx].read(getNextRowInGroup(), positionsToRead, positionCount, filters.get(columnIdx));
                if (positionCount == 0) {
                    break;
                }
                // Get the list of row positions to read for the next column: the positions
                // output by the current column are the input to the next column.
                positionsToRead = columnReaders[columnIdx].getReadPositions();
            }
        }
    }
    /* Stage II: perform OR filtering.
     * OR filtering is applied in two phases: identify and exclude.
     * - Identify: apply each disjunct filter to the row positions surviving Stage I,
     *   and accumulate every matching position in the accumulator.
     * - Exclude: drop all records that did not match any of the applied filters,
     *   i.e. the accumulator set becomes the final matching row positions.
     */
    BitSet accumulator = new BitSet();
    if (colReaderWithORFilter.size() > 0 && positionCount > 0) {
        int localPositionCount = positionCount;
        for (Integer columnIdx : colReaderWithORFilter) {
            if (columnIdx < 0) {
                if (matchConstantWithPredicate(includedColumns.get(columnIdx), constantValues.get(columnIdx), disjuctFilters.get(columnIdx).get(0))) {
                    /* Skip OR filtering: all positions will match. */
                    accumulator.set(positionsToRead[0], positionsToRead[positionCount - 1] + 1);
                }
            } else if (missingColumns.contains(columnIdx)) {
                if (disjuctFilters.get(columnIdx).get(0).testNull()) {
                    /* Skip OR filtering: all positions will match. */
                    accumulator.set(positionsToRead[0], positionsToRead[positionCount - 1] + 1);
                }
            } else if (columnReaders[columnIdx] != null) {
                localPositionCount += columnReaders[columnIdx].readOr(getNextRowInGroup(), positionsToRead, positionCount, disjuctFilters.get(columnIdx), accumulator);
            }
        }
        int[] newPositions = positionsToRead.clone();
        positionCount = updateExcludePositions(positionsToRead, positionCount, accumulator, newPositions);
        positionsToRead = Arrays.copyOf(newPositions, positionCount);
    }
    /* Stage III: read the remaining projected columns, which carry no filters. */
    if (positionCount != 0) {
        for (Integer columnIdx : colReaderWithoutFilter) {
            if (columnReaders[columnIdx] != null) {
                positionCount = columnReaders[columnIdx].read(getNextRowInGroup(), positionsToRead, positionCount, null);
                if (positionCount == 0) {
                    break;
                }
                // Get the list of row positions to read for the next column: the positions
                // output by the current column are the input to the next column.
                positionsToRead = columnReaders[columnIdx].getReadPositions();
            }
        }
    }
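    // Account for the rows consumed in this batch before forming the page.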
    batchRead(batchSize);
    if (positionCount == 0) {
        return new Page(0);
    }
    // Finally, build the blocks of all output columns in the page. The row positions
    // produced by the last column read are the final positions after all filter stages;
    // only these positions are used to fetch the data of every column and form the
    // page blocks.
    Block[] blocks = new Block[outputColumns.size()];
    for (int i = 0; i < outputColumns.size(); i++) {
        int columnIndex = outputColumns.get(i);
        if (columnIndex < 0 || missingColumns.contains(columnIndex)) {
            // Fill constant columns (e.g. partition keys) with a run-length-encoded block.
            blocks[i] = RunLengthEncodedBlock.create(includedColumns.get(columnIndex), constantValues.get(columnIndex) == NULL_MARKER ? null : constantValues.get(columnIndex), positionCount);
        } else {
            Block block = getColumnReaders()[columnIndex].getBlock(positionsToRead, positionCount);
            updateMaxCombinedBytesPerRow(columnIndex, block);
            if (coercers.containsKey(i)) {
                block = coercers.get(i).apply(block);
            }
            blocks[i] = block;
        }
    }
    Page page = new Page(positionCount, blocks);
    validateWritePageChecksum(page);
    return page;
}
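The three stages can be hard to see through the reader plumbing. Below is a minimal, self-contained sketch of the same positional-filtering idea, with plain int arrays and java.util.function.IntPredicate standing in for SelectiveColumnReader and TupleDomainFilter; everything in the sketch (class and method names, the toy columns) is illustrative and not part of the hetu-core API.

import java.util.Arrays;
import java.util.BitSet;
import java.util.function.IntPredicate;

public class PositionalFilterSketch {
    // Stage I: apply one AND'ed (conjunct) filter; the surviving positions feed the next filter.
    static int[] applyConjunct(int[] positions, IntPredicate filter) {
        return Arrays.stream(positions).filter(filter).toArray();
    }

    // Stage II, "identify" phase: mark every position matched by a disjunct in the accumulator.
    static void accumulateDisjunct(int[] positions, IntPredicate filter, BitSet accumulator) {
        for (int position : positions) {
            if (filter.test(position)) {
                accumulator.set(position);
            }
        }
    }

    // Stage II, "exclude" phase: keep only positions marked in the accumulator.
    static int[] excludeUnmatched(int[] positions, BitSet accumulator) {
        return Arrays.stream(positions).filter(accumulator::get).toArray();
    }

    public static void main(String[] args) {
        // A 10-row batch with two toy columns; the candidate positions start as 0..9.
        int[] colA = {5, 12, 7, 30, 2, 18, 25, 9, 14, 40};
        int[] colB = {1, 0, 1, 0, 1, 1, 0, 0, 1, 0};
        int[] positions = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};

        // Stage I: conjuncts, e.g. WHERE colA > 6 AND colA < 35.
        positions = applyConjunct(positions, p -> colA[p] > 6);   // -> [1, 2, 3, 5, 6, 7, 8, 9]
        positions = applyConjunct(positions, p -> colA[p] < 35);  // -> [1, 2, 3, 5, 6, 7, 8]

        // Stage II: disjuncts, e.g. ... AND (colB = 1 OR colA >= 25).
        BitSet accumulator = new BitSet();
        accumulateDisjunct(positions, p -> colB[p] == 1, accumulator);  // marks 2, 5, 8
        accumulateDisjunct(positions, p -> colA[p] >= 25, accumulator); // marks 3, 6
        positions = excludeUnmatched(positions, accumulator);           // -> [2, 3, 5, 6, 8]

        // Stage III would decode the filter-less columns at exactly these positions.
        System.out.println(Arrays.toString(positions)); // prints [2, 3, 5, 6, 8]
    }
}

Note how Stage I narrows the position list after every conjunct, while Stage II never narrows during the identify phase: it only marks the accumulator, and the exclusion happens once at the end, mirroring what updateExcludePositions does above.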
Use of io.prestosql.orc.reader.SelectiveColumnReader in project hetu-core by openlookeng.
The class OrcSelectiveRecordReader, method createColumnReaders:
public SelectiveColumnReader[] createColumnReaders(
        List<OrcColumn> fileColumns,
        AggregatedMemoryContext systemMemoryContext,
        OrcBlockFactory blockFactory,
        OrcCacheStore orcCacheStore,
        OrcCacheProperties orcCacheProperties,
        OrcPredicate predicate,
        Map<Integer, TupleDomainFilter> filters,
        DateTimeZone hiveStorageTimeZone,
        List<Integer> outputColumns,
        Map<Integer, Type> includedColumns,
        ColumnMetadata<OrcType> orcTypes,
        boolean useDataCache) throws OrcCorruptionException {
    int fieldCount = orcTypes.get(OrcColumnId.ROOT_COLUMN).getFieldCount();
    SelectiveColumnReader[] columnReaders = new SelectiveColumnReader[fieldCount];
    colReaderWithFilter = new IntArraySet();
    colReaderWithORFilter = new IntArraySet();
    colReaderWithoutFilter = new IntArraySet();
    IntArraySet remainingColumns = new IntArraySet();
    remainingColumns.addAll(includedColumns.keySet());
    for (int i = 0; i < fieldCount; i++) {
        // Create a column reader only for columns that are part of the projection or a filter.
        if (includedColumns.containsKey(i)) {
            int columnIndex = i;
            OrcColumn column = fileColumns.get(columnIndex);
            boolean outputRequired = outputColumns.contains(i);
            SelectiveColumnReader columnReader = null;
            if (useDataCache && orcCacheProperties.isRowDataCacheEnabled()) {
                ColumnReader cr = ColumnReaders.createColumnReader(includedColumns.get(i), column, systemMemoryContext, blockFactory.createNestedBlockFactory(block -> blockLoaded(columnIndex, block)));
                columnReader = SelectiveColumnReaders.wrapWithDataCachingStreamReader(cr, column, orcCacheStore.getRowDataCache());
            } else {
                columnReader = createColumnReader(orcTypes.get(column.getColumnId()), column, Optional.ofNullable(filters.get(i)), outputRequired ? Optional.of(includedColumns.get(i)) : Optional.empty(), hiveStorageTimeZone, systemMemoryContext);
                if (orcCacheProperties.isRowDataCacheEnabled()) {
                    columnReader = SelectiveColumnReaders.wrapWithResultCachingStreamReader(columnReader, column, predicate, orcCacheStore.getRowDataCache());
                }
            }
            columnReaders[columnIndex] = columnReader;
            // Classify the column by the kind of filter attached to it (see the sketch after this method).
            if (filters.get(i) != null) {
                colReaderWithFilter.add(columnIndex);
            } else if (disjuctFilters.get(i) != null && disjuctFilters.get(i).size() > 0) {
                colReaderWithORFilter.add(columnIndex);
            } else {
                colReaderWithoutFilter.add(columnIndex);
            }
            remainingColumns.remove(columnIndex);
        }
    }
    /* Classify any remaining synthetic columns (colIdx < 0), e.g. constant partition columns. */
    remainingColumns.removeAll(missingColumns);
    for (Integer col : remainingColumns) {
        if (col < 0) {
            /* should always be true */
            if (filters.get(col) != null) {
                colReaderWithFilter.add(col);
            } else if (disjuctFilters.get(col) != null && disjuctFilters.get(col).size() > 0) {
                colReaderWithORFilter.add(col);
            }
        }
    }
    // Specifically for the ALTER TABLE ... ADD COLUMN case, classify missing columns by their filters:
    for (int missingColumn : missingColumns) {
        if (filters.get(missingColumn) != null) {
            colReaderWithFilter.add(missingColumn);
        } else if (disjuctFilters.get(missingColumn) != null && disjuctFilters.get(missingColumn).size() > 0) {
            colReaderWithORFilter.add(missingColumn);
        }
    }
    return columnReaders;
}
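The heart of this method is the classification of every included column into one of the three reader sets that getNextPage iterates over. Below is a minimal sketch of that classification using simplified stand-in types (String filters instead of TupleDomainFilter); all names in the sketch are illustrative, not part of the hetu-core API.

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class FilterClassificationSketch {
    public static void main(String[] args) {
        // Stand-ins for the filters / disjuctFilters maps, keyed by column index.
        Map<Integer, String> filters = new HashMap<>();
        Map<Integer, List<String>> disjunctFilters = new HashMap<>();
        filters.put(0, "c0 > 10");                                     // AND'ed filter on column 0
        disjunctFilters.put(1, Arrays.asList("c1 = 'a'", "c1 = 'b'")); // OR'ed filters on column 1
        Set<Integer> includedColumns = new HashSet<>(Arrays.asList(0, 1, 2));

        // Mirrors colReaderWithFilter / colReaderWithORFilter / colReaderWithoutFilter.
        Set<Integer> withFilter = new HashSet<>();
        Set<Integer> withOrFilter = new HashSet<>();
        Set<Integer> withoutFilter = new HashSet<>();

        for (int column : includedColumns) {
            if (filters.get(column) != null) {
                withFilter.add(column);       // read first, in Stage I
            } else if (disjunctFilters.get(column) != null && !disjunctFilters.get(column).isEmpty()) {
                withOrFilter.add(column);     // read second, in Stage II
            } else {
                withoutFilter.add(column);    // read last, in Stage III
            }
        }
        // prints: AND=[0] OR=[1] none=[2]
        System.out.println("AND=" + withFilter + " OR=" + withOrFilter + " none=" + withoutFilter);
    }
}

The ordering of the three sets is the design point: conjunct columns are decoded first because every row they eliminate is a row the OR'ed and filter-less columns never have to decode.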