Search in sources :

Example 1 with SelectiveColumnReader

use of io.prestosql.orc.reader.SelectiveColumnReader in project hetu-core by openlookeng.

the class OrcSelectiveRecordReader method getNextPage.

/**
 * Performs the ORC read for the next batch, eliminating non-matching records before the data
 * blocks are formed.
 *
 * <p>The batch is processed in three stages:
 * <ol>
 *   <li>filter using conjuncts (AND'd operators): each filtered column narrows the positions
 *       handed to the next column;</li>
 *   <li>filter using disjuncts (OR'd operators): matches of every column are collected in a
 *       bit set and positions absent from it are excluded;</li>
 *   <li>read the fields that have no filter at the surviving positions.</li>
 * </ol>
 * Finally the block/page is composed from the matching records.
 *
 * @return {@code null} when {@code prepareNextBatch()} reports a negative batch size;
 *         {@code new Page(0)} when no position of the batch survives the filters; otherwise a
 *         page holding one block per output column for the surviving positions
 * @throws IOException declared by this method; presumably raised by the underlying column reads
 */
public Page getNextPage() throws IOException {
    int batchSize = prepareNextBatch();
    if (batchSize < 0) {
        return null;
    }
    // constantFilterIsFalse being true means no record will qualify the filter in the current split;
    // with no OR-filter columns either, the batch is consumed and an empty page is returned.
    if (constantFilterIsFalse && colReaderWithORFilter.isEmpty()) {
        batchRead(batchSize);
        return new Page(0);
    }
    matchingRowsInBatchArray = null;
    int[] positionsToRead = initializePositions(batchSize);
    int positionCount = positionsToRead.length;
    /* Stage I: evaluate columns with AND filter conditions first. */
    SelectiveColumnReader[] columnReaders = getColumnReaders();
    if (positionCount != 0) {
        for (Integer columnIdx : colReaderWithFilter) {
            if (columnIdx < 0) {
                // Negative index: the column has a constant value, tested once for the whole batch.
                if (!matchConstantWithPredicate(includedColumns.get(columnIdx), constantValues.get(columnIdx), filters.get(columnIdx))) {
                    positionCount = 0;
                    break;
                }
            } else if (missingColumns.contains(columnIdx)) {
                // Column is listed in missingColumns: only the filter's null test is applied.
                if (!filters.get(columnIdx).testNull()) {
                    positionCount = 0;
                    break;
                }
            } else if (columnReaders[columnIdx] != null) {
                positionCount = columnReaders[columnIdx].read(getNextRowInGroup(), positionsToRead, positionCount, filters.get(columnIdx));
                if (positionCount == 0) {
                    break;
                }
                // Output positions of the current column are the input positions of the next column.
                positionsToRead = columnReaders[columnIdx].getReadPositions();
            }
        }
    }
    /* Stage II: OR filtering, applied in two steps; identify matches, then exclude.
     *    - Identify: apply each OR filter to the row positions surviving Stage I and accumulate
     *                the matching positions in the accumulator.
     *    - Exclude:  drop every position that did not match in any pass; i.e. the accumulator
     *                set becomes the final set of matching row positions.
     */
    BitSet accumulator = new BitSet();
    if (!colReaderWithORFilter.isEmpty() && positionCount > 0) {
        for (Integer columnIdx : colReaderWithORFilter) {
            if (columnIdx < 0) {
                if (matchConstantWithPredicate(includedColumns.get(columnIdx), constantValues.get(columnIdx), disjuctFilters.get(columnIdx).get(0))) {
                    /* Constant matches: mark the whole range from first to last surviving position. */
                    accumulator.set(positionsToRead[0], positionsToRead[positionCount - 1] + 1);
                }
            } else if (missingColumns.contains(columnIdx)) {
                if (disjuctFilters.get(columnIdx).get(0).testNull()) {
                    /* Null test passes for the missing column: mark the whole range as well. */
                    accumulator.set(positionsToRead[0], positionsToRead[positionCount - 1] + 1);
                }
            } else if (columnReaders[columnIdx] != null) {
                // The count returned by readOr is not needed; the accumulator is what is consumed below.
                columnReaders[columnIdx].readOr(getNextRowInGroup(), positionsToRead, positionCount, disjuctFilters.get(columnIdx), accumulator);
            }
        }
        int[] newPositions = positionsToRead.clone();
        positionCount = updateExcludePositions(positionsToRead, positionCount, accumulator, newPositions);
        positionsToRead = Arrays.copyOf(newPositions, positionCount);
    }
    /* Stage III: read the columns that carry no filter (null filter argument). */
    if (positionCount != 0) {
        for (Integer columnIdx : colReaderWithoutFilter) {
            if (columnReaders[columnIdx] != null) {
                positionCount = columnReaders[columnIdx].read(getNextRowInGroup(), positionsToRead, positionCount, null);
                if (positionCount == 0) {
                    break;
                }
                // Output positions of the current column are the input positions of the next column.
                positionsToRead = columnReaders[columnIdx].getReadPositions();
            }
        }
    }
    batchRead(batchSize);
    if (positionCount == 0) {
        return new Page(0);
    }
    // Build one block per output column. The positions left after the last column read are the
    // final positions; only those are used to fetch the data of every column.
    Block[] blocks = new Block[outputColumns.size()];
    for (int i = 0; i < outputColumns.size(); i++) {
        int columnIndex = outputColumns.get(i);
        if (columnIndex < 0 || missingColumns.contains(columnIndex)) {
            // Constant column (e.g. to fill a partition key): a run-length block of the constant,
            // with NULL_MARKER translated to null.
            blocks[i] = RunLengthEncodedBlock.create(includedColumns.get(columnIndex), constantValues.get(columnIndex) == NULL_MARKER ? null : constantValues.get(columnIndex), positionCount);
        } else {
            Block block = getColumnReaders()[columnIndex].getBlock(positionsToRead, positionCount);
            updateMaxCombinedBytesPerRow(columnIndex, block);
            // Coercers are keyed by output position, not by file column index.
            if (coercers.containsKey(i)) {
                block = coercers.get(i).apply(block);
            }
            blocks[i] = block;
        }
    }
    Page page = new Page(positionCount, blocks);
    validateWritePageChecksum(page);
    return page;
}
Also used : SelectiveColumnReader(io.prestosql.orc.reader.SelectiveColumnReader) BitSet(java.util.BitSet) RunLengthEncodedBlock(io.prestosql.spi.block.RunLengthEncodedBlock) Block(io.prestosql.spi.block.Block) Page(io.prestosql.spi.Page)

Example 2 with SelectiveColumnReader

use of io.prestosql.orc.reader.SelectiveColumnReader in project hetu-core by openlookeng.

the class OrcSelectiveRecordReader method createColumnReaders.

/**
 * Creates a selective reader for every included file column and sorts the columns into the
 * AND-filter, OR-filter and no-filter groups ({@code colReaderWithFilter},
 * {@code colReaderWithORFilter}, {@code colReaderWithoutFilter}), which are re-initialised here.
 *
 * <p>Included columns that get no reader (negative indices and the entries of
 * {@code missingColumns}) are still added to the AND or OR group when a filter exists for them.
 *
 * @param fileColumns columns of the file, indexed by field position
 * @param filters AND filters keyed by column index; a column present here goes to the AND group
 * @param outputColumns column indices that must be produced; others are read for filtering only
 * @param includedColumns type of every column taking part in the projection or a filter
 * @param orcTypes ORC type metadata; the field count of the root column sizes the result
 * @param useDataCache together with the row-data-cache property, selects the data-caching reader
 * @return an array with one slot per root field; slots of non-included fields stay {@code null}
 * @throws OrcCorruptionException declared by this method; presumably raised during reader creation
 */
public SelectiveColumnReader[] createColumnReaders(List<OrcColumn> fileColumns, AggregatedMemoryContext systemMemoryContext, OrcBlockFactory blockFactory, OrcCacheStore orcCacheStore, OrcCacheProperties orcCacheProperties, OrcPredicate predicate, Map<Integer, TupleDomainFilter> filters, DateTimeZone hiveStorageTimeZone, List<Integer> outputColumns, Map<Integer, Type> includedColumns, ColumnMetadata<OrcType> orcTypes, boolean useDataCache) throws OrcCorruptionException {
    int fieldCount = orcTypes.get(OrcColumnId.ROOT_COLUMN).getFieldCount();
    SelectiveColumnReader[] readers = new SelectiveColumnReader[fieldCount];
    colReaderWithFilter = new IntArraySet();
    colReaderWithORFilter = new IntArraySet();
    colReaderWithoutFilter = new IntArraySet();
    // Tracks the included columns that have not been classified by the field loop below.
    IntArraySet unclassified = new IntArraySet();
    unclassified.addAll(includedColumns.keySet());
    for (int field = 0; field < fieldCount; field++) {
        // Readers are created only for columns which are part of the projection or a filter.
        if (!includedColumns.containsKey(field)) {
            continue;
        }
        // Effectively-final copy so the block-loaded callback can capture it.
        int columnIndex = field;
        OrcColumn column = fileColumns.get(columnIndex);
        boolean outputRequired = outputColumns.contains(columnIndex);
        SelectiveColumnReader reader;
        if (!useDataCache || !orcCacheProperties.isRowDataCacheEnabled()) {
            // Regular selective reader; the output type is passed only when the column is projected.
            reader = createColumnReader(
                    orcTypes.get(column.getColumnId()),
                    column,
                    Optional.ofNullable(filters.get(columnIndex)),
                    outputRequired ? Optional.of(includedColumns.get(columnIndex)) : Optional.empty(),
                    hiveStorageTimeZone,
                    systemMemoryContext);
            if (orcCacheProperties.isRowDataCacheEnabled()) {
                reader = SelectiveColumnReaders.wrapWithResultCachingStreamReader(reader, column, predicate, orcCacheStore.getRowDataCache());
            }
        } else {
            // Data cache requested and enabled: wrap a plain column reader with the data-caching reader.
            ColumnReader plainReader = ColumnReaders.createColumnReader(
                    includedColumns.get(columnIndex),
                    column,
                    systemMemoryContext,
                    blockFactory.createNestedBlockFactory(block -> blockLoaded(columnIndex, block)));
            reader = SelectiveColumnReaders.wrapWithDataCachingStreamReader(plainReader, column, orcCacheStore.getRowDataCache());
        }
        readers[columnIndex] = reader;
        // An AND filter takes precedence over OR filters; columns with neither are plain reads.
        if (filters.get(columnIndex) != null) {
            colReaderWithFilter.add(columnIndex);
        } else if (disjuctFilters.get(columnIndex) != null && !disjuctFilters.get(columnIndex).isEmpty()) {
            colReaderWithORFilter.add(columnIndex);
        } else {
            colReaderWithoutFilter.add(columnIndex);
        }
        unclassified.remove(columnIndex);
    }
    /* Anything still unclassified, missing columns aside, is expected to have a negative index. */
    unclassified.removeAll(missingColumns);
    for (int leftover : unclassified) {
        if (leftover >= 0) {
            continue;
        }
        if (filters.get(leftover) != null) {
            colReaderWithFilter.add(leftover);
        } else if (disjuctFilters.get(leftover) != null && !disjuctFilters.get(leftover).isEmpty()) {
            colReaderWithORFilter.add(leftover);
        }
    }
    // Missing columns (the "alter table add column" case) have no reader, yet their filters still apply.
    for (int absentColumn : missingColumns) {
        if (filters.get(absentColumn) != null) {
            colReaderWithFilter.add(absentColumn);
        } else if (disjuctFilters.get(absentColumn) != null && !disjuctFilters.get(absentColumn).isEmpty()) {
            colReaderWithORFilter.add(absentColumn);
        }
    }
    return readers;
}
Also used : IntStream(java.util.stream.IntStream) StripeStatistics(io.prestosql.orc.metadata.statistics.StripeStatistics) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) RunLengthEncodedBlock(io.prestosql.spi.block.RunLengthEncodedBlock) TypeNotFoundException(io.prestosql.spi.type.TypeNotFoundException) PeekingIterator(com.google.common.collect.PeekingIterator) Function(java.util.function.Function) PostScript(io.prestosql.orc.metadata.PostScript) ArrayList(java.util.ArrayList) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) AggregatedMemoryContext(io.prestosql.memory.context.AggregatedMemoryContext) Type(io.prestosql.spi.type.Type) Math.toIntExact(java.lang.Math.toIntExact) SelectiveColumnReaders(io.prestosql.orc.reader.SelectiveColumnReaders) Block(io.prestosql.spi.block.Block) ColumnReaders(io.prestosql.orc.reader.ColumnReaders) SelectiveColumnReader(io.prestosql.orc.reader.SelectiveColumnReader) OrcType(io.prestosql.orc.metadata.OrcType) IntArraySet(it.unimi.dsi.fastutil.ints.IntArraySet) Set(java.util.Set) Page(io.prestosql.spi.Page) IOException(java.io.IOException) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) ColumnReader(io.prestosql.orc.reader.ColumnReader) MetadataReader(io.prestosql.orc.metadata.MetadataReader) StripeInformation(io.prestosql.orc.metadata.StripeInformation) DataSize(io.airlift.units.DataSize) List(java.util.List) SelectiveColumnReaders.createColumnReader(io.prestosql.orc.reader.SelectiveColumnReaders.createColumnReader) Domain(io.prestosql.spi.predicate.Domain) ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) Optional(java.util.Optional) BitSet(java.util.BitSet) IndexMetadata(io.prestosql.spi.heuristicindex.IndexMetadata) 
SelectiveColumnReader(io.prestosql.orc.reader.SelectiveColumnReader) IntArraySet(it.unimi.dsi.fastutil.ints.IntArraySet) SelectiveColumnReader(io.prestosql.orc.reader.SelectiveColumnReader) ColumnReader(io.prestosql.orc.reader.ColumnReader) SelectiveColumnReaders.createColumnReader(io.prestosql.orc.reader.SelectiveColumnReaders.createColumnReader)

Aggregations

SelectiveColumnReader (io.prestosql.orc.reader.SelectiveColumnReader)2 Page (io.prestosql.spi.Page)2 Block (io.prestosql.spi.block.Block)2 RunLengthEncodedBlock (io.prestosql.spi.block.RunLengthEncodedBlock)2 BitSet (java.util.BitSet)2 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)1 PeekingIterator (com.google.common.collect.PeekingIterator)1 Logger (io.airlift.log.Logger)1 Slice (io.airlift.slice.Slice)1 DataSize (io.airlift.units.DataSize)1 AggregatedMemoryContext (io.prestosql.memory.context.AggregatedMemoryContext)1 ColumnMetadata (io.prestosql.orc.metadata.ColumnMetadata)1 MetadataReader (io.prestosql.orc.metadata.MetadataReader)1 OrcColumnId (io.prestosql.orc.metadata.OrcColumnId)1 OrcType (io.prestosql.orc.metadata.OrcType)1 PostScript (io.prestosql.orc.metadata.PostScript)1 StripeInformation (io.prestosql.orc.metadata.StripeInformation)1 ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics)1 StripeStatistics (io.prestosql.orc.metadata.statistics.StripeStatistics)1 ColumnReader (io.prestosql.orc.reader.ColumnReader)1