
Example 1 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

From the class FileSystemPartitionDescriptor, method createTableScan.

@Override
public TableScan createTableScan(List<PartitionLocation> newPartitionLocation, String cacheFileRoot, boolean wasAllPartitionsPruned, MetadataContext metaContext) throws Exception {
    List<String> newFiles = Lists.newArrayList();
    // Flatten each partition into concrete file/directory paths; composite
    // partitions are expanded into their leaf locations.
    for (final PartitionLocation location : newPartitionLocation) {
        if (!location.isCompositePartition()) {
            newFiles.add(location.getEntirePartitionLocation());
        } else {
            final Collection<SimplePartitionLocation> subPartitions = location.getPartitionLocationRecursive();
            for (final PartitionLocation subPart : subPartitions) {
                newFiles.add(subPart.getEntirePartitionLocation());
            }
        }
    }
    // Rebuild the scan with a FileSelection restricted to the surviving files.
    if (scanRel instanceof DrillScanRel) {
        final FormatSelection formatSelection = (FormatSelection) table.getSelection();
        final FileSelection newFileSelection = new FileSelection(null, newFiles, getBaseTableLocation(), cacheFileRoot, wasAllPartitionsPruned, formatSelection.getSelection().getDirStatus());
        newFileSelection.setMetaContext(metaContext);
        final FileGroupScan newGroupScan = ((FileGroupScan) ((DrillScanRel) scanRel).getGroupScan()).clone(newFileSelection);
        return new DrillScanRel(scanRel.getCluster(), scanRel.getTraitSet().plus(DrillRel.DRILL_LOGICAL), scanRel.getTable(), newGroupScan, scanRel.getRowType(), ((DrillScanRel) scanRel).getColumns(), true);
    } else if (scanRel instanceof EnumerableTableScan) {
        return createNewTableScanFromSelection((EnumerableTableScan) scanRel, newFiles, cacheFileRoot, wasAllPartitionsPruned, metaContext);
    } else {
        throw new UnsupportedOperationException("Only DrillScanRel and EnumerableTableScan is allowed!");
    }
}
Also used: FileSelection (org.apache.drill.exec.store.dfs.FileSelection), DrillScanRel (org.apache.drill.exec.planner.logical.DrillScanRel), FileGroupScan (org.apache.drill.exec.physical.base.FileGroupScan), EnumerableTableScan (org.apache.calcite.adapter.enumerable.EnumerableTableScan), DirPrunedEnumerableTableScan (org.apache.drill.exec.planner.logical.DirPrunedEnumerableTableScan), FormatSelection (org.apache.drill.exec.store.dfs.FormatSelection)
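
The heart of this method is the loop that flattens composite partitions into a flat file list before the selection is rebuilt. Below is a minimal, self-contained sketch of just that step; the PartitionLocation interface here is a hypothetical local stand-in for Drill's partition-location hierarchy, declared inline so the sketch compiles on its own.

import java.util.ArrayList;
import java.util.List;

// Hypothetical stand-in for Drill's PartitionLocation hierarchy.
interface PartitionLocation {
    boolean isCompositePartition();
    String getEntirePartitionLocation();
    // Leaf locations of a composite partition.
    List<PartitionLocation> getPartitionLocationRecursive();
}

public class FlattenPartitions {
    // Expand every partition into concrete file/directory paths; composite
    // partitions contribute all of their leaf locations.
    static List<String> flatten(List<PartitionLocation> partitions) {
        List<String> newFiles = new ArrayList<>();
        for (PartitionLocation location : partitions) {
            if (!location.isCompositePartition()) {
                newFiles.add(location.getEntirePartitionLocation());
            } else {
                for (PartitionLocation leaf : location.getPartitionLocationRecursive()) {
                    newFiles.add(leaf.getEntirePartitionLocation());
                }
            }
        }
        return newFiles;
    }
}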

Example 2 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

From the class ParquetPartitionDescriptor, method createNewGroupScan.

private GroupScan createNewGroupScan(List<String> newFiles, String cacheFileRoot, boolean wasAllPartitionsPruned, MetadataContext metaContext) throws IOException {
    // Build a selection limited to the pruned file list, then clone the existing
    // group scan so it reads only that selection.
    final FileSelection newSelection = FileSelection.create(null, newFiles, getBaseTableLocation(), cacheFileRoot, wasAllPartitionsPruned);
    newSelection.setMetaContext(metaContext);
    final FileGroupScan newScan = ((FileGroupScan) scanRel.getGroupScan()).clone(newSelection);
    return newScan;
}
Also used: FileSelection (org.apache.drill.exec.store.dfs.FileSelection), FileGroupScan (org.apache.drill.exec.physical.base.FileGroupScan)
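
Example 2 is the same clone-with-new-selection pattern as Example 1, minus the logical-plan rewrite. A schematic reduction follows; FileSelection and FileGroupScan are declared locally as stand-ins so the sketch stands alone, mirroring the clone(FileSelection) call used in the Drill code above.

import java.io.IOException;
import java.util.List;

// Minimal stand-ins for Drill's FileSelection and FileGroupScan.
class FileSelection {
    final List<String> files;
    FileSelection(List<String> files) { this.files = files; }
}

interface FileGroupScan {
    // Returns a copy of this scan that reads only the given selection.
    FileGroupScan clone(FileSelection selection) throws IOException;
}

public class PruneGroupScan {
    // Rebuild the group scan so it reads only the files that survived pruning.
    static FileGroupScan prune(FileGroupScan scan, List<String> survivingFiles) throws IOException {
        return scan.clone(new FileSelection(survivingFiles));
    }
}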

Example 3 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

From the class ParquetGroupScan, method applyFilter.

public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
    if (fileSet.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
        // Stop pruning in 3 cases:
        //    -  a single parquet file,
        //    -  the metadata does not have the proper format to support row group level filter pruning,
        //    -  # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
        return null;
    }
    }
    final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
    final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
    // A HashSet keeps each file name unique.
    Set<String> qualifiedFileNames = Sets.newHashSet();
    ParquetFilterPredicate filterPredicate = null;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
        Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
            Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
            if (filterPredicate == null) {
                ErrorCollector errorCollector = new ErrorCollectorImpl();
                LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
                if (errorCollector.hasErrors()) {
                    logger.error("{} error(s) encountered when materializing filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
                    return null;
                }
                //    logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
                Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
                filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
                if (filterPredicate == null) {
                    return null;
                }
            }
            if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
                continue;
            }
            qualifiedRGs.add(rowGroup);
            // TODO : optimize when 1 file contains m row groups.
            qualifiedFileNames.add(file.getPath());
        }
    }
    if (qualifiedFileNames.size() == fileSet.size()) {
        // There is no reduction of rowGroups. Return the original groupScan.
        logger.debug("applyFilter does not have any pruning!");
        return null;
    } else if (qualifiedFileNames.size() == 0) {
        logger.warn("All rowgroups have been filtered out. Add back one to get schema from scannner");
        qualifiedFileNames.add(fileSet.iterator().next());
    }
    try {
        FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
        logger.info("applyFilter {} reduce parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
        return this.clone(newSelection);
    } catch (IOException e) {
        logger.warn("Could not apply filter prune due to Exception : {}", e);
        return null;
    }
}
Also used: ImplicitColumnExplorer (org.apache.drill.exec.store.ImplicitColumnExplorer), ColumnStatistics (org.apache.drill.exec.store.parquet.stat.ColumnStatistics), FileSelection (org.apache.drill.exec.store.dfs.FileSelection), ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata), ArrayList (java.util.ArrayList), ErrorCollector (org.apache.drill.common.expression.ErrorCollector), IOException (java.io.IOException), RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata), ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl), LogicalExpression (org.apache.drill.common.expression.LogicalExpression), SchemaPath (org.apache.drill.common.expression.SchemaPath), ParquetFilterPredicate (org.apache.drill.exec.expr.stat.ParquetFilterPredicate), ParquetMetaStatCollector (org.apache.drill.exec.store.parquet.stat.ParquetMetaStatCollector)
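
Once Drill's metadata plumbing is stripped away, the pruning loop reduces to: build the filter predicate once, test each row group's statistics against it, and keep a file when any of its row groups might match. Here is a sketch under those assumptions; Stats and FilterPredicate are hypothetical stand-ins for Drill's column-statistics map and ParquetFilterPredicate, not its real classes.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class RowGroupPruning {
    // Hypothetical per-row-group statistics (min/max per column, row count, ...).
    static class Stats { long rowCount; }
    // Hypothetical predicate that can prove "no row in this row group matches".
    interface FilterPredicate { boolean canDrop(Stats stats); }
    static class RowGroup { String fileName; Stats stats; }

    // Keep a file if at least one of its row groups might contain matching rows.
    static Set<String> prune(List<RowGroup> rowGroups, FilterPredicate predicate) {
        Set<String> qualifiedFiles = new HashSet<>();  // a HashSet keeps file names unique
        for (RowGroup rg : rowGroups) {
            if (predicate.canDrop(rg.stats)) {
                continue;  // statistics prove this row group cannot match
            }
            qualifiedFiles.add(rg.fileName);
        }
        if (qualifiedFiles.isEmpty() && !rowGroups.isEmpty()) {
            // Mirror the Drill fallback: keep one file so the scanner can still
            // report a schema even though every row group was filtered out.
            qualifiedFiles.add(rowGroups.get(0).fileName);
        }
        return qualifiedFiles;
    }
}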

Example 4 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

From the class ParquetGroupScan, method expandIfNecessary.

/**
   * Expands the selection's folders if a metadata cache is found for the selection root.<br>
   * If the selection has already been expanded or no metadata cache was found, does nothing.
   *
   * @param selection actual selection before expansion
   * @return the new selection after expansion, or the input selection if no expansion was done
   *
   * @throws IOException
   */
private FileSelection expandIfNecessary(FileSelection selection) throws IOException {
    if (selection.isExpandedFully()) {
        return selection;
    }
    // use the cacheFileRoot if provided (e.g after partition pruning)
    Path metaFilePath = new Path(cacheFileRoot != null ? cacheFileRoot : selectionRoot, Metadata.METADATA_FILENAME);
    if (!fs.exists(metaFilePath)) {
        // no metadata cache
        return selection;
    }
    FileSelection expandedSelection = initFromMetadataCache(selection, metaFilePath);
    return expandedSelection;
}
Also used: Path (org.apache.hadoop.fs.Path), SchemaPath (org.apache.drill.common.expression.SchemaPath), ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath), FileSelection (org.apache.drill.exec.store.dfs.FileSelection)
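
The cache probe itself needs nothing beyond the stock Hadoop FileSystem API. The runnable fragment below isolates that check; the METADATA_FILENAME constant and the paths in main are illustrative assumptions, since Drill reads the real file name from Metadata.METADATA_FILENAME.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MetadataCacheCheck {
    // Illustrative constant; Drill's actual value comes from Metadata.METADATA_FILENAME.
    private static final String METADATA_FILENAME = ".drill.parquet_metadata";

    // Prefer the cacheFileRoot (set after partition pruning) over the selection root.
    static boolean hasMetadataCache(FileSystem fs, String cacheFileRoot, String selectionRoot)
            throws IOException {
        Path metaFilePath = new Path(cacheFileRoot != null ? cacheFileRoot : selectionRoot,
                METADATA_FILENAME);
        return fs.exists(metaFilePath);
    }

    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // Hypothetical table root, for illustration only.
        System.out.println(hasMetadataCache(fs, null, "/data/parquet/table"));
    }
}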

Example 5 with FileSelection

Use of org.apache.drill.exec.store.dfs.FileSelection in the Apache Drill project.

From the class ParquetGroupScan, method applyLimit.

@Override
public GroupScan applyLimit(long maxRecords) {
    Preconditions.checkArgument(rowGroupInfos.size() >= 0);
    // Make sure at least 1 row is requested -> at least 1 rowGroup.
    maxRecords = Math.max(maxRecords, 1);
    // Further optimization: minimize the # of files chosen, or the affinity of the files chosen.
    // Calculate the number of rowGroups to read based on maxRecords and update the
    // number of records to read for each of those rowGroups.
    int index = updateRowGroupInfo(maxRecords);
    // A HashSet keeps each file name unique.
    Set<String> fileNames = Sets.newHashSet();
    for (RowGroupInfo rowGroupInfo : rowGroupInfos.subList(0, index)) {
        fileNames.add(rowGroupInfo.getPath());
    }
    // If there is no change in fileSet, no need to create new groupScan.
    if (fileNames.size() == fileSet.size()) {
        // There is no reduction of rowGroups. Return the original groupScan.
        logger.debug("applyLimit() does not apply!");
        return null;
    }
    try {
        FileSelection newSelection = new FileSelection(null, Lists.newArrayList(fileNames), getSelectionRoot(), cacheFileRoot, false);
        logger.debug("applyLimit() reduce parquet file # from {} to {}", fileSet.size(), fileNames.size());
        return this.clone(newSelection, maxRecords);
    } catch (IOException e) {
        logger.warn("Could not apply rowcount based prune due to Exception : {}", e);
        return null;
    }
}
Also used: FileSelection (org.apache.drill.exec.store.dfs.FileSelection), IOException (java.io.IOException), DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint)
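
updateRowGroupInfo (not shown above) is where the limit actually lands: it selects the shortest prefix of row groups whose cumulative row count covers maxRecords and caps the record count of the last one. The self-contained sketch below shows how such a selection step could look; the RowGroupInfo fields here are assumptions for illustration, not Drill's actual class.

import java.util.List;

public class LimitPushdown {
    static class RowGroupInfo {
        long rowCount;          // total rows in this row group (assumed field)
        long numRecordsToRead;  // rows the scanner should actually emit (assumed field)
    }

    // Returns how many leading row groups are needed to produce maxRecords rows,
    // capping the record count of each selected row group along the way.
    static int updateRowGroupInfo(List<RowGroupInfo> rowGroups, long maxRecords) {
        long remaining = Math.max(maxRecords, 1);  // always read at least one row
        int index = 0;
        for (RowGroupInfo rg : rowGroups) {
            index++;
            rg.numRecordsToRead = Math.min(rg.rowCount, remaining);
            remaining -= rg.numRecordsToRead;
            if (remaining <= 0) {
                break;
            }
        }
        return index;  // rowGroups.subList(0, index) is the pruned scan set
    }
}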

Aggregations

FileSelection (org.apache.drill.exec.store.dfs.FileSelection): 8 uses
SchemaPath (org.apache.drill.common.expression.SchemaPath): 3 uses
FormatSelection (org.apache.drill.exec.store.dfs.FormatSelection): 3 uses
IOException (java.io.IOException): 2 uses
EnumerableTableScan (org.apache.calcite.adapter.enumerable.EnumerableTableScan): 2 uses
FileGroupScan (org.apache.drill.exec.physical.base.FileGroupScan): 2 uses
DirPrunedEnumerableTableScan (org.apache.drill.exec.planner.logical.DirPrunedEnumerableTableScan): 2 uses
DrillScanRel (org.apache.drill.exec.planner.logical.DrillScanRel): 2 uses
ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath): 2 uses
ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata): 2 uses
RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata): 2 uses
Path (org.apache.hadoop.fs.Path): 2 uses
ArrayList (java.util.ArrayList): 1 use
Collection (java.util.Collection): 1 use
RelOptTableImpl (org.apache.calcite.prepare.RelOptTableImpl): 1 use
ErrorCollector (org.apache.drill.common.expression.ErrorCollector): 1 use
ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl): 1 use
LogicalExpression (org.apache.drill.common.expression.LogicalExpression): 1 use
ParquetFilterPredicate (org.apache.drill.exec.expr.stat.ParquetFilterPredicate): 1 use
DrillTranslatableTable (org.apache.drill.exec.planner.logical.DrillTranslatableTable): 1 use