Search in sources :

Example 1 with ColumnStatistics

use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by apache.

the class RangeExprEvaluator method visitUnknown.

/**
 * Resolves statistics for an expression that has no dedicated visitor method.
 *
 * <p>Only {@code TypedFieldExpr} instances are handled. The field's path is looked up in
 * {@code columnStatMap}; when an entry is present its statistics are returned. When no entry
 * is present, the field's major type must equal {@code Types.OPTIONAL_INT} and an
 * {@code IntStatistics} whose null count is set to {@code rowCount} is returned.
 *
 * @param e     the expression being visited
 * @param value unused
 * @return the statistics for the field, or {@code null} when {@code e} is not a
 *         {@code TypedFieldExpr}
 * @throws IllegalArgumentException (via {@code Preconditions.checkArgument}) when the field has
 *         no entry in {@code columnStatMap} and its major type is not {@code Types.OPTIONAL_INT}
 */
@Override
public Statistics visitUnknown(LogicalExpression e, Void value) throws RuntimeException {
    // Anything other than a typed field reference yields no statistics.
    if (!(e instanceof TypedFieldExpr)) {
        return null;
    }

    final TypedFieldExpr typedField = (TypedFieldExpr) e;
    final ColumnStatistics knownStats = columnStatMap.get(typedField.getPath());
    if (knownStats != null) {
        return knownStats.getStatistics();
    }

    // No statistics recorded for this path: the field is treated as missing.
    Preconditions.checkArgument(typedField.getMajorType().equals(Types.OPTIONAL_INT));

    // Report every row as null for the missing field.
    final IntStatistics allNullStats = new IntStatistics();
    allNullStats.setNumNulls(rowCount);
    return allNullStats;
}
Also used : ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics)

Example 2 with ColumnStatistics

use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by apache.

the class ParquetGroupScan method applyFilter.

/**
 * Tries to reduce the set of parquet files to scan by evaluating {@code filterExpr} against
 * the per-row-group column statistics held in {@code parquetTableMetadata}.
 *
 * <p>A file is kept as soon as one of its row groups cannot be dropped by the filter. If every
 * row group of every file is dropped, one file from {@code fileSet} is added back so that a
 * schema can still be obtained from the scanner.
 *
 * @param filterExpr                     the filter condition to evaluate
 * @param udfUtilities                   passed to the parquet filter predicate builder
 * @param functionImplementationRegistry registry used to materialize the filter expression
 * @param optionManager                  source of the planning threshold option and of the
 *                                       options for implicit columns
 * @return a clone of this group scan restricted to the qualifying files, or {@code null} when
 *         no pruning is applied: a guard condition holds, the filter fails to materialize, no
 *         predicate can be built, no file is eliminated, or cloning throws an
 *         {@code IOException}
 */
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
    // No pruning is attempted when:
    //    -  exactly one file is selected,
    //    -  the table metadata reports that row groups are not prunable,
    //    -  # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
    if (fileSet.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
        return null;
    }
    final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
    // HashSet keeps a fileName unique.
    Set<String> qualifiedFileNames = Sets.newHashSet();
    // Built lazily from the first row group's statistics, then reused for all row groups.
    ParquetFilterPredicate filterPredicate = null;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
        Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
            Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
            if (filterPredicate == null) {
                ErrorCollector errorCollector = new ErrorCollectorImpl();
                LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
                if (errorCollector.hasErrors()) {
                    logger.error("{} error(s) encountered when materialize filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
                    return null;
                }
                Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
                filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
                if (filterPredicate == null) {
                    return null;
                }
            }
            if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
                continue;
            }
            // TODO : optimize when 1 file contains m row groups.
            qualifiedFileNames.add(file.getPath());
        }
    }
    if (qualifiedFileNames.size() == fileSet.size()) {
        // No file was eliminated. Returning null tells the caller to keep the original groupScan.
        logger.debug("applyFilter does not have any pruning!");
        return null;
    } else if (qualifiedFileNames.isEmpty()) {
        logger.warn("All rowgroups have been filtered out. Add back one to get schema from scannner");
        qualifiedFileNames.add(fileSet.iterator().next());
    }
    try {
        FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
        logger.info("applyFilter {} reduce parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
        return this.clone(newSelection);
    } catch (IOException e) {
        // Pass the message for the placeholder and the exception last so the stack trace is logged too.
        logger.warn("Could not apply filter prune due to Exception : {}", e.getMessage(), e);
        return null;
    }
}
Also used : ImplicitColumnExplorer(org.apache.drill.exec.store.ImplicitColumnExplorer) ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) ArrayList(java.util.ArrayList) ErrorCollector(org.apache.drill.common.expression.ErrorCollector) IOException(java.io.IOException) RowGroupMetadata(org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata) ErrorCollectorImpl(org.apache.drill.common.expression.ErrorCollectorImpl) LogicalExpression(org.apache.drill.common.expression.LogicalExpression) SchemaPath(org.apache.drill.common.expression.SchemaPath) ParquetFilterPredicate(org.apache.drill.exec.expr.stat.ParquetFilterPredicate) ParquetMetaStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetMetaStatCollector)

Example 3 with ColumnStatistics

use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by apache.

the class ParquetRGFilterEvaluator method evalFilter.

/**
 * Evaluates a filter expression against the footer statistics of a single row group.
 *
 * <p>Collects the columns referenced by {@code expr}, gathers their statistics from the footer
 * through a {@code ParquetFooterStatCollector}, and delegates the decision to {@code canDrop}.
 *
 * @param expr              the filter expression
 * @param footer            the parquet footer holding the row group metadata
 * @param rowGroupIndex     index of the row group within {@code footer.getBlocks()}
 * @param options           options handed to the statistics collector
 * @param fragmentContext   supplies the function registry and is itself passed to {@code canDrop}
 * @param implicitColValues implicit column values handed to the statistics collector
 * @return the value returned by {@code canDrop} for this row group
 */
public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
    // Columns the expression refers to.
    final Set<SchemaPath> referencedPaths = expr.accept(new FieldReferenceFinder(), null);

    // Statistics for those columns, read from the footer of the requested row group.
    final ColumnStatCollector footerCollector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
    final Map<SchemaPath, ColumnStatistics> statsByPath = footerCollector.collectColStat(referencedPaths);

    return canDrop(
        expr,
        statsByPath,
        footer.getBlocks().get(rowGroupIndex).getRowCount(),
        fragmentContext,
        fragmentContext.getFunctionRegistry());
}
Also used : ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnStatCollector(org.apache.drill.exec.store.parquet.stat.ColumnStatCollector) ParquetFooterStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector)

Example 4 with ColumnStatistics

use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by axbaretto.

the class ParquetGroupScan method applyFilter.

/**
 * Tries to reduce the set of row groups to scan by evaluating {@code filterExpr} against the
 * column statistics of each entry in {@code rowGroupInfos}.
 *
 * <p>Row groups that the filter can drop are excluded. If every row group is dropped, the
 * first one is added back so that a schema can still be obtained from the scanner.
 *
 * @param filterExpr                     the filter condition to evaluate
 * @param udfUtilities                   passed to the parquet filter predicate builder
 * @param functionImplementationRegistry registry used to materialize the filter expression
 * @param optionManager                  source of the planning threshold option and of the
 *                                       options for implicit columns
 * @return a clone of this group scan restricted to the qualifying row groups, or {@code null}
 *         when no pruning is applied: a guard condition holds, the filter fails to
 *         materialize, no predicate can be built, no row group is eliminated, or cloning
 *         throws an {@code IOException}
 */
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
    // No pruning is attempted when:
    // -  there is exactly one row group,
    // -  the table metadata reports that row groups are not prunable,
    // -  # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
    if (rowGroupInfos.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
        return null;
    }
    final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
    final List<RowGroupInfo> qualifiedRGs = new ArrayList<>(rowGroupInfos.size());
    // HashSet keeps a fileName unique.
    Set<String> qualifiedFileNames = Sets.newHashSet();
    // Built lazily from the first row group's statistics, then reused for all row groups.
    ParquetFilterPredicate filterPredicate = null;
    for (RowGroupInfo rowGroup : rowGroupInfos) {
        final ColumnExplorer columnExplorer = new ColumnExplorer(optionManager, this.columns);
        Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(rowGroup.getPath(), selectionRoot);
        ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
        Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
        if (filterPredicate == null) {
            ErrorCollector errorCollector = new ErrorCollectorImpl();
            LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
            if (errorCollector.hasErrors()) {
                logger.error("{} error(s) encountered when materialize filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
                return null;
            }
            Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
            filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
            if (filterPredicate == null) {
                return null;
            }
        }
        if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
            continue;
        }
        qualifiedRGs.add(rowGroup);
        // TODO : optimize when 1 file contains m row groups.
        qualifiedFileNames.add(rowGroup.getPath());
    }
    if (qualifiedRGs.size() == rowGroupInfos.size()) {
        // No row group was eliminated. Returning null tells the caller to keep the original groupScan.
        logger.debug("applyFilter does not have any pruning!");
        return null;
    } else if (qualifiedFileNames.isEmpty()) {
        logger.warn("All rowgroups have been filtered out. Add back one to get schema from scannner");
        RowGroupInfo rg = rowGroupInfos.iterator().next();
        qualifiedFileNames.add(rg.getPath());
        qualifiedRGs.add(rg);
    }
    try {
        FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
        logger.info("applyFilter {} reduce parquet rowgroup # from {} to {}", ExpressionStringBuilder.toString(filterExpr), rowGroupInfos.size(), qualifiedRGs.size());
        ParquetGroupScan clonegroupscan = this.clone(newSelection);
        // Replace the clone's row groups with the surviving ones, then refresh the partition column types.
        clonegroupscan.rowGroupInfos = qualifiedRGs;
        clonegroupscan.updatePartitionColTypeMap();
        return clonegroupscan;
    } catch (IOException e) {
        // Pass the message for the placeholder and the exception last so the stack trace is logged too.
        logger.warn("Could not apply filter prune due to Exception : {}", e.getMessage(), e);
        return null;
    }
}
Also used : ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) ArrayList(java.util.ArrayList) ErrorCollector(org.apache.drill.common.expression.ErrorCollector) IOException(java.io.IOException) ErrorCollectorImpl(org.apache.drill.common.expression.ErrorCollectorImpl) LogicalExpression(org.apache.drill.common.expression.LogicalExpression) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) SchemaPath(org.apache.drill.common.expression.SchemaPath) ParquetFilterPredicate(org.apache.drill.exec.expr.stat.ParquetFilterPredicate) ParquetMetaStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetMetaStatCollector)

Example 5 with ColumnStatistics

use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by axbaretto.

the class ParquetRGFilterEvaluator method evalFilter.

/**
 * Decides, from footer statistics alone, what {@code canDrop} reports for one row group.
 *
 * <p>The columns referenced by {@code expr} are gathered first; their statistics are then
 * collected from the footer by a {@code ParquetFooterStatCollector} and handed to
 * {@code canDrop} together with the row count of the selected block.
 *
 * @param expr              the filter expression
 * @param footer            the parquet footer holding the row group metadata
 * @param rowGroupIndex     index of the row group within {@code footer.getBlocks()}
 * @param options           options handed to the statistics collector
 * @param fragmentContext   supplies the function registry and is itself passed to {@code canDrop}
 * @param implicitColValues implicit column values handed to the statistics collector
 * @return the value returned by {@code canDrop} for this row group
 */
public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
    // Step 1: which columns does the expression touch?
    final Set<SchemaPath> columnsInFilter = expr.accept(new FieldReferenceFinder(), null);

    // Step 2: pull statistics for exactly those columns out of the footer.
    final ColumnStatCollector collector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
    final Map<SchemaPath, ColumnStatistics> collectedStats = collector.collectColStat(columnsInFilter);

    // Step 3: let canDrop make the call for this row group.
    return canDrop(
        expr,
        collectedStats,
        footer.getBlocks().get(rowGroupIndex).getRowCount(),
        fragmentContext,
        fragmentContext.getFunctionRegistry());
}
Also used : ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnStatCollector(org.apache.drill.exec.store.parquet.stat.ColumnStatCollector) ParquetFooterStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector)

Aggregations

ColumnStatistics (org.apache.drill.exec.store.parquet.stat.ColumnStatistics)5 SchemaPath (org.apache.drill.common.expression.SchemaPath)4 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 ErrorCollector (org.apache.drill.common.expression.ErrorCollector)2 ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl)2 LogicalExpression (org.apache.drill.common.expression.LogicalExpression)2 ParquetFilterPredicate (org.apache.drill.exec.expr.stat.ParquetFilterPredicate)2 FileSelection (org.apache.drill.exec.store.dfs.FileSelection)2 ColumnStatCollector (org.apache.drill.exec.store.parquet.stat.ColumnStatCollector)2 ParquetFooterStatCollector (org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector)2 ParquetMetaStatCollector (org.apache.drill.exec.store.parquet.stat.ParquetMetaStatCollector)2 ColumnExplorer (org.apache.drill.exec.store.ColumnExplorer)1 ImplicitColumnExplorer (org.apache.drill.exec.store.ImplicitColumnExplorer)1 ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata)1 RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata)1 IntStatistics (org.apache.parquet.column.statistics.IntStatistics)1