Use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by apache.
The class RangeExprEvaluator, method visitUnknown:
@Override
public Statistics visitUnknown(LogicalExpression e, Void value) throws RuntimeException {
  if (e instanceof TypedFieldExpr) {
    TypedFieldExpr fieldExpr = (TypedFieldExpr) e;
    final ColumnStatistics columnStatistics = columnStatMap.get(fieldExpr.getPath());
    if (columnStatistics != null) {
      return columnStatistics.getStatistics();
    } else {
      // The field does not exist in this row group; Drill types such a column as OPTIONAL INT.
      Preconditions.checkArgument(fieldExpr.getMajorType().equals(Types.OPTIONAL_INT));
      IntStatistics intStatistics = new IntStatistics();
      // All values of the missing column are treated as nulls.
      intStatistics.setNumNulls(rowCount);
      return intStatistics;
    }
  }
  return null;
}
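The interesting branch is the else: when a referenced column has no statistics in this row group, Drill models it as an all-null OPTIONAL INT column. A minimal, self-contained sketch of why that convention enables pruning (the class and variable names here are illustrative, not Drill's; the IntStatistics calls mirror the snippet above):

import org.apache.parquet.column.statistics.IntStatistics;

public class MissingColumnStatsDemo {
  public static void main(String[] args) {
    long rowCount = 1000L;
    // Mirror visitUnknown's fallback: a statistics object whose null count
    // equals the row count, i.e. every value of the missing column is null.
    IntStatistics stats = new IntStatistics();
    stats.setNumNulls(rowCount);
    // A predicate like `col IS NOT NULL` can now drop the whole row group.
    boolean canDropForIsNotNull = stats.getNumNulls() == rowCount;
    System.out.println("drop for IS NOT NULL? " + canDropForIsNotNull); // true
  }
}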
Use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by apache.
The class ParquetGroupScan, method applyFilter:
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
  if (fileSet.size() == 1 ||
      !parquetTableMetadata.isRowGroupPrunable() ||
      rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
    // Stop pruning for three cases:
    //  - a single parquet file,
    //  - metadata does not support row group level filter pruning,
    //  - # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
    return null;
  }
  final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
  final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
  // HashSet keeps each fileName unique.
  Set<String> qualifiedFileNames = Sets.newHashSet();
  ParquetFilterPredicate filterPredicate = null;
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
    Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
    for (RowGroupMetadata rowGroup : file.getRowGroups()) {
      ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
      Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
      if (filterPredicate == null) {
        ErrorCollector errorCollector = new ErrorCollectorImpl();
        LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
        if (errorCollector.hasErrors()) {
          logger.error("{} error(s) encountered when materializing filter expression: {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
          return null;
        }
        // logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
        Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
        filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
        if (filterPredicate == null) {
          return null;
        }
      }
      if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
        continue;
      }
      qualifiedRGs.add(rowGroup);
      // TODO: optimize when 1 file contains m row groups.
      qualifiedFileNames.add(file.getPath());
    }
  }
  if (qualifiedFileNames.size() == fileSet.size()) {
    // There is no reduction of row groups. Keep the original group scan.
    logger.debug("applyFilter does not have any pruning!");
    return null;
  } else if (qualifiedFileNames.size() == 0) {
    logger.warn("All row groups have been filtered out. Add back one to get schema from scanner.");
    qualifiedFileNames.add(fileSet.iterator().next());
  }
  try {
    FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
    logger.info("applyFilter {} reduces parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
    return this.clone(newSelection);
  } catch (IOException e) {
    logger.warn("Could not apply filter prune due to exception: {}", e);
    return null;
  }
}
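The per-row-group decision is delegated to ParquetRGFilterEvaluator.canDrop. A simplified, self-contained sketch of the underlying idea (not Drill's implementation): a row group can be skipped when the column's recorded [min, max] range proves the predicate can never match.

public class RowGroupPruneDemo {
  // Stand-in for one column's statistics in one row group.
  record ColumnRange(long min, long max) {}

  // Safe to drop the row group for `col = constant` when the constant
  // falls outside the recorded range.
  static boolean canDropForEquals(ColumnRange stats, long constant) {
    return constant < stats.min() || constant > stats.max();
  }

  public static void main(String[] args) {
    ColumnRange rg = new ColumnRange(100, 200);
    System.out.println(canDropForEquals(rg, 50));  // true: no row can match
    System.out.println(canDropForEquals(rg, 150)); // false: must scan this row group
  }
}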
Use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by apache.
The class ParquetRGFilterEvaluator, method evalFilter:
public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
  // Figure out the set of columns referenced in the expression.
  final Set<SchemaPath> schemaPathsInExpr = expr.accept(new FieldReferenceFinder(), null);
  final ColumnStatCollector columnStatCollector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
  Map<SchemaPath, ColumnStatistics> columnStatisticsMap = columnStatCollector.collectColStat(schemaPathsInExpr);
  boolean canDrop = canDrop(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry());
  return canDrop;
}
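Both methods start by collecting the columns the filter references, via FieldReferenceFinder, so statistics are gathered only for columns the predicate can actually use. A simplified stand-in for that visitor (the Expr/FieldRef/FuncCall types below are hypothetical, not Drill's expression classes):

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class FieldRefFinderDemo {
  sealed interface Expr permits FieldRef, FuncCall {}
  record FieldRef(String path) implements Expr {}
  record FuncCall(String name, List<Expr> args) implements Expr {}

  // Walk the expression tree and collect every referenced column path.
  static Set<String> collectPaths(Expr e) {
    Set<String> out = new HashSet<>();
    collect(e, out);
    return out;
  }

  private static void collect(Expr e, Set<String> out) {
    if (e instanceof FieldRef f) {
      out.add(f.path());
    } else if (e instanceof FuncCall c) {
      for (Expr arg : c.args()) {
        collect(arg, out);
      }
    }
  }

  public static void main(String[] args) {
    Expr filter = new FuncCall("and", List.of(
        new FuncCall("greater_than", List.of(new FieldRef("a"), new FieldRef("b"))),
        new FuncCall("is_not_null", List.of(new FieldRef("c")))));
    System.out.println(collectPaths(filter)); // a, b, c in some order
  }
}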
Use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by axbaretto.
The class ParquetGroupScan, method applyFilter:
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
  if (rowGroupInfos.size() == 1 ||
      !parquetTableMetadata.isRowGroupPrunable() ||
      rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
    // Stop pruning for three cases:
    //  - only a single row group,
    //  - metadata does not support row group level filter pruning,
    //  - # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
    return null;
  }
  final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
  final List<RowGroupInfo> qualifiedRGs = new ArrayList<>(rowGroupInfos.size());
  // HashSet keeps each fileName unique.
  Set<String> qualifiedFileNames = Sets.newHashSet();
  ParquetFilterPredicate filterPredicate = null;
  for (RowGroupInfo rowGroup : rowGroupInfos) {
    final ColumnExplorer columnExplorer = new ColumnExplorer(optionManager, this.columns);
    Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(rowGroup.getPath(), selectionRoot);
    ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
    Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
    if (filterPredicate == null) {
      ErrorCollector errorCollector = new ErrorCollectorImpl();
      LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
      if (errorCollector.hasErrors()) {
        logger.error("{} error(s) encountered when materializing filter expression: {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
        return null;
      }
      // logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
      Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
      filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
      if (filterPredicate == null) {
        return null;
      }
    }
    if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
      continue;
    }
    qualifiedRGs.add(rowGroup);
    // TODO: optimize when 1 file contains m row groups.
    qualifiedFileNames.add(rowGroup.getPath());
  }
  if (qualifiedRGs.size() == rowGroupInfos.size()) {
    // There is no reduction of row groups. Keep the original group scan.
    logger.debug("applyFilter does not have any pruning!");
    return null;
  } else if (qualifiedFileNames.size() == 0) {
    logger.warn("All row groups have been filtered out. Add back one to get schema from scanner.");
    RowGroupInfo rg = rowGroupInfos.iterator().next();
    qualifiedFileNames.add(rg.getPath());
    qualifiedRGs.add(rg);
  }
  try {
    FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
    logger.info("applyFilter {} reduces parquet row group # from {} to {}", ExpressionStringBuilder.toString(filterExpr), rowGroupInfos.size(), qualifiedRGs.size());
    ParquetGroupScan cloneGroupScan = this.clone(newSelection);
    cloneGroupScan.rowGroupInfos = qualifiedRGs;
    cloneGroupScan.updatePartitionColTypeMap();
    return cloneGroupScan;
  } catch (IOException e) {
    logger.warn("Could not apply filter prune due to exception: {}", e);
    return null;
  }
}
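Note the contract shared by both applyFilter variants: a null return always means "keep the original scan", whether pruning was inapplicable, materialization failed, or nothing was pruned; the axbaretto variant additionally pins the surviving row groups on the clone instead of pruning only at file granularity. A hedged caller-side fragment (scan, filterExpr, and replaceScan are placeholders, not Drill's planner API):

// Hypothetical pushdown-rule fragment; names are placeholders.
GroupScan pruned = scan.applyFilter(filterExpr, udfUtilities, functionRegistry, optionManager);
if (pruned == null) {
  return; // no reduction, threshold exceeded, or an error: keep the original scan
}
replaceScan(pruned); // placeholder for swapping the pruned scan into the plan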
Use of org.apache.drill.exec.store.parquet.stat.ColumnStatistics in project drill by axbaretto.
The class ParquetRGFilterEvaluator, method evalFilter:
public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
  // Figure out the set of columns referenced in the expression.
  final Set<SchemaPath> schemaPathsInExpr = expr.accept(new FieldReferenceFinder(), null);
  final ColumnStatCollector columnStatCollector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
  Map<SchemaPath, ColumnStatistics> columnStatisticsMap = columnStatCollector.collectColStat(schemaPathsInExpr);
  boolean canDrop = canDrop(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry());
  return canDrop;
}
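A hedged sketch of driving evalFilter directly, e.g. from a test harness; conf, optionManager, fragmentContext, and filterExpr are assumed to be set up elsewhere, and readFooter is the older parquet-hadoop footer API:

// Assumed setup: Hadoop Configuration conf, a materializable filterExpr,
// and Drill's optionManager/fragmentContext from the surrounding harness.
ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path("/tmp/t.parquet"));
boolean drop = ParquetRGFilterEvaluator.evalFilter(
    filterExpr, footer, 0 /* first row group */,
    optionManager, fragmentContext, java.util.Collections.emptyMap());
// drop == true means statistics prove row group 0 cannot satisfy the filter.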