Use of org.apache.drill.exec.store.dfs.FileSelection in project drill by apache.
The class FileSystemPartitionDescriptor, method createTableScan:
@Override
public TableScan createTableScan(List<PartitionLocation> newPartitionLocation, String cacheFileRoot,
    boolean wasAllPartitionsPruned, MetadataContext metaContext) throws Exception {
  List<String> newFiles = Lists.newArrayList();
  for (final PartitionLocation location : newPartitionLocation) {
    if (!location.isCompositePartition()) {
      newFiles.add(location.getEntirePartitionLocation());
    } else {
      // Composite partitions (e.g. directories) are flattened into their leaf sub-partitions.
      final Collection<SimplePartitionLocation> subPartitions = location.getPartitionLocationRecursive();
      for (final PartitionLocation subPart : subPartitions) {
        newFiles.add(subPart.getEntirePartitionLocation());
      }
    }
  }

  if (scanRel instanceof DrillScanRel) {
    final FormatSelection formatSelection = (FormatSelection) table.getSelection();
    final FileSelection newFileSelection = new FileSelection(null, newFiles, getBaseTableLocation(),
        cacheFileRoot, wasAllPartitionsPruned, formatSelection.getSelection().getDirStatus());
    newFileSelection.setMetaContext(metaContext);
    final FileGroupScan newGroupScan =
        ((FileGroupScan) ((DrillScanRel) scanRel).getGroupScan()).clone(newFileSelection);
    return new DrillScanRel(scanRel.getCluster(), scanRel.getTraitSet().plus(DrillRel.DRILL_LOGICAL),
        scanRel.getTable(), newGroupScan, scanRel.getRowType(), ((DrillScanRel) scanRel).getColumns(), true);
  } else if (scanRel instanceof EnumerableTableScan) {
    return createNewTableScanFromSelection((EnumerableTableScan) scanRel, newFiles, cacheFileRoot,
        wasAllPartitionsPruned, metaContext);
  } else {
    throw new UnsupportedOperationException("Only DrillScanRel and EnumerableTableScan are allowed!");
  }
}
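All of these snippets follow the same hand-off: build a pruned list of file paths, wrap it in a new FileSelection, attach the MetadataContext, and clone the existing group scan over the new selection. A minimal sketch of that hand-off, with the argument roles inferred from the call above (sel, dirStatus, and oldGroupScan are illustrative placeholders, not the declared API):

  // Passing null for the first argument lets the selection resolve
  // FileStatus objects lazily from the path strings.
  FileSelection sel = new FileSelection(
      null,                   // pre-resolved file statuses: none yet
      newFiles,               // pruned file/partition paths
      getBaseTableLocation(), // selection root of the table
      cacheFileRoot,          // metadata-cache root used by pruning, may be null
      wasAllPartitionsPruned, // true when pruning removed every partition
      dirStatus);             // directory status carried over from the old selection
  sel.setMetaContext(metaContext);                // reuse already-gathered directory metadata
  FileGroupScan pruned = oldGroupScan.clone(sel); // same scan, restricted to the new selection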
Use of org.apache.drill.exec.store.dfs.FileSelection in project drill by apache.
The class ParquetPartitionDescriptor, method createNewGroupScan:
private GroupScan createNewGroupScan(List<String> newFiles, String cacheFileRoot,
    boolean wasAllPartitionsPruned, MetadataContext metaContext) throws IOException {
  final FileSelection newSelection = FileSelection.create(null, newFiles, getBaseTableLocation(),
      cacheFileRoot, wasAllPartitionsPruned);
  newSelection.setMetaContext(metaContext);
  final FileGroupScan newScan = ((FileGroupScan) scanRel.getGroupScan()).clone(newSelection);
  return newScan;
}
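For contrast with the previous snippet, this method goes through the static factory FileSelection.create(...) rather than the constructor, and returns only the re-cloned group scan, leaving the caller to build the new scan rel. A hypothetical call site (the path and variable names are invented for illustration):

  // Suppose pruning kept a single leaf file under the partition dir0=2016.
  List<String> newFiles = Lists.newArrayList("/tbl/dir0=2016/part-0.parquet"); // hypothetical path
  GroupScan pruned = createNewGroupScan(newFiles, null /* no cache root */, false, metaContext);
  // The planner then substitutes 'pruned' into the rewritten scan rel.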
Use of org.apache.drill.exec.store.dfs.FileSelection in project drill by apache.
The class ParquetGroupScan, method applyFilter:
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities,
    FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
  // Stop pruning in three cases:
  //  - the scan covers a single parquet file,
  //  - the metadata does not have the proper format to support row-group-level pruning,
  //  - the # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
  if (fileSet.size() == 1
      || !(parquetTableMetadata.isRowGroupPrunable())
      || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
    return null;
  }

  final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
  final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
  Set<String> qualifiedFileNames = Sets.newHashSet(); // HashSet keeps each fileName unique.
  ParquetFilterPredicate filterPredicate = null;

  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
    Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);

    for (RowGroupMetadata rowGroup : file.getRowGroups()) {
      ParquetMetaStatCollector statCollector =
          new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
      Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);

      // The filter predicate is materialized once, against the first row group's statistics.
      if (filterPredicate == null) {
        ErrorCollector errorCollector = new ErrorCollectorImpl();
        LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(
            filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
        if (errorCollector.hasErrors()) {
          logger.error("{} error(s) encountered when materializing filter expression : {}",
              errorCollector.getErrorCount(), errorCollector.toErrorString());
          return null;
        }
        // logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));

        Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
        filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(
            materializedFilter, constantBoundaries, udfUtilities);
        if (filterPredicate == null) {
          return null;
        }
      }

      if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
        continue;
      }

      qualifiedRGs.add(rowGroup);
      qualifiedFileNames.add(file.getPath()); // TODO: optimize when a single file contains m row groups.
    }
  }

  if (qualifiedFileNames.size() == fileSet.size()) {
    // There is no reduction of rowGroups. Return the original groupScan.
    logger.debug("applyFilter does not have any pruning!");
    return null;
  } else if (qualifiedFileNames.size() == 0) {
    logger.warn("All row groups have been filtered out. Add back one to get schema from scanner.");
    qualifiedFileNames.add(fileSet.iterator().next());
  }

  try {
    FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames),
        getSelectionRoot(), cacheFileRoot, false);
    logger.info("applyFilter {} reduce parquet file # from {} to {}",
        ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
    return this.clone(newSelection);
  } catch (IOException e) {
    logger.warn("Could not apply filter prune due to exception: {}", e);
    return null;
  }
}
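The pruning decision itself lives in ParquetRGFilterEvaluator.canDrop: a row group is skipped when its column statistics prove the filter can never match any of its rows. A tiny self-contained illustration of that min/max argument (not Drill's API; the method below is hypothetical):

  // Filter: col > literal. Each row group stores min/max statistics for col.
  static boolean canDropForGreaterThan(long colMax, long literal) {
    // If even the largest value in the row group fails the predicate,
    // no row in the group can match, so the whole row group is prunable.
    return colMax <= literal;
  }

  // canDropForGreaterThan(80, 100)  -> true  (all values <= 80, none exceed 100)
  // canDropForGreaterThan(150, 100) -> false (some values may exceed 100)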
Use of org.apache.drill.exec.store.dfs.FileSelection in project drill by apache.
The class ParquetGroupScan, method expandIfNecessary:
/**
 * Expands the selection's folders if a metadata cache is found for the selection root.<br>
 * If the selection has already been expanded or no metadata cache was found, does nothing.
 *
 * @param selection actual selection before expansion
 * @return new selection after expansion; if no expansion was done, returns the input selection
 * @throws IOException
 */
private FileSelection expandIfNecessary(FileSelection selection) throws IOException {
  if (selection.isExpandedFully()) {
    return selection;
  }

  // Use the cacheFileRoot if provided (e.g. after partition pruning).
  Path metaFilePath = new Path(cacheFileRoot != null ? cacheFileRoot : selectionRoot, Metadata.METADATA_FILENAME);
  if (!fs.exists(metaFilePath)) {
    // No metadata cache file: nothing to expand from.
    return selection;
  }

  return initFromMetadataCache(selection, metaFilePath);
}
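The cache lookup is a plain Hadoop FileSystem probe: check for the metadata file under the effective root and fall back to the unexpanded selection if it is absent. A standalone sketch of the same probe (the literal file name is an assumption about the value of Metadata.METADATA_FILENAME):

  // A standalone probe using the Hadoop FileSystem API:
  static boolean hasMetadataCache(FileSystem fs, Path root) throws IOException {
    // ".drill.parquet_metadata" is the assumed value of Metadata.METADATA_FILENAME
    return fs.exists(new Path(root, ".drill.parquet_metadata"));
  }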
Use of org.apache.drill.exec.store.dfs.FileSelection in project drill by apache.
The class ParquetGroupScan, method applyLimit:
@Override
public GroupScan applyLimit(long maxRecords) {
  Preconditions.checkArgument(rowGroupInfos.size() >= 0);
  maxRecords = Math.max(maxRecords, 1); // Make sure it requests at least 1 row -> 1 rowGroup.

  // Further optimization: minimize the # of files chosen, or the affinity of files chosen.
  // Calculate the number of rowGroups to read based on maxRecords and update
  // the number of records to read for each of those rowGroups.
  int index = updateRowGroupInfo(maxRecords);

  Set<String> fileNames = Sets.newHashSet(); // HashSet keeps each fileName unique.
  for (RowGroupInfo rowGroupInfo : rowGroupInfos.subList(0, index)) {
    fileNames.add(rowGroupInfo.getPath());
  }

  // If there is no change in fileSet, no need to create a new groupScan.
  if (fileNames.size() == fileSet.size()) {
    // There is no reduction of rowGroups. Return the original groupScan.
    logger.debug("applyLimit() does not apply!");
    return null;
  }

  try {
    FileSelection newSelection = new FileSelection(null, Lists.newArrayList(fileNames),
        getSelectionRoot(), cacheFileRoot, false);
    logger.debug("applyLimit() reduce parquet file # from {} to {}", fileSet.size(), fileNames.size());
    return this.clone(newSelection, maxRecords);
  } catch (IOException e) {
    logger.warn("Could not apply rowcount based prune due to exception: {}", e);
    return null;
  }
}
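updateRowGroupInfo is not shown here, but the arithmetic it presumably performs is simple enough to sketch: walk the row groups in order, accumulating row counts until maxRecords is covered, and return the cutoff index. A self-contained sketch of that idea (an assumption about the helper, not its actual body):

  static int rowGroupsNeeded(long[] rowCounts, long maxRecords) {
    long remaining = Math.max(maxRecords, 1); // at least 1 row, hence at least 1 row group
    int index = 0;
    while (index < rowCounts.length && remaining > 0) {
      remaining -= rowCounts[index++]; // subtract this row group's row count
    }
    return index; // number of leading row groups needed to cover maxRecords
  }

  // rowGroupsNeeded(new long[]{1000, 1000, 1000}, 1500) -> 2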