use of org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata in project drill by apache.
the class ParquetGroupScan method applyFilter.
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
if (fileSet.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
// - # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
return null;
}
final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
// HashSet keeps a fileName unique.
Set<String> qualifiedFileNames = Sets.newHashSet();
ParquetFilterPredicate filterPredicate = null;
for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
for (RowGroupMetadata rowGroup : file.getRowGroups()) {
ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
if (filterPredicate == null) {
ErrorCollector errorCollector = new ErrorCollectorImpl();
LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
if (errorCollector.hasErrors()) {
logger.error("{} error(s) encountered when materialize filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
return null;
}
// logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
if (filterPredicate == null) {
return null;
}
}
if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
continue;
}
qualifiedRGs.add(rowGroup);
// TODO : optimize when 1 file contains m row groups.
qualifiedFileNames.add(file.getPath());
}
}
if (qualifiedFileNames.size() == fileSet.size()) {
// There is no reduction of rowGroups. Return the original groupScan.
logger.debug("applyFilter does not have any pruning!");
return null;
} else if (qualifiedFileNames.size() == 0) {
logger.warn("All rowgroups have been filtered out. Add back one to get schema from scannner");
qualifiedFileNames.add(fileSet.iterator().next());
}
try {
FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
logger.info("applyFilter {} reduce parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
return this.clone(newSelection);
} catch (IOException e) {
logger.warn("Could not apply filter prune due to Exception : {}", e);
return null;
}
}
use of org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata in project drill by apache.
the class ParquetGroupScan method removeUnneededRowGroups.
private ParquetTableMetadataBase removeUnneededRowGroups(ParquetTableMetadataBase parquetTableMetadata) {
List<ParquetFileMetadata> newFileMetadataList = Lists.newArrayList();
for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
if (fileSet.contains(file.getPath())) {
newFileMetadataList.add(file);
}
}
ParquetTableMetadataBase metadata = parquetTableMetadata.clone();
metadata.assignFiles(newFileMetadataList);
return metadata;
}
use of org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata in project drill by apache.
the class ParquetGroupScan method init.
private void init(MetadataContext metaContext) throws IOException {
if (entries.size() == 1 && parquetTableMetadata == null) {
Path p = Path.getPathWithoutSchemeAndAuthority(new Path(entries.get(0).getPath()));
Path metaPath = null;
if (fs.isDirectory(p)) {
// Using the metadata file makes sense when querying a directory; otherwise
// if querying a single file we can look up the metadata directly from the file
metaPath = new Path(p, Metadata.METADATA_FILENAME);
}
if (metaPath != null && fs.exists(metaPath)) {
usedMetadataCache = true;
parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
} else {
parquetTableMetadata = Metadata.getParquetTableMetadata(fs, p.toString(), formatConfig);
}
} else {
Path p = Path.getPathWithoutSchemeAndAuthority(new Path(selectionRoot));
Path metaPath = new Path(p, Metadata.METADATA_FILENAME);
if (fs.isDirectory(new Path(selectionRoot)) && fs.exists(metaPath)) {
usedMetadataCache = true;
if (parquetTableMetadata == null) {
parquetTableMetadata = Metadata.readBlockMeta(fs, metaPath.toString(), metaContext, formatConfig);
}
if (fileSet != null) {
parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
}
} else {
final List<FileStatus> fileStatuses = Lists.newArrayList();
for (ReadEntryWithPath entry : entries) {
getFiles(entry.getPath(), fileStatuses);
}
parquetTableMetadata = Metadata.getParquetTableMetadata(fs, fileStatuses, formatConfig);
}
}
if (fileSet == null) {
fileSet = Sets.newHashSet();
for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
fileSet.add(file.getPath());
}
}
Map<String, DrillbitEndpoint> hostEndpointMap = Maps.newHashMap();
for (DrillbitEndpoint endpoint : formatPlugin.getContext().getBits()) {
hostEndpointMap.put(endpoint.getAddress(), endpoint);
}
rowGroupInfos = Lists.newArrayList();
for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
int rgIndex = 0;
for (RowGroupMetadata rg : file.getRowGroups()) {
RowGroupInfo rowGroupInfo = new RowGroupInfo(file.getPath(), rg.getStart(), rg.getLength(), rgIndex, rg.getRowCount());
EndpointByteMap endpointByteMap = new EndpointByteMapImpl();
for (String host : rg.getHostAffinity().keySet()) {
if (hostEndpointMap.containsKey(host)) {
endpointByteMap.add(hostEndpointMap.get(host), (long) (rg.getHostAffinity().get(host) * rg.getLength()));
}
}
rowGroupInfo.setEndpointByteMap(endpointByteMap);
rgIndex++;
rowGroupInfos.add(rowGroupInfo);
}
}
this.endpointAffinities = AffinityCreator.getAffinityMap(rowGroupInfos);
columnValueCounts = Maps.newHashMap();
this.rowCount = 0;
boolean first = true;
for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
for (RowGroupMetadata rowGroup : file.getRowGroups()) {
long rowCount = rowGroup.getRowCount();
for (ColumnMetadata column : rowGroup.getColumns()) {
SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getName());
Long previousCount = columnValueCounts.get(schemaPath);
if (previousCount != null) {
if (previousCount != GroupScan.NO_COLUMN_STATS) {
if (column.getNulls() != null) {
Long newCount = rowCount - column.getNulls();
columnValueCounts.put(schemaPath, columnValueCounts.get(schemaPath) + newCount);
}
}
} else {
if (column.getNulls() != null) {
Long newCount = rowCount - column.getNulls();
columnValueCounts.put(schemaPath, newCount);
} else {
columnValueCounts.put(schemaPath, GroupScan.NO_COLUMN_STATS);
}
}
boolean partitionColumn = checkForPartitionColumn(column, first);
if (partitionColumn) {
Map<SchemaPath, Object> map = partitionValueMap.get(file.getPath());
if (map == null) {
map = Maps.newHashMap();
partitionValueMap.put(file.getPath(), map);
}
Object value = map.get(schemaPath);
Object currentValue = column.getMaxValue();
if (value != null) {
if (value != currentValue) {
partitionColTypeMap.remove(schemaPath);
}
} else {
map.put(schemaPath, currentValue);
}
} else {
partitionColTypeMap.remove(schemaPath);
}
}
this.rowCount += rowGroup.getRowCount();
first = false;
}
}
}
Aggregations