Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
The class ProjectRecordBatch, method isClassificationNeeded.
private boolean isClassificationNeeded(final List<NamedExpression> exprs) {
  boolean needed = false;
  for (int i = 0; i < exprs.size(); i++) {
    final NamedExpression ex = exprs.get(i);
    if (!(ex.getExpr() instanceof SchemaPath)) {
      continue;
    }
    final NameSegment expr = ((SchemaPath) ex.getExpr()).getRootSegment();
    final NameSegment ref = ex.getRef().getRootSegment();
    final boolean refHasPrefix = ref.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
    final boolean exprContainsStar = expr.getPath().contains(StarColumnHelper.STAR_COLUMN);
    if (refHasPrefix || exprContainsStar) {
      needed = true;
      break;
    }
  }
  return needed;
}
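For intuition, here is a minimal standalone sketch of the two checks above, run outside Drill. The PREFIX_DELIMITER and STAR_COLUMN values are hypothetical stand-ins; the real constants live in Drill's StarColumnHelper.

public class ClassificationCheckSketch {
  // Hypothetical stand-ins; see StarColumnHelper for the actual values.
  static final String PREFIX_DELIMITER = "||";
  static final String STAR_COLUMN = "*";

  // Same decision as isClassificationNeeded, reduced to the two String tests
  // on the root segments of the expression and the output reference.
  static boolean isClassificationNeeded(String exprRootPath, String refRootPath) {
    boolean refHasPrefix = refRootPath.contains(PREFIX_DELIMITER);
    boolean exprContainsStar = exprRootPath.contains(STAR_COLUMN);
    return refHasPrefix || exprContainsStar;
  }

  public static void main(String[] args) {
    System.out.println(isClassificationNeeded("*", "col"));       // true: expression is a star column
    System.out.println(isClassificationNeeded("col", "T0||col")); // true: reference carries a table prefix
    System.out.println(isClassificationNeeded("col", "col"));     // false: plain column projection
  }
}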
Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
The class ParquetGroupScan, method applyFilter.
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
  if (fileSet.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
    // Stop pruning in three cases:
    //  - the scan covers a single parquet file,
    //  - the metadata format does not support row group level filter pruning,
    //  - # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
    return null;
  }
  final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
  final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
  // HashSet keeps each file name unique.
  Set<String> qualifiedFileNames = Sets.newHashSet();
  ParquetFilterPredicate filterPredicate = null;
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
    Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
    for (RowGroupMetadata rowGroup : file.getRowGroups()) {
      ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
      Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
      if (filterPredicate == null) {
        ErrorCollector errorCollector = new ErrorCollectorImpl();
        LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
        if (errorCollector.hasErrors()) {
          logger.error("{} error(s) encountered when materializing filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
          return null;
        }
        // logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
        Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
        filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
        if (filterPredicate == null) {
          return null;
        }
      }
      if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
        continue;
      }
      qualifiedRGs.add(rowGroup);
      // TODO : optimize when 1 file contains m row groups.
      qualifiedFileNames.add(file.getPath());
    }
  }
  if (qualifiedFileNames.size() == fileSet.size()) {
    // No row groups were pruned. Return the original groupScan.
    logger.debug("applyFilter does not have any pruning!");
    return null;
  } else if (qualifiedFileNames.size() == 0) {
    logger.warn("All row groups have been filtered out. Add back one to get schema from scanner");
    qualifiedFileNames.add(fileSet.iterator().next());
  }
  try {
    FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
    logger.info("applyFilter {} reduce parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
    return this.clone(newSelection);
  } catch (IOException e) {
    logger.warn("Could not apply filter prune due to exception: {}", e);
    return null;
  }
}
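The heart of the pruning loop is the canDrop decision: a row group may be skipped when its column statistics prove the filter can never match. A toy sketch of that idea follows; Stats and canDrop are hypothetical stand-ins for Drill's ColumnStatistics and ParquetFilterPredicate, not the real classes.

import java.util.Map;

// Toy model of row-group pruning on min/max statistics. All types here are
// illustrative, not Drill's.
class RowGroupPruneSketch {
  record Stats(long min, long max) {}

  // For a filter "col = value": the row group can be dropped when value lies
  // outside the [min, max] range recorded for that column.
  static boolean canDrop(String column, long value, Map<String, Stats> columnStats) {
    Stats s = columnStats.get(column);
    if (s == null) {
      return false; // no statistics: must keep the row group
    }
    return value < s.min || value > s.max;
  }

  public static void main(String[] args) {
    Map<String, Stats> stats = Map.of("a", new Stats(10, 20));
    System.out.println(canDrop("a", 5, stats));  // true: 5 < min, prune
    System.out.println(canDrop("a", 15, stats)); // false: inside the range, keep
  }
}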
Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
The class Metadata, method getParquetFileMetadata_v3.
/**
 * Get the metadata for a single file.
 *
 * @param parquetTableMetadata the table-level metadata being accumulated
 * @param file the file whose footer is read
 * @return the per-file metadata, with one entry per non-empty row group
 * @throws IOException if the footer cannot be read
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
  ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
  MessageType schema = metadata.getFileMetaData().getSchema();
  Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
  }
  List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata_v3 columnMetadata;
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
      ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      // Save the column schema info. We'll merge it into one list
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only when they are not null
        Object minValue = null;
        Object maxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
          minValue = stats.genericGetMin();
          maxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
            minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
            maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
          }
        }
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }
    // Skip the row group if it is empty. Note we still read the schema above
    // even if there are no values in the row group.
    if (rowGroup.getRowCount() == 0) {
      continue;
    }
    RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
    rowGroupMetadataList.add(rowGroupMeta);
  }
  String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
  return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
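A stripped-down sketch of the same footer walk, keeping only the row-group and column-chunk iteration. It assumes the same parquet-mr calls used above (note ParquetFileReader.readFooter is deprecated in newer parquet-mr releases); FooterWalkSketch is an illustrative name, not a Drill class.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import java.io.IOException;
import java.util.Arrays;

// Read a Parquet footer once, then visit each row group and column chunk,
// mirroring the structure of getParquetFileMetadata_v3.
public class FooterWalkSketch {
  public static void main(String[] args) throws IOException {
    ParquetMetadata metadata =
        ParquetFileReader.readFooter(new Configuration(), new Path(args[0]));
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
      long length = 0;
      for (ColumnChunkMetaData col : rowGroup.getColumns()) {
        length += col.getTotalSize();
        // Statistics can be missing or empty for files written by old writers.
        if (col.getStatistics() != null && !col.getStatistics().isEmpty()) {
          System.out.println(Arrays.toString(col.getPath().toArray())
              + " nulls=" + col.getStatistics().getNumNulls());
        }
      }
      System.out.println("rows=" + rowGroup.getRowCount() + " bytes=" + length);
    }
  }
}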
Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
The class ParquetRGFilterEvaluator, method evalFilter.
public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
  // Figure out the set of columns referenced in the expression.
  final Set<SchemaPath> schemaPathsInExpr = expr.accept(new FieldReferenceFinder(), null);
  final ColumnStatCollector columnStatCollector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
  Map<SchemaPath, ColumnStatistics> columnStatisticsMap = columnStatCollector.collectColStat(schemaPathsInExpr);
  return canDrop(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry());
}
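A hedged sketch of how a caller might consult evalFilter before scanning each row group; everything other than evalFilter itself (the loop and the readRowGroup hook) is hypothetical, not code from Drill.

// Hypothetical caller: skip row groups whose statistics prove the filter
// can never match; scan the rest.
for (int rg = 0; rg < footer.getBlocks().size(); rg++) {
  boolean canDrop = ParquetRGFilterEvaluator.evalFilter(
      filterExpr, footer, rg, options, fragmentContext, implicitColValues);
  if (canDrop) {
    continue; // no row in this group can satisfy the filter
  }
  readRowGroup(rg); // hypothetical reader hook
}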
Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
The class MockGroupScanPOP, method clone.
@Override
public GroupScan clone(List<SchemaPath> columns) {
  if (columns.isEmpty()) {
    throw new IllegalArgumentException("No columns for mock scan");
  }
  List<MockColumn> mockCols = new ArrayList<>();
  Pattern p = Pattern.compile("(\\w+)_([isdb])(\\d*)");
  for (SchemaPath path : columns) {
    String col = path.getLastSegment().getNameSegment().getPath();
    if (col.equals("*")) {
      return this;
    }
    Matcher m = p.matcher(col);
    if (!m.matches()) {
      throw new IllegalArgumentException("Badly formatted mock column name: " + col);
    }
    @SuppressWarnings("unused")
    String name = m.group(1);
    String type = m.group(2);
    String length = m.group(3);
    int width = 10;
    if (!length.isEmpty()) {
      width = Integer.parseInt(length);
    }
    MinorType minorType;
    switch (type) {
      case "i":
        minorType = MinorType.INT;
        break;
      case "s":
        minorType = MinorType.VARCHAR;
        break;
      case "d":
        minorType = MinorType.FLOAT8;
        break;
      case "b":
        minorType = MinorType.BIT;
        break;
      default:
        throw new IllegalArgumentException("Unsupported field type " + type + " for mock column " + col);
    }
    MockTableDef.MockColumn mockCol = new MockColumn(col, minorType, DataMode.REQUIRED, width, 0, 0, null, 1, null);
    mockCols.add(mockCol);
  }
  MockScanEntry entry = readEntries.get(0);
  MockColumn[] types = new MockColumn[mockCols.size()];
  mockCols.toArray(types);
  MockScanEntry newEntry = new MockScanEntry(entry.records, true, 0, 1, types);
  List<MockScanEntry> newEntries = new ArrayList<>();
  newEntries.add(newEntry);
  return new MockGroupScanPOP(url, newEntries);
}
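The mock column naming convention above encodes the type and an optional width in the name, e.g. b_s20 is a VARCHAR of width 20. A self-contained sketch of the same regex parse (MockColumnNameSketch is an illustrative name, not a Drill class):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Demonstrates the name_type[width] convention parsed by clone() above:
// "i" = INT, "s" = VARCHAR, "d" = FLOAT8, "b" = BIT; width defaults to 10.
public class MockColumnNameSketch {
  private static final Pattern P = Pattern.compile("(\\w+)_([isdb])(\\d*)");

  public static void main(String[] args) {
    for (String col : new String[] {"a_i", "b_s20", "c_d", "bad-name"}) {
      Matcher m = P.matcher(col);
      if (!m.matches()) {
        System.out.println(col + ": badly formatted mock column name");
        continue;
      }
      String type = m.group(2);
      int width = m.group(3).isEmpty() ? 10 : Integer.parseInt(m.group(3));
      System.out.println(col + ": type=" + type + " width=" + width);
    }
  }
}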