use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
the class ProjectRecordBatch method classifyExpr.
private void classifyExpr(final NamedExpression ex, final RecordBatch incoming, final ClassifierResult result) {
  final NameSegment expr = ((SchemaPath) ex.getExpr()).getRootSegment();
  final NameSegment ref = ex.getRef().getRootSegment();
  final boolean exprHasPrefix = expr.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
  final boolean refHasPrefix = ref.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
  final boolean exprIsStar = expr.getPath().equals(StarColumnHelper.STAR_COLUMN);
  final boolean refContainsStar = ref.getPath().contains(StarColumnHelper.STAR_COLUMN);
  final boolean exprContainsStar = expr.getPath().contains(StarColumnHelper.STAR_COLUMN);
  final boolean refEndsWithStar = ref.getPath().endsWith(StarColumnHelper.STAR_COLUMN);

  String exprPrefix = EMPTY_STRING;
  String exprSuffix = expr.getPath();

  if (exprHasPrefix) {
    // get the prefix of the expr
    final String[] exprComponents = expr.getPath().split(StarColumnHelper.PREFIX_DELIMITER, 2);
    assert (exprComponents.length == 2);
    exprPrefix = exprComponents[0];
    exprSuffix = exprComponents[1];
    result.prefix = exprPrefix;
  }

  boolean exprIsFirstWildcard = false;
  if (exprContainsStar) {
    result.isStar = true;
    final Integer value = (Integer) result.prefixMap.get(exprPrefix);
    if (value == null) {
      final Integer n = 1;
      result.prefixMap.put(exprPrefix, n);
      exprIsFirstWildcard = true;
    } else {
      final Integer n = value + 1;
      result.prefixMap.put(exprPrefix, n);
    }
  }

  final int incomingSchemaSize = incoming.getSchema().getFieldCount();

  // input is '*' and output is 'prefix_*'
  if (exprIsStar && refHasPrefix && refEndsWithStar) {
    final String[] components = ref.getPath().split(StarColumnHelper.PREFIX_DELIMITER, 2);
    assert (components.length == 2);
    final String prefix = components[0];
    result.outputNames = Lists.newArrayList();
    for (final VectorWrapper<?> wrapper : incoming) {
      final ValueVector vvIn = wrapper.getValueVector();
      final String name = vvIn.getField().getPath();
      // add the prefix to the incoming column name
      final String newName = prefix + StarColumnHelper.PREFIX_DELIMITER + name;
      addToResultMaps(newName, result, false);
    }
  } else if (expr.getPath().equalsIgnoreCase(ref.getPath()) && (!exprContainsStar || exprIsFirstWildcard)) {
    // input and output are the same
    if (exprContainsStar && exprHasPrefix) {
      assert exprPrefix != null;
      int k = 0;
      result.outputNames = Lists.newArrayListWithCapacity(incomingSchemaSize);
      for (int j = 0; j < incomingSchemaSize; j++) {
        // initialize
        result.outputNames.add(EMPTY_STRING);
      }
      for (final VectorWrapper<?> wrapper : incoming) {
        final ValueVector vvIn = wrapper.getValueVector();
        final String incomingName = vvIn.getField().getPath();
        // get the prefix of the name
        final String[] nameComponents = incomingName.split(StarColumnHelper.PREFIX_DELIMITER, 2);
        // if the incoming value vector does not have a prefix, ignore it since this expression is not referencing it
        if (nameComponents.length <= 1) {
          k++;
          continue;
        }
        final String namePrefix = nameComponents[0];
        if (exprPrefix.equalsIgnoreCase(namePrefix)) {
          final String newName = incomingName;
          if (!result.outputMap.containsKey(newName)) {
            result.outputNames.set(k, newName);
            result.outputMap.put(newName, newName);
          }
        }
        k++;
      }
    } else {
      result.outputNames = Lists.newArrayList();
      if (exprContainsStar) {
        for (final VectorWrapper<?> wrapper : incoming) {
          final ValueVector vvIn = wrapper.getValueVector();
          final String incomingName = vvIn.getField().getPath();
          if (refContainsStar) {
            // allow dups since this is likely a top-level project
            addToResultMaps(incomingName, result, true);
          } else {
            addToResultMaps(incomingName, result, false);
          }
        }
      } else {
        final String newName = expr.getPath();
        if (!refHasPrefix && !exprHasPrefix) {
          // allow dups since this is likely a top-level project
          addToResultMaps(newName, result, true);
        } else {
          addToResultMaps(newName, result, false);
        }
      }
    }
  } else if (exprIsStar) {
    // input is a wildcard and it is not the first wildcard
    result.outputNames = Lists.newArrayList();
    for (final VectorWrapper<?> wrapper : incoming) {
      final ValueVector vvIn = wrapper.getValueVector();
      final String incomingName = vvIn.getField().getPath();
      // allow dups since this is likely a top-level project
      addToResultMaps(incomingName, result, true);
    }
  } else if (!exprHasPrefix && refHasPrefix) {
    // only the output has a prefix
    result.outputNames = Lists.newArrayList();
    final String newName = ref.getPath();
    addToResultMaps(newName, result, false);
  } else if (exprHasPrefix && !refHasPrefix) {
    // input has a prefix but output does not
    int k = 0;
    result.outputNames = Lists.newArrayListWithCapacity(incomingSchemaSize);
    for (int j = 0; j < incomingSchemaSize; j++) {
      // initialize
      result.outputNames.add(EMPTY_STRING);
    }
    for (final VectorWrapper<?> wrapper : incoming) {
      final ValueVector vvIn = wrapper.getValueVector();
      final String name = vvIn.getField().getPath();
      final String[] components = name.split(StarColumnHelper.PREFIX_DELIMITER, 2);
      if (components.length <= 1) {
        k++;
        continue;
      }
      final String namePrefix = components[0];
      final String nameSuffix = components[1];
      if (exprPrefix.equalsIgnoreCase(namePrefix)) {
        // case insensitive matching of prefix
        if (refContainsStar) {
          // remove the prefix from the incoming column names;
          // for top level we need to make names unique
          final String newName = getUniqueName(nameSuffix, result);
          result.outputNames.set(k, newName);
        } else if (exprSuffix.equalsIgnoreCase(nameSuffix)) {
          // case insensitive matching of field name.
          // example: ref: $f1, expr: T0<PREFIX><column_name>
          final String newName = ref.getPath();
          result.outputNames.set(k, newName);
        }
      } else {
        result.outputNames.add(EMPTY_STRING);
      }
      k++;
    }
  } else if (exprHasPrefix && refHasPrefix) {
    // input and output have prefixes although they could be different...
    final String[] input = expr.getPath().split(StarColumnHelper.PREFIX_DELIMITER, 2);
    assert (input.length == 2);
    // not handled yet
    assert false : "Unexpected project expression or reference";
  } else {
    // if the incoming schema's column name matches the expression name of the Project,
    // then we just want to pick the ref name as the output column name
    result.outputNames = Lists.newArrayList();
    for (final VectorWrapper<?> wrapper : incoming) {
      final ValueVector vvIn = wrapper.getValueVector();
      final String incomingName = vvIn.getField().getPath();
      if (expr.getPath().equalsIgnoreCase(incomingName)) {
        // case insensitive matching of field name
        final String newName = ref.getPath();
        addToResultMaps(newName, result, true);
      }
    }
  }
}
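The whole classification above hinges on a naming convention: during star-column expansion, columns are rewritten as <prefix><delimiter><name> (for example a T0 prefix per input table). Below is a minimal, self-contained sketch of that split; the delimiter value "¦¦" is an assumption used only for illustration, the real value comes from StarColumnHelper.PREFIX_DELIMITER.

public class PrefixSplitSketch {
  static final String PREFIX_DELIMITER = "¦¦"; // assumed value, see StarColumnHelper

  // Split "T0¦¦name" into {"T0", "name"}; a name without the delimiter maps to an empty prefix.
  static String[] splitPrefixed(String path) {
    String[] components = path.split(PREFIX_DELIMITER, 2);
    return components.length == 2 ? components : new String[] { "", path };
  }

  public static void main(String[] args) {
    String[] p = splitPrefixed("T0¦¦n_name");
    System.out.println(p[0] + " / " + p[1]); // prints: T0 / n_name
  }
}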
use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
the class ProjectRecordBatch method isClassificationNeeded.
private boolean isClassificationNeeded(final List<NamedExpression> exprs) {
  boolean needed = false;
  for (int i = 0; i < exprs.size(); i++) {
    final NamedExpression ex = exprs.get(i);
    if (!(ex.getExpr() instanceof SchemaPath)) {
      continue;
    }
    final NameSegment expr = ((SchemaPath) ex.getExpr()).getRootSegment();
    final NameSegment ref = ex.getRef().getRootSegment();
    final boolean refHasPrefix = ref.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
    final boolean exprContainsStar = expr.getPath().contains(StarColumnHelper.STAR_COLUMN);
    if (refHasPrefix || exprContainsStar) {
      needed = true;
      break;
    }
  }
  return needed;
}
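Classification is an extra pass over the projection list, so it is only worth running when some expression actually involves a star column or a prefixed reference. Reduced to plain strings, the same gate might look like the sketch below (the delimiter and star values are assumptions used only for illustration):

import java.util.Collections;
import java.util.List;

public class ClassificationGateSketch {
  static final String PREFIX_DELIMITER = "¦¦"; // assumed
  static final String STAR_COLUMN = "*";       // assumed

  // Each entry is a pair of {exprPath, refPath}.
  static boolean isClassificationNeeded(List<String[]> exprs) {
    for (String[] pair : exprs) {
      boolean refHasPrefix = pair[1].contains(PREFIX_DELIMITER);
      boolean exprContainsStar = pair[0].contains(STAR_COLUMN);
      if (refHasPrefix || exprContainsStar) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    List<String[]> plainRename = Collections.singletonList(new String[] { "n_name", "n_name" });
    List<String[]> starExpansion = Collections.singletonList(new String[] { "*", "T0¦¦*" });
    System.out.println(isClassificationNeeded(plainRename));   // false: plain column reference
    System.out.println(isClassificationNeeded(starExpansion)); // true: star expansion
  }
}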
use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
the class ParquetGroupScan method applyFilter.
public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
  // Stop pruning in three cases:
  //  - there is only a single parquet file,
  //  - the metadata does not have the proper format to support row group level pruning,
  //  - the # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
  if (fileSet.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
    return null;
  }
  final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
  final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
  // HashSet keeps file names unique.
  Set<String> qualifiedFileNames = Sets.newHashSet();
  ParquetFilterPredicate filterPredicate = null;
  for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
    final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
    Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
    for (RowGroupMetadata rowGroup : file.getRowGroups()) {
      ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
      Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
      if (filterPredicate == null) {
        ErrorCollector errorCollector = new ErrorCollectorImpl();
        LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
        if (errorCollector.hasErrors()) {
          logger.error("{} error(s) encountered when materializing filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
          return null;
        }
        // logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
        Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
        filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
        if (filterPredicate == null) {
          return null;
        }
      }
      if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
        continue;
      }
      qualifiedRGs.add(rowGroup);
      // TODO: optimize when 1 file contains m row groups.
      qualifiedFileNames.add(file.getPath());
    }
  }
  if (qualifiedFileNames.size() == fileSet.size()) {
    // There is no reduction of row groups. Return the original groupScan.
    logger.debug("applyFilter does not have any pruning!");
    return null;
  } else if (qualifiedFileNames.size() == 0) {
    logger.warn("All row groups have been filtered out. Add back one to get the schema from the scanner.");
    qualifiedFileNames.add(fileSet.iterator().next());
  }
  try {
    FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
    logger.info("applyFilter {} reduces parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
    return this.clone(newSelection);
  } catch (IOException e) {
    logger.warn("Could not apply filter pruning due to exception: {}", e);
    return null;
  }
}
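Stripped of Drill's types, the pruning skeleton is: build the filter predicate once (lazily, from the first row group's statistics), test every row group against it, and if everything is pruned keep one group back so the scanner can still produce a schema. Below is a minimal, self-contained sketch of that skeleton; all names in it are illustrative, not Drill's.

import java.util.ArrayList;
import java.util.List;

public class RowGroupPruneSketch {
  // Hypothetical stand-ins for Drill's row group metadata and filter predicate.
  record RowGroupStats(long rowCount, long min, long max) {}

  interface FilterPredicate {
    // true when the statistics prove no row in the group can match
    boolean canDrop(RowGroupStats stats);
  }

  // Keep only row groups the predicate cannot prove empty; if everything is
  // pruned, add one group back so a schema can still be read.
  static List<RowGroupStats> prune(List<RowGroupStats> groups, FilterPredicate predicate) {
    List<RowGroupStats> qualified = new ArrayList<>();
    for (RowGroupStats rg : groups) {
      if (!predicate.canDrop(rg)) {
        qualified.add(rg);
      }
    }
    if (qualified.isEmpty() && !groups.isEmpty()) {
      qualified.add(groups.get(0));
    }
    return qualified;
  }

  public static void main(String[] args) {
    // filter: col > 100, droppable when the group's max <= 100
    FilterPredicate gt100 = stats -> stats.max() <= 100;
    List<RowGroupStats> groups = List.of(
        new RowGroupStats(1000, 0, 50),    // droppable
        new RowGroupStats(1000, 90, 200)); // must be kept
    System.out.println(prune(groups, gt100).size()); // prints 1
  }
}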
use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
the class Metadata method getParquetFileMetadata_v3.
/**
 * Get the metadata for a single parquet file.
 *
 * @param parquetTableMetadata table-level metadata that collects the column type info of each file
 * @param file the file to read the footer from
 * @return the metadata of the single file
 * @throws IOException if the file footer cannot be read
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
  ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
  MessageType schema = metadata.getFileMetaData().getSchema();
  Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
  }
  List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata_v3 columnMetadata;
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
      ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
      if (parquetTableMetadata.columnTypeInfo == null) {
        parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
      }
      // Save the column schema info. We'll merge it into one list later.
      parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only when they are not null.
        Object minValue = null;
        Object maxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
          minValue = stats.genericGetMin();
          maxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
            minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
            maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
          }
        }
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
      } else {
        columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }
    // Note: we still read the schema even if there are no values in the row group.
    if (rowGroup.getRowCount() == 0) {
      continue;
    }
    RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
    rowGroupMetadataList.add(rowGroupMeta);
  }
  String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
  return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
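The footer walk above builds on parquet-hadoop's metadata API. Below is a minimal standalone sketch of the same entry points (readFooter, getBlocks, getColumns); it assumes the older parquet-hadoop API that Drill used here, where ParquetFileReader.readFooter(Configuration, Path) is still available (it was later deprecated in favor of ParquetFileReader.open).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterWalkSketch {
  public static void main(String[] args) throws Exception {
    // Read only the footer, then walk row groups and column chunks,
    // accumulating the compressed size the same way the method above does.
    ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), new Path(args[0]));
    for (BlockMetaData rowGroup : footer.getBlocks()) {
      long length = 0;
      for (ColumnChunkMetaData col : rowGroup.getColumns()) {
        length += col.getTotalSize();
        System.out.println(String.join(".", col.getPath().toArray())
            + " nulls=" + (col.getStatistics() == null ? "?" : col.getStatistics().getNumNulls()));
      }
      System.out.println("rows=" + rowGroup.getRowCount() + " bytes=" + length);
    }
  }
}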
use of org.apache.drill.common.expression.SchemaPath in project drill by apache.
the class ParquetRGFilterEvaluator method evalFilter.
public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
  // figure out the set of columns referenced in the expression
  final Set<SchemaPath> schemaPathsInExpr = expr.accept(new FieldReferenceFinder(), null);
  final ColumnStatCollector columnStatCollector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
  Map<SchemaPath, ColumnStatistics> columnStatisticsMap = columnStatCollector.collectColStat(schemaPathsInExpr);
  return canDrop(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry());
}
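canDrop is a conservative test: a row group may be skipped only when the column statistics prove that no row can satisfy the filter, and missing statistics must mean "keep". Below is a minimal sketch of that idea for a simple equality filter on a long column; all names here are illustrative, not Drill's.

public class CanDropSketch {
  // Conservative equality test on min/max statistics: drop only when the
  // literal lies entirely outside [min, max]. Missing stats mean "cannot drop".
  static boolean canDropEquals(Long min, Long max, long literal) {
    if (min == null || max == null) {
      return false; // no statistics: must read the row group
    }
    return literal < min || literal > max;
  }

  public static void main(String[] args) {
    System.out.println(canDropEquals(10L, 20L, 5L));    // true: 5 < min
    System.out.println(canDropEquals(10L, 20L, 15L));   // false: may match
    System.out.println(canDropEquals(null, null, 15L)); // false: no stats
  }
}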