Example 21 with SchemaPath

use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

the class ProjectRecordBatch method classifyExpr.

private void classifyExpr(final NamedExpression ex, final RecordBatch incoming, final ClassifierResult result) {
    final NameSegment expr = ((SchemaPath) ex.getExpr()).getRootSegment();
    final NameSegment ref = ex.getRef().getRootSegment();
    final boolean exprHasPrefix = expr.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
    final boolean refHasPrefix = ref.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
    final boolean exprIsStar = expr.getPath().equals(StarColumnHelper.STAR_COLUMN);
    final boolean refContainsStar = ref.getPath().contains(StarColumnHelper.STAR_COLUMN);
    final boolean exprContainsStar = expr.getPath().contains(StarColumnHelper.STAR_COLUMN);
    final boolean refEndsWithStar = ref.getPath().endsWith(StarColumnHelper.STAR_COLUMN);
    String exprPrefix = EMPTY_STRING;
    String exprSuffix = expr.getPath();
    if (exprHasPrefix) {
        // get the prefix of the expr
        final String[] exprComponents = expr.getPath().split(StarColumnHelper.PREFIX_DELIMITER, 2);
        assert (exprComponents.length == 2);
        exprPrefix = exprComponents[0];
        exprSuffix = exprComponents[1];
        result.prefix = exprPrefix;
    }
    boolean exprIsFirstWildcard = false;
    if (exprContainsStar) {
        result.isStar = true;
        final Integer value = (Integer) result.prefixMap.get(exprPrefix);
        if (value == null) {
            final Integer n = 1;
            result.prefixMap.put(exprPrefix, n);
            exprIsFirstWildcard = true;
        } else {
            final Integer n = value + 1;
            result.prefixMap.put(exprPrefix, n);
        }
    }
    final int incomingSchemaSize = incoming.getSchema().getFieldCount();
    // input is '*' and output is 'prefix_*'
    if (exprIsStar && refHasPrefix && refEndsWithStar) {
        final String[] components = ref.getPath().split(StarColumnHelper.PREFIX_DELIMITER, 2);
        assert (components.length == 2);
        final String prefix = components[0];
        result.outputNames = Lists.newArrayList();
        for (final VectorWrapper<?> wrapper : incoming) {
            final ValueVector vvIn = wrapper.getValueVector();
            final String name = vvIn.getField().getPath();
            // add the prefix to the incoming column name
            final String newName = prefix + StarColumnHelper.PREFIX_DELIMITER + name;
            addToResultMaps(newName, result, false);
        }
    } else if (expr.getPath().equalsIgnoreCase(ref.getPath()) && (!exprContainsStar || exprIsFirstWildcard)) {
        // input and output are the same
        if (exprContainsStar && exprHasPrefix) {
            assert exprPrefix != null;
            int k = 0;
            result.outputNames = Lists.newArrayListWithCapacity(incomingSchemaSize);
            for (int j = 0; j < incomingSchemaSize; j++) {
                // initialize
                result.outputNames.add(EMPTY_STRING);
            }
            for (final VectorWrapper<?> wrapper : incoming) {
                final ValueVector vvIn = wrapper.getValueVector();
                final String incomingName = vvIn.getField().getPath();
                // get the prefix of the name
                final String[] nameComponents = incomingName.split(StarColumnHelper.PREFIX_DELIMITER, 2);
                // if incoming valuevector does not have a prefix, ignore it since this expression is not referencing it
                if (nameComponents.length <= 1) {
                    k++;
                    continue;
                }
                final String namePrefix = nameComponents[0];
                if (exprPrefix.equalsIgnoreCase(namePrefix)) {
                    final String newName = incomingName;
                    if (!result.outputMap.containsKey(newName)) {
                        result.outputNames.set(k, newName);
                        result.outputMap.put(newName, newName);
                    }
                }
                k++;
            }
        } else {
            result.outputNames = Lists.newArrayList();
            if (exprContainsStar) {
                for (final VectorWrapper<?> wrapper : incoming) {
                    final ValueVector vvIn = wrapper.getValueVector();
                    final String incomingName = vvIn.getField().getPath();
                    if (refContainsStar) {
                        // allow dups since this is likely top-level project
                        addToResultMaps(incomingName, result, true);
                    } else {
                        addToResultMaps(incomingName, result, false);
                    }
                }
            } else {
                final String newName = expr.getPath();
                if (!refHasPrefix && !exprHasPrefix) {
                    // allow dups since this is likely top-level project
                    addToResultMaps(newName, result, true);
                } else {
                    addToResultMaps(newName, result, false);
                }
            }
        }
    } else if (exprIsStar) {
        // input is wildcard and it is not the first wildcard
        result.outputNames = Lists.newArrayList();
        for (final VectorWrapper<?> wrapper : incoming) {
            final ValueVector vvIn = wrapper.getValueVector();
            final String incomingName = vvIn.getField().getPath();
            // allow dups since this is likely top-level project
            addToResultMaps(incomingName, result, true);
        }
    } else if (!exprHasPrefix && refHasPrefix) {
        // only the output has prefix
        result.outputNames = Lists.newArrayList();
        final String newName = ref.getPath();
        addToResultMaps(newName, result, false);
    } else if (exprHasPrefix && !refHasPrefix) {
        // input has prefix but output does not
        int k = 0;
        result.outputNames = Lists.newArrayListWithCapacity(incomingSchemaSize);
        for (int j = 0; j < incomingSchemaSize; j++) {
            // initialize
            result.outputNames.add(EMPTY_STRING);
        }
        for (final VectorWrapper<?> wrapper : incoming) {
            final ValueVector vvIn = wrapper.getValueVector();
            final String name = vvIn.getField().getPath();
            final String[] components = name.split(StarColumnHelper.PREFIX_DELIMITER, 2);
            if (components.length <= 1) {
                k++;
                continue;
            }
            final String namePrefix = components[0];
            final String nameSuffix = components[1];
            if (exprPrefix.equalsIgnoreCase(namePrefix)) {
                // case insensitive matching of prefix
                if (refContainsStar) {
                    // remove the prefix from the incoming column names
                    // for top level we need to make names unique
                    final String newName = getUniqueName(nameSuffix, result);
                    result.outputNames.set(k, newName);
                } else if (exprSuffix.equalsIgnoreCase(nameSuffix)) {
                    // case insensitive matching of field name.
                    // example: ref: $f1, expr: T0<PREFIX><column_name>
                    final String newName = ref.getPath();
                    result.outputNames.set(k, newName);
                }
            } else {
                result.outputNames.add(EMPTY_STRING);
            }
            k++;
        }
    } else if (exprHasPrefix && refHasPrefix) {
        // input and output have prefixes, although they could be different
        final String[] input = expr.getPath().split(StarColumnHelper.PREFIX_DELIMITER, 2);
        assert (input.length == 2);
        // not handled yet
        assert false : "Unexpected project expression or reference";
    } else {
        // if the incoming schema's column name matches the expression name of the Project,
        // then we just want to pick the ref name as the output column name
        result.outputNames = Lists.newArrayList();
        for (final VectorWrapper<?> wrapper : incoming) {
            final ValueVector vvIn = wrapper.getValueVector();
            final String incomingName = vvIn.getField().getPath();
            if (expr.getPath().equalsIgnoreCase(incomingName)) {
                // case insensitive matching of field name.
                final String newName = ref.getPath();
                addToResultMaps(newName, result, true);
            }
        }
    }
}
Also used : ValueVector(org.apache.drill.exec.vector.ValueVector) NameSegment(org.apache.drill.common.expression.PathSegment.NameSegment) SchemaPath(org.apache.drill.common.expression.SchemaPath) VectorWrapper(org.apache.drill.exec.record.VectorWrapper)
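
The classification above leans on String.split with a limit of 2, so only the first delimiter separates the generated table prefix from the column name. A minimal standalone sketch of that behavior, assuming the delimiter value matches StarColumnHelper.PREFIX_DELIMITER in the Drill sources (the class name below is hypothetical):

public class PrefixSplitSketch {
    // assumed to match StarColumnHelper.PREFIX_DELIMITER in the Drill sources
    private static final String PREFIX_DELIMITER = "¦¦";

    public static void main(String[] args) {
        String expr = "T0" + PREFIX_DELIMITER + "name";
        // limit 2: any further delimiters stay inside the suffix
        String[] parts = expr.split(PREFIX_DELIMITER, 2);
        System.out.println("prefix = " + parts[0]); // T0
        System.out.println("suffix = " + parts[1]); // name
    }
}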

Example 22 with SchemaPath

use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

the class ProjectRecordBatch method isClassificationNeeded.

private boolean isClassificationNeeded(final List<NamedExpression> exprs) {
    boolean needed = false;
    for (int i = 0; i < exprs.size(); i++) {
        final NamedExpression ex = exprs.get(i);
        if (!(ex.getExpr() instanceof SchemaPath)) {
            continue;
        }
        final NameSegment expr = ((SchemaPath) ex.getExpr()).getRootSegment();
        final NameSegment ref = ex.getRef().getRootSegment();
        final boolean refHasPrefix = ref.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
        final boolean exprContainsStar = expr.getPath().contains(StarColumnHelper.STAR_COLUMN);
        if (refHasPrefix || exprContainsStar) {
            needed = true;
            break;
        }
    }
    return needed;
}
Also used : NameSegment(org.apache.drill.common.expression.PathSegment.NameSegment) SchemaPath(org.apache.drill.common.expression.SchemaPath) NamedExpression(org.apache.drill.common.logical.data.NamedExpression)
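
Only direct column references are SchemaPath instances, which is why the instanceof guard lets function calls and literals fall through. A hedged sketch of probing an expression the same way, using Drill's SchemaPath.getSimplePath factory (the expression string and delimiter value are illustrative):

import org.apache.drill.common.expression.PathSegment.NameSegment;
import org.apache.drill.common.expression.SchemaPath;

public class ClassificationProbe {
    public static void main(String[] args) {
        SchemaPath expr = SchemaPath.getSimplePath("T0¦¦*");
        NameSegment root = expr.getRootSegment();
        // the same two checks isClassificationNeeded applies per expression
        System.out.println("contains star: " + root.getPath().contains("*"));
        System.out.println("has prefix:    " + root.getPath().contains("¦¦"));
    }
}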

Example 23 with SchemaPath

use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

the class ParquetGroupScan method applyFilter.

public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
    if (fileSet.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
        // Stop pruning in three cases:
        //    -  the scan covers a single parquet file,
        //    -  the metadata does not support row group level filter pruning,
        //    -  # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
        return null;
    }
    final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
    final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
    // HashSet keeps a fileName unique.
    Set<String> qualifiedFileNames = Sets.newHashSet();
    ParquetFilterPredicate filterPredicate = null;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
        Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
            Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
            if (filterPredicate == null) {
                ErrorCollector errorCollector = new ErrorCollectorImpl();
                LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
                if (errorCollector.hasErrors()) {
                    logger.error("{} error(s) encountered when materialize filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
                    return null;
                }
                Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
                filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
                if (filterPredicate == null) {
                    return null;
                }
            }
            if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
                continue;
            }
            qualifiedRGs.add(rowGroup);
            // TODO : optimize when 1 file contains m row groups.
            qualifiedFileNames.add(file.getPath());
        }
    }
    if (qualifiedFileNames.size() == fileSet.size()) {
        // There is no reduction of rowGroups. Return the original groupScan.
        logger.debug("applyFilter does not have any pruning!");
        return null;
    } else if (qualifiedFileNames.size() == 0) {
        logger.warn("All rowgroups have been filtered out. Add back one to get schema from scannner");
        qualifiedFileNames.add(fileSet.iterator().next());
    }
    try {
        FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
        logger.info("applyFilter {} reduce parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
        return this.clone(newSelection);
    } catch (IOException e) {
        logger.warn("Could not apply filter prune due to Exception : {}", e);
        return null;
    }
}
Also used : ImplicitColumnExplorer(org.apache.drill.exec.store.ImplicitColumnExplorer) ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) ArrayList(java.util.ArrayList) ErrorCollector(org.apache.drill.common.expression.ErrorCollector) IOException(java.io.IOException) RowGroupMetadata(org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata) ErrorCollectorImpl(org.apache.drill.common.expression.ErrorCollectorImpl) LogicalExpression(org.apache.drill.common.expression.LogicalExpression) SchemaPath(org.apache.drill.common.expression.SchemaPath) ParquetFilterPredicate(org.apache.drill.exec.expr.stat.ParquetFilterPredicate) ParquetMetaStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetMetaStatCollector)
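
The SchemaPath-specific step is the first line of the method: a visitor walks the filter expression and returns every column it references, so statistics are collected only for those columns. A small wrapper showing that call in isolation, under the assumption that the same Drill classes are on the classpath:

import java.util.Set;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.store.parquet.ParquetRGFilterEvaluator;

public class ReferencedColumns {
    // FieldReferenceFinder accumulates every SchemaPath in the expression tree
    static Set<SchemaPath> of(LogicalExpression filterExpr) {
        return filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
    }
}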

Example 24 with SchemaPath

use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

the class Metadata method getParquetFileMetadata_v3.

/**
   * Get the metadata for a single Parquet file.
   *
   * @param parquetTableMetadata the table-level metadata being assembled, updated with column type info
   * @param file the status of the file whose footer is read
   * @return the per-file metadata, including row group and column statistics
   * @throws IOException if the Parquet footer cannot be read
   */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
    ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
    MessageType schema = metadata.getFileMetaData().getSchema();
    //    Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
    Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
    schema.getPaths();
    for (String[] path : schema.getPaths()) {
        colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
    }
    List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
    ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
    ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
    boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
    if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
    }
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            ColumnMetadata_v3 columnMetadata;
            boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
            Statistics<?> stats = col.getStatistics();
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
            ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            // Save the column schema info. We'll merge it into one list
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
            if (statsAvailable) {
                // Write stats when they are not null
                Object minValue = null;
                Object maxValue = null;
                if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
                    minValue = stats.genericGetMin();
                    maxValue = stats.genericGetMax();
                    if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
                        minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
                        maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
                    }
                }
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
            } else {
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
            }
            columnMetadataList.add(columnMetadata);
            length += col.getTotalSize();
        }
        // Note we still read the schema even if there are no values in the RowGroup
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
    return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ArrayList(java.util.ArrayList) SchemaPath(org.apache.drill.common.expression.SchemaPath) MessageType(org.apache.parquet.schema.MessageType)
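
The keys of colTypeInfoMap come from SchemaPath.getCompoundPath, which turns a Parquet column path (one array element per nesting level) into a Drill SchemaPath. A minimal sketch with an illustrative nested column:

import org.apache.drill.common.expression.SchemaPath;

public class CompoundPathSketch {
    public static void main(String[] args) {
        // one element per nesting level, e.g. the map column address.city
        String[] nested = {"address", "city"};
        SchemaPath column = SchemaPath.getCompoundPath(nested);
        System.out.println(column.getRootSegment().getPath()); // address
    }
}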

Example 25 with SchemaPath

use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

the class ParquetRGFilterEvaluator method evalFilter.

public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
    // figure out the set of columns referenced in expression.
    final Set<SchemaPath> schemaPathsInExpr = expr.accept(new FieldReferenceFinder(), null);
    final ColumnStatCollector columnStatCollector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
    Map<SchemaPath, ColumnStatistics> columnStatisticsMap = columnStatCollector.collectColStat(schemaPathsInExpr);
    boolean canDrop = canDrop(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry());
    return canDrop;
}
Also used : ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnStatCollector(org.apache.drill.exec.store.parquet.stat.ColumnStatCollector) ParquetFooterStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector)
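
A caller prunes one row group per call. A hedged usage sketch, where every argument is assumed to come from the enclosing fragment and scan setup:

import java.util.Map;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.parquet.ParquetRGFilterEvaluator;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupPruning {
    // true means the statistics prove the filter matches no rows in the group
    static boolean shouldSkip(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex,
            OptionManager options, FragmentContext context, Map<String, String> implicitCols) {
        return ParquetRGFilterEvaluator.evalFilter(expr, footer, rowGroupIndex, options, context, implicitCols);
    }
}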

Aggregations

SchemaPath (org.apache.drill.common.expression.SchemaPath): 74
Test (org.junit.Test): 23
FunctionImplementationRegistry (org.apache.drill.exec.expr.fn.FunctionImplementationRegistry): 17
FragmentContext (org.apache.drill.exec.ops.FragmentContext): 16
PhysicalPlan (org.apache.drill.exec.physical.PhysicalPlan): 15
FragmentRoot (org.apache.drill.exec.physical.base.FragmentRoot): 15
PhysicalPlanReader (org.apache.drill.exec.planner.PhysicalPlanReader): 15
LogicalExpression (org.apache.drill.common.expression.LogicalExpression): 12
ExecTest (org.apache.drill.exec.ExecTest): 12
BigIntVector (org.apache.drill.exec.vector.BigIntVector): 9
ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException): 8
IntVector (org.apache.drill.exec.vector.IntVector): 8
Path (org.apache.hadoop.fs.Path): 8
SchemaChangeException (org.apache.drill.exec.exception.SchemaChangeException): 7
ValueVector (org.apache.drill.exec.vector.ValueVector): 7
IOException (java.io.IOException): 6
ErrorCollector (org.apache.drill.common.expression.ErrorCollector): 6
TypedFieldId (org.apache.drill.exec.record.TypedFieldId): 6
ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl): 5
FieldReference (org.apache.drill.common.expression.FieldReference): 5