
Example 26 with SchemaPath

Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

The class ProjectRecordBatch, method isClassificationNeeded.

private boolean isClassificationNeeded(final List<NamedExpression> exprs) {
    boolean needed = false;
    for (int i = 0; i < exprs.size(); i++) {
        final NamedExpression ex = exprs.get(i);
        if (!(ex.getExpr() instanceof SchemaPath)) {
            continue;
        }
        final NameSegment expr = ((SchemaPath) ex.getExpr()).getRootSegment();
        final NameSegment ref = ex.getRef().getRootSegment();
        final boolean refHasPrefix = ref.getPath().contains(StarColumnHelper.PREFIX_DELIMITER);
        final boolean exprContainsStar = expr.getPath().contains(StarColumnHelper.STAR_COLUMN);
        if (refHasPrefix || exprContainsStar) {
            needed = true;
            break;
        }
    }
    return needed;
}
Also used : NameSegment(org.apache.drill.common.expression.PathSegment.NameSegment) SchemaPath(org.apache.drill.common.expression.SchemaPath) NamedExpression(org.apache.drill.common.logical.data.NamedExpression)
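
The check above only looks at the textual root segment of each column reference: the projection needs classification when the output ref carries a star prefix or the expression itself is a star column. Below is a minimal, self-contained sketch of that check, assuming placeholder constants STAR and PREFIX_DELIMITER in place of StarColumnHelper.STAR_COLUMN and StarColumnHelper.PREFIX_DELIMITER, and using SchemaPath.getSimplePath only to build example inputs.

import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.expression.PathSegment.NameSegment;

public class ClassificationSketch {
    // Placeholder stand-ins for StarColumnHelper.STAR_COLUMN / PREFIX_DELIMITER (assumed values).
    private static final String STAR = "*";
    private static final String PREFIX_DELIMITER = "||";

    // True when the projected expression expands a star column or the output ref carries a prefix.
    static boolean needsClassification(SchemaPath expr, SchemaPath ref) {
        NameSegment exprRoot = expr.getRootSegment();
        NameSegment refRoot = ref.getRootSegment();
        return refRoot.getPath().contains(PREFIX_DELIMITER)
            || exprRoot.getPath().contains(STAR);
    }

    public static void main(String[] args) {
        SchemaPath star = SchemaPath.getSimplePath("*");
        SchemaPath plain = SchemaPath.getSimplePath("employee_id");
        System.out.println(needsClassification(star, plain));   // true: star column expression
        System.out.println(needsClassification(plain, plain));  // false: plain column projection
    }
}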

Example 27 with SchemaPath

Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

The class ParquetGroupScan, method applyFilter.

public GroupScan applyFilter(LogicalExpression filterExpr, UdfUtilities udfUtilities, FunctionImplementationRegistry functionImplementationRegistry, OptionManager optionManager) {
    if (fileSet.size() == 1 || !(parquetTableMetadata.isRowGroupPrunable()) || rowGroupInfos.size() > optionManager.getOption(PlannerSettings.PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD)) {
        // Stop pruning in three cases:
        //    -  the scan covers a single parquet file,
        //    -  the metadata does not support row group level filter pruning,
        //    -  the # of row groups is beyond PARQUET_ROWGROUP_FILTER_PUSHDOWN_PLANNING_THRESHOLD.
        return null;
    }
    final Set<SchemaPath> schemaPathsInExpr = filterExpr.accept(new ParquetRGFilterEvaluator.FieldReferenceFinder(), null);
    final List<RowGroupMetadata> qualifiedRGs = new ArrayList<>(parquetTableMetadata.getFiles().size());
    // HashSet keeps a fileName unique.
    Set<String> qualifiedFileNames = Sets.newHashSet();
    ParquetFilterPredicate filterPredicate = null;
    for (ParquetFileMetadata file : parquetTableMetadata.getFiles()) {
        final ImplicitColumnExplorer columnExplorer = new ImplicitColumnExplorer(optionManager, this.columns);
        Map<String, String> implicitColValues = columnExplorer.populateImplicitColumns(file.getPath(), selectionRoot);
        for (RowGroupMetadata rowGroup : file.getRowGroups()) {
            ParquetMetaStatCollector statCollector = new ParquetMetaStatCollector(parquetTableMetadata, rowGroup.getColumns(), implicitColValues);
            Map<SchemaPath, ColumnStatistics> columnStatisticsMap = statCollector.collectColStat(schemaPathsInExpr);
            if (filterPredicate == null) {
                ErrorCollector errorCollector = new ErrorCollectorImpl();
                LogicalExpression materializedFilter = ExpressionTreeMaterializer.materializeFilterExpr(filterExpr, columnStatisticsMap, errorCollector, functionImplementationRegistry);
                if (errorCollector.hasErrors()) {
                    logger.error("{} error(s) encountered when materialize filter expression : {}", errorCollector.getErrorCount(), errorCollector.toErrorString());
                    return null;
                }
                //    logger.debug("materializedFilter : {}", ExpressionStringBuilder.toString(materializedFilter));
                Set<LogicalExpression> constantBoundaries = ConstantExpressionIdentifier.getConstantExpressionSet(materializedFilter);
                filterPredicate = (ParquetFilterPredicate) ParquetFilterBuilder.buildParquetFilterPredicate(materializedFilter, constantBoundaries, udfUtilities);
                if (filterPredicate == null) {
                    return null;
                }
            }
            if (ParquetRGFilterEvaluator.canDrop(filterPredicate, columnStatisticsMap, rowGroup.getRowCount())) {
                continue;
            }
            qualifiedRGs.add(rowGroup);
            // TODO : optimize when 1 file contains m row groups.
            qualifiedFileNames.add(file.getPath());
        }
    }
    if (qualifiedFileNames.size() == fileSet.size()) {
        // There is no reduction of rowGroups. Return the original groupScan.
        logger.debug("applyFilter does not have any pruning!");
        return null;
    } else if (qualifiedFileNames.size() == 0) {
        logger.warn("All rowgroups have been filtered out. Add back one to get schema from scannner");
        qualifiedFileNames.add(fileSet.iterator().next());
    }
    try {
        FileSelection newSelection = new FileSelection(null, Lists.newArrayList(qualifiedFileNames), getSelectionRoot(), cacheFileRoot, false);
        logger.info("applyFilter {} reduce parquet file # from {} to {}", ExpressionStringBuilder.toString(filterExpr), fileSet.size(), qualifiedFileNames.size());
        return this.clone(newSelection);
    } catch (IOException e) {
        logger.warn("Could not apply filter prune due to Exception : {}", e);
        return null;
    }
}
Also used : ImplicitColumnExplorer(org.apache.drill.exec.store.ImplicitColumnExplorer) ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) ParquetFileMetadata(org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata) ArrayList(java.util.ArrayList) ErrorCollector(org.apache.drill.common.expression.ErrorCollector) IOException(java.io.IOException) RowGroupMetadata(org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata) ErrorCollectorImpl(org.apache.drill.common.expression.ErrorCollectorImpl) LogicalExpression(org.apache.drill.common.expression.LogicalExpression) SchemaPath(org.apache.drill.common.expression.SchemaPath) ParquetFilterPredicate(org.apache.drill.exec.expr.stat.ParquetFilterPredicate) ParquetMetaStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetMetaStatCollector)
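
Stripped of the Drill-specific machinery, applyFilter keeps a row group only when its column statistics cannot prove the filter unsatisfiable, and gives up (returns null) when nothing was pruned. The following is a hedged, self-contained sketch of that pruning loop; RowGroupStats and the hard-coded age filter are hypothetical stand-ins, not Drill's ColumnStatistics or ParquetFilterPredicate.

import java.util.ArrayList;
import java.util.List;

public class PruningSketch {

    // Hypothetical per-row-group statistics for a single column.
    static class RowGroupStats {
        final String file;
        final long minAge;
        final long maxAge;
        RowGroupStats(String file, long minAge, long maxAge) {
            this.file = file;
            this.minAge = minAge;
            this.maxAge = maxAge;
        }
    }

    // Filter "age > threshold": a row group can be dropped when even its maximum age is too small.
    static boolean canDrop(RowGroupStats stats, long threshold) {
        return stats.maxAge <= threshold;
    }

    public static void main(String[] args) {
        List<RowGroupStats> rowGroups = new ArrayList<>();
        rowGroups.add(new RowGroupStats("part-0.parquet", 18, 42));
        rowGroups.add(new RowGroupStats("part-1.parquet", 30, 75));

        List<RowGroupStats> qualified = new ArrayList<>();
        for (RowGroupStats rg : rowGroups) {
            if (canDrop(rg, 50)) {
                continue;            // statistics prove no row in this group can match
            }
            qualified.add(rg);
        }
        if (qualified.size() == rowGroups.size()) {
            System.out.println("no pruning possible, keep the original scan");
        } else {
            System.out.println(qualified.size() + " of " + rowGroups.size() + " row groups survive");
        }
    }
}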

Example 28 with SchemaPath

Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

The class Metadata, method getParquetFileMetadata_v3.

/**
   * Get the metadata for a single Parquet file.
   *
   * @param parquetTableMetadata the table-level metadata object the file's column type info is merged into
   * @param file the file whose footer is read
   * @return the per-file metadata, including row group and column statistics
   * @throws IOException if the file footer cannot be read
   */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, FileStatus file) throws IOException {
    ParquetMetadata metadata = ParquetFileReader.readFooter(fs.getConf(), file);
    MessageType schema = metadata.getFileMetaData().getSchema();
    //    Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
    Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
    for (String[] path : schema.getPaths()) {
        colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
    }
    List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
    ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
    ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
    boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
    if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
    }
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            ColumnMetadata_v3 columnMetadata;
            boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
            Statistics<?> stats = col.getStatistics();
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
            ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            // Save the column schema info. We'll merge it into one list
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
            if (statsAvailable) {
                // Write stats when they are not null
                Object minValue = null;
                Object maxValue = null;
                if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
                    minValue = stats.genericGetMin();
                    maxValue = stats.genericGetMax();
                    if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
                        minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
                        maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
                    }
                }
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
            } else {
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
            }
            columnMetadataList.add(columnMetadata);
            length += col.getTotalSize();
        }
        // Note we still read the schema even if there are no values in the RowGroup
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
    return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ArrayList(java.util.ArrayList) SchemaPath(org.apache.drill.common.expression.SchemaPath) MessageType(org.apache.parquet.schema.MessageType)
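
The core of getParquetFileMetadata_v3 is walking the Parquet footer and recording per-column min/max statistics for every row group. As a smaller illustration, the sketch below reads a footer and prints those statistics; the file path is a placeholder, and it assumes the same ParquetFileReader.readFooter / getStatistics calls shown above are available on the classpath.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterStatsSketch {
    public static void main(String[] args) throws Exception {
        // "/tmp/sample.parquet" is a placeholder; point it at any readable Parquet file.
        ParquetMetadata footer =
            ParquetFileReader.readFooter(new Configuration(), new Path("/tmp/sample.parquet"));
        for (BlockMetaData rowGroup : footer.getBlocks()) {
            for (ColumnChunkMetaData col : rowGroup.getColumns()) {
                String name = Arrays.toString(col.getPath().toArray());
                Statistics<?> stats = col.getStatistics();
                if (stats != null && !stats.isEmpty()) {
                    System.out.println(name + " min=" + stats.genericGetMin()
                        + " max=" + stats.genericGetMax() + " nulls=" + stats.getNumNulls());
                } else {
                    System.out.println(name + " has no statistics");
                }
            }
        }
    }
}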

Example 29 with SchemaPath

Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

The class ParquetRGFilterEvaluator, method evalFilter.

public static boolean evalFilter(LogicalExpression expr, ParquetMetadata footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext, Map<String, String> implicitColValues) {
    // figure out the set of columns referenced in expression.
    final Set<SchemaPath> schemaPathsInExpr = expr.accept(new FieldReferenceFinder(), null);
    final ColumnStatCollector columnStatCollector = new ParquetFooterStatCollector(footer, rowGroupIndex, implicitColValues, true, options);
    Map<SchemaPath, ColumnStatistics> columnStatisticsMap = columnStatCollector.collectColStat(schemaPathsInExpr);
    boolean canDrop = canDrop(expr, columnStatisticsMap, footer.getBlocks().get(rowGroupIndex).getRowCount(), fragmentContext, fragmentContext.getFunctionRegistry());
    return canDrop;
}
Also used : ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnStatCollector(org.apache.drill.exec.store.parquet.stat.ColumnStatCollector) ParquetFooterStatCollector(org.apache.drill.exec.store.parquet.stat.ParquetFooterStatCollector)
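
FieldReferenceFinder is the visitor that walks the filter expression and returns the set of SchemaPath references it mentions, so statistics are collected only for columns the filter actually touches. Below is a toy, self-contained illustration of that idea; the Node expression model is hypothetical and stands in for Drill's LogicalExpression tree.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class FieldFinderSketch {

    // Hypothetical mini expression model: a node is either a column reference or an operator over children.
    static class Node {
        final String column;        // non-null only for a column reference
        final List<Node> children;  // operator arguments; empty for leaves
        Node(String column, Node... children) {
            this.column = column;
            this.children = Arrays.asList(children);
        }
        static Node col(String name) { return new Node(name); }
        static Node op(Node... args) { return new Node(null, args); }
    }

    // Recursively collect every referenced column, analogous to FieldReferenceFinder collecting SchemaPaths.
    static void collectFields(Node node, Set<String> out) {
        if (node.column != null) {
            out.add(node.column);
        }
        for (Node child : node.children) {
            collectFields(child, out);
        }
    }

    public static void main(String[] args) {
        // A filter like (age > 50) AND (salary > 100000), literals omitted, references two columns.
        Node filter = Node.op(Node.op(Node.col("age")), Node.op(Node.col("salary")));
        Set<String> fields = new HashSet<>();
        collectFields(filter, fields);
        System.out.println(fields);   // prints the two column names in some order
    }
}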

Example 30 with SchemaPath

Use of org.apache.drill.common.expression.SchemaPath in project drill by apache.

The class MockGroupScanPOP, method clone.

@Override
public GroupScan clone(List<SchemaPath> columns) {
    if (columns.isEmpty()) {
        throw new IllegalArgumentException("No columns for mock scan");
    }
    List<MockColumn> mockCols = new ArrayList<>();
    Pattern p = Pattern.compile("(\\w+)_([isdb])(\\d*)");
    for (SchemaPath path : columns) {
        String col = path.getLastSegment().getNameSegment().getPath();
        if (col.equals("*")) {
            return this;
        }
        Matcher m = p.matcher(col);
        if (!m.matches()) {
            throw new IllegalArgumentException("Badly formatted mock column name: " + col);
        }
        @SuppressWarnings("unused") String name = m.group(1);
        String type = m.group(2);
        String length = m.group(3);
        int width = 10;
        if (!length.isEmpty()) {
            width = Integer.parseInt(length);
        }
        MinorType minorType;
        switch(type) {
            case "i":
                minorType = MinorType.INT;
                break;
            case "s":
                minorType = MinorType.VARCHAR;
                break;
            case "d":
                minorType = MinorType.FLOAT8;
                break;
            case "b":
                minorType = MinorType.BIT;
                break;
            default:
                throw new IllegalArgumentException("Unsupported field type " + type + " for mock column " + col);
        }
        MockTableDef.MockColumn mockCol = new MockColumn(col, minorType, DataMode.REQUIRED, width, 0, 0, null, 1, null);
        mockCols.add(mockCol);
    }
    MockScanEntry entry = readEntries.get(0);
    MockColumn[] types = new MockColumn[mockCols.size()];
    mockCols.toArray(types);
    MockScanEntry newEntry = new MockScanEntry(entry.records, true, 0, 1, types);
    List<MockScanEntry> newEntries = new ArrayList<>();
    newEntries.add(newEntry);
    return new MockGroupScanPOP(url, newEntries);
}
Also used : Pattern(java.util.regex.Pattern) MockScanEntry(org.apache.drill.exec.store.mock.MockTableDef.MockScanEntry) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) DrillbitEndpoint(org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint) SchemaPath(org.apache.drill.common.expression.SchemaPath) MockColumn(org.apache.drill.exec.store.mock.MockTableDef.MockColumn) MinorType(org.apache.drill.common.types.TypeProtos.MinorType)
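
The clone method encodes a mock column's type and default width directly in its name (for example employee_i or name_s20). A small stand-alone sketch of just that parsing step, using the same regular expression; the sample column names are made up for illustration.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MockColumnNameSketch {
    public static void main(String[] args) {
        // Same pattern as clone() above: <name>_<type letter i|s|d|b><optional width>.
        Pattern p = Pattern.compile("(\\w+)_([isdb])(\\d*)");
        for (String col : new String[] {"employee_i", "name_s20", "rate_d", "bad-name"}) {
            Matcher m = p.matcher(col);
            if (!m.matches()) {
                System.out.println(col + ": badly formatted mock column name");
                continue;
            }
            int width = m.group(3).isEmpty() ? 10 : Integer.parseInt(m.group(3));
            System.out.println(col + ": type letter '" + m.group(2) + "', width " + width);
        }
    }
}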

Aggregations

SchemaPath (org.apache.drill.common.expression.SchemaPath) 74
Test (org.junit.Test) 23
FunctionImplementationRegistry (org.apache.drill.exec.expr.fn.FunctionImplementationRegistry) 17
FragmentContext (org.apache.drill.exec.ops.FragmentContext) 16
PhysicalPlan (org.apache.drill.exec.physical.PhysicalPlan) 15
FragmentRoot (org.apache.drill.exec.physical.base.FragmentRoot) 15
PhysicalPlanReader (org.apache.drill.exec.planner.PhysicalPlanReader) 15
LogicalExpression (org.apache.drill.common.expression.LogicalExpression) 12
ExecTest (org.apache.drill.exec.ExecTest) 12
BigIntVector (org.apache.drill.exec.vector.BigIntVector) 9
ExecutionSetupException (org.apache.drill.common.exceptions.ExecutionSetupException) 8
IntVector (org.apache.drill.exec.vector.IntVector) 8
Path (org.apache.hadoop.fs.Path) 8
SchemaChangeException (org.apache.drill.exec.exception.SchemaChangeException) 7
ValueVector (org.apache.drill.exec.vector.ValueVector) 7
IOException (java.io.IOException) 6
ErrorCollector (org.apache.drill.common.expression.ErrorCollector) 6
TypedFieldId (org.apache.drill.exec.record.TypedFieldId) 6
ErrorCollectorImpl (org.apache.drill.common.expression.ErrorCollectorImpl) 5
FieldReference (org.apache.drill.common.expression.FieldReference) 5