Example 1 with Multimap

Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.

From class HiveFunctionRegistry, method register:

private <I> void register(Class<? extends I> clazz, Multimap<String, Class<? extends I>> methods) {
    Description desc = clazz.getAnnotation(Description.class);
    Stream<String> namesStream;
    if (desc != null) {
        namesStream = Stream.of(desc.name().split(",")).map(String::trim);
    } else {
        namesStream = Stream.of(clazz).map(Class::getName).map(name -> name.replace('.', '_'));
    }
    // Check each function name against FUNCTION_REPLACE_MAP and substitute
    // the replacement name where one is defined, then register the class
    // under every resulting (lower-cased) name.
    namesStream.map(String::toLowerCase)
        .map(functionName -> FUNCTION_REPLACE_MAP.getOrDefault(functionName, functionName))
        .forEach(name -> methods.put(name, clazz));
    UDFType type = clazz.getAnnotation(UDFType.class);
    if (type != null && !type.deterministic()) {
        nonDeterministicUDFs.add(clazz);
    }
}
Also used : UDFType(org.apache.hadoop.hive.ql.udf.UDFType) ObjectInspectorHelper(org.apache.drill.exec.expr.fn.impl.hive.ObjectInspectorHelper) ClassPathScanner(org.apache.drill.common.scanner.ClassPathScanner) UserException(org.apache.drill.common.exceptions.UserException) LoggerFactory(org.slf4j.LoggerFactory) Types(org.apache.drill.common.types.Types) Sets(org.apache.drill.shaded.guava.com.google.common.collect.Sets) Description(org.apache.hadoop.hive.ql.exec.Description) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) HashSet(java.util.HashSet) UDF(org.apache.hadoop.hive.ql.exec.UDF) MajorType(org.apache.drill.common.types.TypeProtos.MajorType) FunctionCall(org.apache.drill.common.expression.FunctionCall) Map(java.util.Map) HiveUDFOperatorWithoutInference(org.apache.drill.exec.planner.sql.HiveUDFOperatorWithoutInference) DrillOperatorTable(org.apache.drill.exec.planner.sql.DrillOperatorTable) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) HiveUDFOperator(org.apache.drill.exec.planner.sql.HiveUDFOperator) SqlOperatorBinding(org.apache.calcite.sql.SqlOperatorBinding) GenericUDF(org.apache.hadoop.hive.ql.udf.generic.GenericUDF) RelDataType(org.apache.calcite.rel.type.RelDataType) Logger(org.slf4j.Logger) SqlTypeName(org.apache.calcite.sql.type.SqlTypeName) TypeInferenceUtils(org.apache.drill.exec.planner.sql.TypeInferenceUtils) GenericUDFBridge(org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge) Set(java.util.Set) SqlReturnTypeInference(org.apache.calcite.sql.type.SqlReturnTypeInference) TypeProtos(org.apache.drill.common.types.TypeProtos) Stream(java.util.stream.Stream) DrillConfig(org.apache.drill.common.config.DrillConfig) ImmutableMap(org.apache.drill.shaded.guava.com.google.common.collect.ImmutableMap) SqlStdOperatorTable(org.apache.calcite.sql.fun.SqlStdOperatorTable) OracleSqlOperatorTable(org.apache.calcite.sql.fun.OracleSqlOperatorTable) ScanResult(org.apache.drill.common.scanner.persistence.ScanResult)
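
To make the registration pattern above concrete, here is a minimal, self-contained sketch of the same Multimap idea: several function names (aliases) mapping to one implementation class. It is illustrative only; the alias names are hypothetical, String.class stands in for a real UDF class, and it uses the unshaded Guava coordinates (com.google.common.collect) instead of Drill's shaded copy so it compiles standalone.

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

public class MultiNameRegistryDemo {
    public static void main(String[] args) {
        // One UDF class may be registered under several comma-separated aliases,
        // e.g. @Description(name = "lower,lcase") in the Hive registry above.
        Multimap<String, Class<?>> methods = ArrayListMultimap.create();
        Class<?> udfClass = String.class; // stand-in for a real UDF class

        for (String name : "lower,lcase".split(",")) {
            methods.put(name.trim().toLowerCase(), udfClass);
        }

        // Both aliases resolve to the same implementation class; a plain Map
        // would have required a List value type and manual bookkeeping.
        System.out.println(methods.get("lower")); // [class java.lang.String]
        System.out.println(methods.get("lcase")); // [class java.lang.String]
    }
}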

Example 2 with Multimap

Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.

From class AbstractParquetGroupScan, method modifyFileSelection:

// limit push down methods end
// helper method used for partition pruning and filter push down
@Override
public void modifyFileSelection(FileSelection selection) {
    super.modifyFileSelection(selection);
    List<Path> files = selection.getFiles();
    fileSet = new HashSet<>(files);
    entries = new ArrayList<>(files.size());
    entries.addAll(files.stream()
        .map(ReadEntryWithPath::new)
        .collect(Collectors.toList()));
    Multimap<Path, RowGroupMetadata> newRowGroups = LinkedListMultimap.create();
    if (!getRowGroupsMetadata().isEmpty()) {
        getRowGroupsMetadata().entries().stream()
            .filter(entry -> fileSet.contains(entry.getKey()))
            .forEachOrdered(entry -> newRowGroups.put(entry.getKey(), entry.getValue()));
    }
    this.rowGroups = newRowGroups;
    tableMetadata = TableMetadataUtils.updateRowCount(getTableMetadata(), getRowGroupsMetadata().values());
    if (!getFilesMetadata().isEmpty()) {
        this.files = getFilesMetadata().entrySet().stream()
            .filter(entry -> fileSet.contains(entry.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    } else {
        this.files = Collections.emptyMap();
    }
    List<PartitionMetadata> newPartitions = new ArrayList<>();
    if (!getPartitionsMetadata().isEmpty()) {
        for (PartitionMetadata entry : getPartitionsMetadata()) {
            for (Path partLocation : entry.getLocations()) {
                if (fileSet.contains(partLocation)) {
                    newPartitions.add(entry);
                    break;
                }
            }
        }
    }
    partitions = newPartitions;
    if (!getSegmentsMetadata().isEmpty()) {
        this.segments = getSegmentsMetadata().entrySet().stream()
            .filter(entry -> fileSet.contains(entry.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }
    rowGroupInfos = null;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataType(org.apache.drill.metastore.metadata.MetadataType) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) LoggerFactory(org.slf4j.LoggerFactory) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) FunctionImplementationRegistry(org.apache.drill.exec.expr.fn.FunctionImplementationRegistry) ExpressionStringBuilder(org.apache.drill.common.expression.ExpressionStringBuilder) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) ListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap) UdfUtilities(org.apache.drill.exec.ops.UdfUtilities) Map(java.util.Map) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) Collection(java.util.Collection) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) Set(java.util.Set) LogicalExpression(org.apache.drill.common.expression.LogicalExpression) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) AbstractGroupScanWithMetadata(org.apache.drill.exec.physical.base.AbstractGroupScanWithMetadata) List(java.util.List) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) TableMetadataUtils(org.apache.drill.metastore.util.TableMetadataUtils) FilterPredicate(org.apache.drill.exec.expr.FilterPredicate) OptionManager(org.apache.drill.exec.server.options.OptionManager) HashMap(java.util.HashMap) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) CoordinationProtos(org.apache.drill.exec.proto.CoordinationProtos) AffinityCreator(org.apache.drill.exec.store.schedule.AffinityCreator) EndpointByteMapImpl(org.apache.drill.exec.store.schedule.EndpointByteMapImpl) CollectionUtils(org.apache.commons.collections.CollectionUtils) JsonIgnore(com.fasterxml.jackson.annotation.JsonIgnore) EndpointByteMap(org.apache.drill.exec.store.schedule.EndpointByteMap) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) MapUtils(org.apache.commons.collections.MapUtils) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) IOException(java.io.IOException) ParquetMetadataProvider(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProvider) LinkedListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) Statistic(org.apache.drill.metastore.statistics.Statistic) LocationProvider(org.apache.drill.metastore.metadata.LocationProvider) EndpointAffinity(org.apache.drill.exec.physical.EndpointAffinity) GroupScan(org.apache.drill.exec.physical.base.GroupScan) JsonInclude(com.fasterxml.jackson.annotation.JsonInclude) Collections(java.util.Collections) ParquetMetadataProviderBuilder(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProviderBuilder) AssignmentCreator(org.apache.drill.exec.store.schedule.AssignmentCreator)
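
The core Multimap move in modifyFileSelection is rebuilding an insertion-ordered multimap from just those entries whose keys survive a filter. Below is a minimal sketch of that pattern, with string paths and integer row group ids standing in for Path and RowGroupMetadata, and unshaded Guava so it compiles standalone.

import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.Multimap;
import java.util.Set;

public class FilterMultimapDemo {
    public static void main(String[] args) {
        // File path -> row group ids; one file can map to many row groups.
        Multimap<String, Integer> rowGroups = LinkedListMultimap.create();
        rowGroups.put("/data/a.parquet", 0);
        rowGroups.put("/data/a.parquet", 1);
        rowGroups.put("/data/b.parquet", 0);

        // Keep only entries whose file survived the new selection; iterating
        // entries() and re-putting preserves the original entry order.
        Set<String> fileSet = Set.of("/data/a.parquet");
        Multimap<String, Integer> newRowGroups = LinkedListMultimap.create();
        rowGroups.entries().stream()
            .filter(entry -> fileSet.contains(entry.getKey()))
            .forEachOrdered(entry -> newRowGroups.put(entry.getKey(), entry.getValue()));

        System.out.println(newRowGroups); // {/data/a.parquet=[0, 1]}
    }
}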

Example 3 with Multimap

Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.

From class MetastoreAnalyzeTableHandler, method convertToDrel:

/**
 * Converts the given plan to a Drill logical plan.
 */
private DrillRel convertToDrel(RelNode relNode, SqlMetastoreAnalyzeTable sqlAnalyzeTable, DrillTableInfo drillTableInfo) throws ForemanSetupException, IOException {
    RelBuilder relBuilder = LOGICAL_BUILDER.create(relNode.getCluster(), null);
    DrillTable table = drillTableInfo.drillTable();
    AnalyzeInfoProvider analyzeInfoProvider = table.getGroupScan().getAnalyzeInfoProvider();
    List<String> schemaPath = drillTableInfo.schemaPath();
    String pluginName = schemaPath.get(0);
    String workspaceName = Strings.join(schemaPath.subList(1, schemaPath.size()), AbstractSchema.SCHEMA_SEPARATOR);
    String tableName = drillTableInfo.tableName();
    TableInfo tableInfo = TableInfo.builder()
        .name(tableName)
        .owner(table.getUserName())
        .type(analyzeInfoProvider.getTableTypeName())
        .storagePlugin(pluginName)
        .workspace(workspaceName)
        .build();
    ColumnNamesOptions columnNamesOptions = new ColumnNamesOptions(context.getOptions());
    List<String> segmentColumns = analyzeInfoProvider.getSegmentColumns(table, columnNamesOptions).stream()
        .map(SchemaPath::getRootSegmentPath)
        .collect(Collectors.toList());
    List<NamedExpression> segmentExpressions = segmentColumns.stream()
        .map(partitionName -> new NamedExpression(SchemaPath.getSimplePath(partitionName),
            FieldReference.getWithQuotedRef(partitionName)))
        .collect(Collectors.toList());
    List<MetadataInfo> rowGroupsInfo = Collections.emptyList();
    List<MetadataInfo> filesInfo = Collections.emptyList();
    Multimap<Integer, MetadataInfo> segments = ArrayListMultimap.create();
    BasicTablesRequests basicRequests;
    try {
        basicRequests = context.getMetastoreRegistry().get().tables().basicRequests();
    } catch (MetastoreException e) {
        logger.error("Error when obtaining Metastore instance for table {}", tableName, e);
        DrillRel convertedRelNode = convertToRawDrel(relBuilder
            .values(new String[] { MetastoreAnalyzeConstants.OK_FIELD_NAME, MetastoreAnalyzeConstants.SUMMARY_FIELD_NAME },
                false, e.getMessage())
            .build());
        return new DrillScreenRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode);
    }
    MetadataType metadataLevel = getMetadataType(sqlAnalyzeTable);
    List<SchemaPath> interestingColumns = sqlAnalyzeTable.getFieldNames();
    MetastoreTableInfo metastoreTableInfo = basicRequests.metastoreTableInfo(tableInfo);
    List<MetadataInfo> allMetaToHandle = null;
    List<MetadataInfo> metadataToRemove = new ArrayList<>();
    // Step 1: check whether an incremental analyze can be produced
    if (metastoreTableInfo.isExists()) {
        RelNode finalRelNode = relNode;
        CheckedSupplier<TableScan, SqlUnsupportedException> tableScanSupplier = () -> DrillRelOptUtil.findScan(convertToDrel(finalRelNode.getInput(0)));
        MetadataInfoCollector metadataInfoCollector = analyzeInfoProvider.getMetadataInfoCollector(basicRequests,
            tableInfo, (FormatSelection) table.getSelection(), context.getPlannerSettings(),
            tableScanSupplier, interestingColumns, metadataLevel, segmentColumns.size());
        if (!metadataInfoCollector.isOutdated()) {
            DrillRel convertedRelNode = convertToRawDrel(relBuilder
                .values(new String[] { MetastoreAnalyzeConstants.OK_FIELD_NAME, MetastoreAnalyzeConstants.SUMMARY_FIELD_NAME },
                    false, "Table metadata is up to date, analyze wasn't performed.")
                .build());
            return new DrillScreenRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode);
        }
        // update the scan to read updated / new files; removed files are passed to the metadata handler
        relNode = relNode.copy(relNode.getTraitSet(), Collections.singletonList(metadataInfoCollector.getPrunedScan()));
        filesInfo = metadataInfoCollector.getFilesInfo();
        segments = metadataInfoCollector.getSegmentsInfo();
        rowGroupsInfo = metadataInfoCollector.getRowGroupsInfo();
        allMetaToHandle = metadataInfoCollector.getAllMetaToHandle();
        metadataToRemove = metadataInfoCollector.getMetadataToRemove();
    }
    // Step 2: construct the plan that produces the analyze
    DrillRel convertedRelNode = convertToRawDrel(relNode);
    boolean createNewAggregations = true;
    // List of columns for which statistics should be collected: interesting columns + segment columns
    List<SchemaPath> statisticsColumns = interestingColumns == null ? null : new ArrayList<>(interestingColumns);
    if (statisticsColumns != null) {
        segmentColumns.stream().map(SchemaPath::getSimplePath).forEach(statisticsColumns::add);
    }
    SchemaPath locationField = analyzeInfoProvider.getLocationField(columnNamesOptions);
    if (analyzeInfoProvider.supportsMetadataType(MetadataType.ROW_GROUP) && metadataLevel.includes(MetadataType.ROW_GROUP)) {
        MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
            .tableInfo(tableInfo)
            .metadataToHandle(rowGroupsInfo)
            .metadataType(MetadataType.ROW_GROUP)
            .depthLevel(segmentExpressions.size())
            .segmentColumns(segmentColumns)
            .build();
        convertedRelNode = getRowGroupAggRelNode(segmentExpressions, convertedRelNode, createNewAggregations, statisticsColumns, handlerContext);
        createNewAggregations = false;
        locationField = SchemaPath.getSimplePath(MetastoreAnalyzeConstants.LOCATION_FIELD);
    }
    if (analyzeInfoProvider.supportsMetadataType(MetadataType.FILE) && metadataLevel.includes(MetadataType.FILE)) {
        MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
            .tableInfo(tableInfo)
            .metadataToHandle(filesInfo)
            .metadataType(MetadataType.FILE)
            .depthLevel(segmentExpressions.size())
            .segmentColumns(segmentColumns)
            .build();
        convertedRelNode = getFileAggRelNode(segmentExpressions, convertedRelNode, createNewAggregations, statisticsColumns, locationField, handlerContext);
        locationField = SchemaPath.getSimplePath(MetastoreAnalyzeConstants.LOCATION_FIELD);
        createNewAggregations = false;
    }
    if (analyzeInfoProvider.supportsMetadataType(MetadataType.SEGMENT) && metadataLevel.includes(MetadataType.SEGMENT)) {
        for (int i = segmentExpressions.size(); i > 0; i--) {
            MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
                .tableInfo(tableInfo)
                .metadataToHandle(new ArrayList<>(segments.get(i - 1)))
                .metadataType(MetadataType.SEGMENT)
                .depthLevel(i)
                .segmentColumns(segmentColumns.subList(0, i))
                .build();
            convertedRelNode = getSegmentAggRelNode(segmentExpressions, convertedRelNode, createNewAggregations, statisticsColumns, locationField, i, handlerContext);
            locationField = SchemaPath.getSimplePath(MetastoreAnalyzeConstants.LOCATION_FIELD);
            createNewAggregations = false;
        }
    }
    if (analyzeInfoProvider.supportsMetadataType(MetadataType.TABLE) && metadataLevel.includes(MetadataType.TABLE)) {
        MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
            .tableInfo(tableInfo)
            .metadataToHandle(Collections.emptyList())
            .metadataType(MetadataType.TABLE)
            .depthLevel(segmentExpressions.size())
            .segmentColumns(segmentColumns)
            .build();
        convertedRelNode = getTableAggRelNode(convertedRelNode, createNewAggregations, statisticsColumns, locationField, handlerContext);
    } else {
        throw new IllegalStateException("Analyze table with NONE level");
    }
    boolean useStatistics = context.getOptions().getOption(PlannerSettings.STATISTICS_USE);
    SqlNumericLiteral samplePercentLiteral = sqlAnalyzeTable.getSamplePercent();
    double samplePercent = samplePercentLiteral == null ? 100.0 : samplePercentLiteral.intValue(true);
    // Step 3: add rel nodes that produce the statistics analyze, if required
    RelNode analyzeRel = useStatistics
        ? new DrillAnalyzeRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(),
            convertToRawDrel(relNode), samplePercent)
        : convertToRawDrel(relBuilder.values(new String[] { "" }, "").build());
    MetadataControllerContext metadataControllerContext = MetadataControllerContext.builder()
        .tableInfo(tableInfo)
        .metastoreTableInfo(metastoreTableInfo)
        .location(((FormatSelection) table.getSelection()).getSelection().getSelectionRoot())
        .interestingColumns(interestingColumns)
        .segmentColumns(segmentColumns)
        .metadataToHandle(allMetaToHandle)
        .metadataToRemove(metadataToRemove)
        .analyzeMetadataLevel(metadataLevel)
        .build();
    convertedRelNode = new MetadataControllerRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode, analyzeRel, metadataControllerContext);
    return new DrillScreenRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode);
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) Arrays(java.util.Arrays) UserException(org.apache.drill.common.exceptions.UserException) DrillRel(org.apache.drill.exec.planner.logical.DrillRel) LoggerFactory(org.slf4j.LoggerFactory) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) MetadataHandlerContext(org.apache.drill.exec.metastore.analyze.MetadataHandlerContext) PhysicalOperator(org.apache.drill.exec.physical.base.PhysicalOperator) SqlNode(org.apache.calcite.sql.SqlNode) RelBuilder(org.apache.calcite.tools.RelBuilder) FieldReference(org.apache.drill.common.expression.FieldReference) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) SqlSelect(org.apache.calcite.sql.SqlSelect) BasicTablesRequests(org.apache.drill.metastore.components.tables.BasicTablesRequests) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) Collectors(java.util.stream.Collectors) List(java.util.List) ValidationException(org.apache.calcite.tools.ValidationException) ForemanSetupException(org.apache.drill.exec.work.foreman.ForemanSetupException) SqlNumericLiteral(org.apache.calcite.sql.SqlNumericLiteral) MetadataInfoCollector(org.apache.drill.exec.metastore.analyze.MetadataInfoCollector) ExecConstants(org.apache.drill.exec.ExecConstants) SqlMetastoreAnalyzeTable(org.apache.drill.exec.planner.sql.parser.SqlMetastoreAnalyzeTable) MetadataAggregateContext(org.apache.drill.exec.metastore.analyze.MetadataAggregateContext) TableScan(org.apache.calcite.rel.core.TableScan) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreException(org.apache.drill.metastore.exceptions.MetastoreException) MetadataControllerContext(org.apache.drill.exec.metastore.analyze.MetadataControllerContext) Pointer(org.apache.drill.exec.util.Pointer) DrillTable(org.apache.drill.exec.planner.logical.DrillTable) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) ArrayList(java.util.ArrayList) SqlLiteral(org.apache.calcite.sql.SqlLiteral) SqlUnsupportedException(org.apache.drill.exec.work.foreman.SqlUnsupportedException) NamedExpression(org.apache.drill.common.logical.data.NamedExpression) MetadataAggRel(org.apache.drill.exec.planner.logical.MetadataAggRel) DrillRelOptUtil(org.apache.drill.exec.planner.common.DrillRelOptUtil) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) SqlIdentifier(org.apache.calcite.sql.SqlIdentifier) MetadataHandlerRel(org.apache.drill.exec.planner.logical.MetadataHandlerRel) CheckedSupplier(org.apache.drill.common.util.function.CheckedSupplier) RelDataType(org.apache.calcite.rel.type.RelDataType) SqlParserPos(org.apache.calcite.sql.parser.SqlParserPos) Logger(org.slf4j.Logger) DrillAnalyzeRel(org.apache.drill.exec.planner.logical.DrillAnalyzeRel) IOException(java.io.IOException) RelNode(org.apache.calcite.rel.RelNode) Prel(org.apache.drill.exec.planner.physical.Prel) AbstractSchema(org.apache.drill.exec.store.AbstractSchema) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) RelConversionException(org.apache.calcite.tools.RelConversionException) Strings(org.apache.parquet.Strings) DrillScreenRel(org.apache.drill.exec.planner.logical.DrillScreenRel) PhysicalPlan(org.apache.drill.exec.physical.PhysicalPlan) LOGICAL_BUILDER(org.apache.drill.exec.planner.logical.DrillRelFactories.LOGICAL_BUILDER) SqlNodeList(org.apache.calcite.sql.SqlNodeList) MetadataControllerRel(org.apache.drill.exec.planner.logical.MetadataControllerRel) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) Collections(java.util.Collections) AnalyzeInfoProvider(org.apache.drill.exec.metastore.analyze.AnalyzeInfoProvider)
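
convertToDrel keeps segment metadata in a Multimap keyed by depth level and later drains one level at a time with new ArrayList<>(segments.get(i - 1)). Here is a small sketch of that access pattern; the segment names are hypothetical and the unshaded Guava coordinates are used so it compiles standalone.

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import java.util.ArrayList;
import java.util.List;

public class SegmentsByDepthDemo {
    public static void main(String[] args) {
        // Depth level (0-based) -> segment identifiers collected at that level.
        Multimap<Integer, String> segments = ArrayListMultimap.create();
        segments.put(0, "dir0=2020");
        segments.put(1, "dir0=2020/dir1=Q1");
        segments.put(1, "dir0=2020/dir1=Q2");

        // Walk levels from deepest to shallowest, as the handler above does;
        // get(key) returns an empty collection (never null) for missing keys.
        int depth = 2; // segmentExpressions.size() in the original
        for (int i = depth; i > 0; i--) {
            List<String> toHandle = new ArrayList<>(segments.get(i - 1));
            System.out.println("depth level " + i + ": " + toHandle);
        }
    }
}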

Example 4 with Multimap

Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.

From class MetadataControllerBatch, method getColumnStatistics:

private Map<SchemaPath, ColumnStatistics<?>> getColumnStatistics(TupleReader reader, TupleMetadata columnMetadata, Long rowCount) {
    Multimap<String, StatisticsHolder<?>> columnStatistics = ArrayListMultimap.create();
    Map<String, TypeProtos.MinorType> columnTypes = new HashMap<>();
    for (ColumnMetadata column : columnMetadata) {
        if (AnalyzeColumnUtils.isColumnStatisticsField(column.name())) {
            String fieldName = AnalyzeColumnUtils.getColumnName(column.name());
            StatisticsKind<?> statisticsKind = AnalyzeColumnUtils.getStatisticsKind(column.name());
            columnStatistics.put(fieldName,
                new StatisticsHolder<>(getConvertedColumnValue(reader.column(column.name())), statisticsKind));
            if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.MIN_VALUE.getName())
                    || statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.MAX_VALUE.getName())) {
                columnTypes.putIfAbsent(fieldName, column.type());
            }
        }
    }
    // derives NULLS_COUNT from the row count and NON_NULL_VALUES_COUNT so it
    // can be used during filter pushdown
    if (rowCount != null) {
        Map<String, StatisticsHolder<?>> nullsCountColumnStatistics = new HashMap<>();
        columnStatistics.asMap().forEach((key, value) -> value.stream()
            .filter(statisticsHolder -> statisticsHolder.getStatisticsKind() == ColumnStatisticsKind.NON_NULL_VALUES_COUNT)
            .findAny()
            .map(statisticsHolder -> (Long) statisticsHolder.getStatisticsValue())
            .ifPresent(nonNullCount -> nullsCountColumnStatistics.put(key,
                new StatisticsHolder<>(rowCount - nonNullCount, ColumnStatisticsKind.NULLS_COUNT))));
        nullsCountColumnStatistics.forEach(columnStatistics::put);
    }
    Map<SchemaPath, ColumnStatistics<?>> resultingStats = new HashMap<>();
    columnStatistics.asMap().forEach((fieldName, statisticsHolders) -> resultingStats.put(
        SchemaPath.parseFromString(fieldName),
        new ColumnStatistics<>(statisticsHolders, columnTypes.get(fieldName))));
    return resultingStats;
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) BaseStatisticsKind(org.apache.drill.metastore.statistics.BaseStatisticsKind) MetastoreColumn(org.apache.drill.metastore.MetastoreColumn) UserException(org.apache.drill.common.exceptions.UserException) LoggerFactory(org.slf4j.LoggerFactory) Types(org.apache.drill.common.types.Types) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) RowSetReader(org.apache.drill.exec.physical.rowSet.RowSetReader) VectorContainer(org.apache.drill.exec.record.VectorContainer) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) StringUtils(org.apache.commons.lang3.StringUtils) ArrayReader(org.apache.drill.exec.vector.accessor.ArrayReader) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) StatisticsRecordWriterImpl(org.apache.drill.exec.store.StatisticsRecordWriterImpl) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) Map(java.util.Map) FieldConverter(org.apache.drill.exec.store.EventBasedRecordWriter.FieldConverter) Path(org.apache.hadoop.fs.Path) BatchSchema(org.apache.drill.exec.record.BatchSchema) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Delete(org.apache.drill.metastore.operate.Delete) TableMetadataUnit(org.apache.drill.metastore.components.tables.TableMetadataUnit) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordBatch(org.apache.drill.exec.record.RecordBatch) Set(java.util.Set) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) Collectors(java.util.stream.Collectors) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) FieldReader(org.apache.drill.exec.vector.complex.reader.FieldReader) TypeProtos(org.apache.drill.common.types.TypeProtos) List(java.util.List) AbstractBinaryRecordBatch(org.apache.drill.exec.record.AbstractBinaryRecordBatch) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) ObjectReader(org.apache.drill.exec.vector.accessor.ObjectReader) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetadataIdentifierUtils(org.apache.drill.exec.metastore.analyze.MetadataIdentifierUtils) TupleReader(org.apache.drill.exec.vector.accessor.TupleReader) Modify(org.apache.drill.metastore.operate.Modify) MetadataControllerContext(org.apache.drill.exec.metastore.analyze.MetadataControllerContext) HashMap(java.util.HashMap) BitVector(org.apache.drill.exec.vector.BitVector) Function(java.util.function.Function) VectorWrapper(org.apache.drill.exec.record.VectorWrapper) ArrayList(java.util.ArrayList) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) HashSet(java.util.HashSet) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) DirectRowSet(org.apache.drill.exec.physical.rowSet.DirectRowSet) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) WriterPrel(org.apache.drill.exec.planner.physical.WriterPrel) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FragmentContext(org.apache.drill.exec.ops.FragmentContext) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) StatisticsRecordCollector(org.apache.drill.exec.store.StatisticsRecordCollector) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) IOException(java.io.IOException) FilterExpression(org.apache.drill.metastore.expressions.FilterExpression) StatisticsCollectorImpl(org.apache.drill.exec.store.easy.json.StatisticsCollectorImpl) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) ParquetTableMetadataUtils(org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils) VarCharVector(org.apache.drill.exec.vector.VarCharVector) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataControllerPOP(org.apache.drill.exec.physical.config.MetadataControllerPOP) Tables(org.apache.drill.metastore.components.tables.Tables) Collections(java.util.Collections) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) ObjectType(org.apache.drill.exec.vector.accessor.ObjectType)
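
getColumnStatistics leans on Multimap.asMap(), which exposes the multimap as a Map<K, Collection<V>> view so that all statistics gathered for one column can be processed together. A minimal sketch of that view, with plain strings standing in for StatisticsHolder values and unshaded Guava so it compiles standalone:

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

public class StatsPerColumnDemo {
    public static void main(String[] args) {
        // Column name -> the individual statistics collected for that column.
        Multimap<String, String> columnStatistics = ArrayListMultimap.create();
        columnStatistics.put("id", "minValue=1");
        columnStatistics.put("id", "maxValue=100");
        columnStatistics.put("name", "nonNullValuesCount=42");

        // asMap() groups the flat entries by key, which is exactly what the
        // batch above uses to build one ColumnStatistics object per column.
        columnStatistics.asMap().forEach((column, stats) ->
            System.out.println(column + " -> " + stats));
        // id -> [minValue=1, maxValue=100]
        // name -> [nonNullValuesCount=42]
    }
}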

Example 5 with Multimap

Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.

From class ConvertMetadataAggregateToDirectScanRule, method populateRecords:

/**
 * Populates the records list with row group metadata.
 */
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns, Map<String, Class<?>> schema, DrillScanRel scan, ColumnNamesOptions columnNamesOptions) throws IOException {
    ParquetGroupScan parquetGroupScan = (ParquetGroupScan) scan.getGroupScan();
    DrillTable drillTable = Utilities.getDrillTable(scan.getTable());
    Multimap<Path, RowGroupMetadata> rowGroupsMetadataMap = parquetGroupScan.getMetadataProvider().getRowGroupsMetadataMap();
    Table<String, Integer, Object> recordsTable = HashBasedTable.create();
    FormatSelection selection = (FormatSelection) drillTable.getSelection();
    List<String> partitionColumnNames = ColumnExplorer.getPartitionColumnNames(selection.getSelection(), columnNamesOptions);
    FileSystem rawFs = selection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
    DrillFileSystem fileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), rawFs.getConf());
    int rowIndex = 0;
    for (Map.Entry<Path, RowGroupMetadata> rgEntry : rowGroupsMetadataMap.entries()) {
        Path path = rgEntry.getKey();
        RowGroupMetadata rowGroupMetadata = rgEntry.getValue();
        List<String> partitionValues = ColumnExplorer.listPartitionValues(path, selection.getSelection().getSelectionRoot(), false);
        for (int i = 0; i < partitionValues.size(); i++) {
            String partitionColumnName = partitionColumnNames.get(i);
            recordsTable.put(partitionColumnName, rowIndex, partitionValues.get(i));
        }
        recordsTable.put(MetastoreAnalyzeConstants.LOCATION_FIELD, rowIndex, ImplicitFileColumns.FQN.getValue(path));
        recordsTable.put(columnNamesOptions.rowGroupIndex(), rowIndex, String.valueOf(rowGroupMetadata.getRowGroupIndex()));
        if (interestingColumns == null) {
            interestingColumns = rowGroupMetadata.getColumnsStatistics().keySet();
        }
        // populates record list with row group column metadata
        for (SchemaPath schemaPath : interestingColumns) {
            ColumnStatistics<?> columnStatistics = rowGroupMetadata.getColumnsStatistics().get(schemaPath);
            // do not gather statistics for array columns since they are not supported by the Metastore
            if (containsArrayColumn(rowGroupMetadata.getSchema(), schemaPath)) {
                continue;
            }
            if (IsPredicate.isNullOrEmpty(columnStatistics)) {
                logger.debug("Statistics for {} column wasn't found within {} row group.", schemaPath, path);
                return null;
            }
            for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
                Object statsValue;
                if (statisticsKind.getName().equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
                    statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata);
                } else if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
                    statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata) - ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStatistics);
                } else {
                    statsValue = columnStatistics.get(statisticsKind);
                }
                String columnStatisticsFieldName = AnalyzeColumnUtils.getColumnStatisticsFieldName(schemaPath.toExpr(), statisticsKind);
                if (statsValue != null) {
                    schema.putIfAbsent(columnStatisticsFieldName, statsValue.getClass());
                    recordsTable.put(columnStatisticsFieldName, rowIndex, statsValue);
                } else {
                    recordsTable.put(columnStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
                }
            }
        }
        // populates record list with row group metadata
        for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
            String metadataStatisticsFieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(statisticsKind);
            Object statisticsValue = rowGroupMetadata.getStatistic(statisticsKind);
            if (statisticsValue != null) {
                schema.putIfAbsent(metadataStatisticsFieldName, statisticsValue.getClass());
                recordsTable.put(metadataStatisticsFieldName, rowIndex, statisticsValue);
            } else {
                recordsTable.put(metadataStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
            }
        }
        // populates the record list with internal columns
        recordsTable.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, rowIndex, rowGroupMetadata.getSchema().jsonString());
        recordsTable.put(columnNamesOptions.rowGroupStart(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START)));
        recordsTable.put(columnNamesOptions.rowGroupLength(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
        recordsTable.put(columnNamesOptions.lastModifiedTime(), rowIndex, String.valueOf(fileSystem.getFileStatus(path).getModificationTime()));
        rowIndex++;
    }
    // DynamicPojoRecordReader requires a LinkedHashMap whose field order
    // corresponds to the value positions in the record list.
    LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
    for (String s : recordsTable.rowKeySet()) {
        Class<?> clazz = schema.get(s);
        if (clazz != null) {
            orderedSchema.put(s, clazz);
        } else {
            return null;
        }
    }
    IntFunction<List<Object>> collectRecord = currentIndex -> orderedSchema.keySet().stream()
        .map(column -> recordsTable.get(column, currentIndex))
        .map(value -> value != BaseParquetMetadataProvider.NULL_VALUE ? value : null)
        .collect(Collectors.toList());
    List<List<Object>> records = IntStream.range(0, rowIndex)
        .mapToObj(collectRecord)
        .collect(Collectors.toList());
    DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
    ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
    return new DirectGroupScan(reader, scanStats);
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) FileSystem(org.apache.hadoop.fs.FileSystem) IsPredicate(org.apache.drill.exec.expr.IsPredicate) LoggerFactory(org.slf4j.LoggerFactory) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) DictColumnMetadata(org.apache.drill.exec.record.metadata.DictColumnMetadata) PathSegment(org.apache.drill.common.expression.PathSegment) Utilities(org.apache.drill.exec.util.Utilities) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) BaseParquetMetadataProvider(org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) ImplicitFileColumns(org.apache.drill.exec.store.ColumnExplorer.ImplicitFileColumns) Collectors(java.util.stream.Collectors) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) List(java.util.List) MetadataAggregateContext(org.apache.drill.exec.metastore.analyze.MetadataAggregateContext) IntStream(java.util.stream.IntStream) Table(org.apache.drill.shaded.guava.com.google.common.collect.Table) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) Function(java.util.function.Function) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) LinkedHashMap(java.util.LinkedHashMap) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) ParquetGroupScan(org.apache.drill.exec.store.parquet.ParquetGroupScan) IntFunction(java.util.function.IntFunction) PrelUtil(org.apache.drill.exec.planner.physical.PrelUtil) Logger(org.slf4j.Logger) ScanStats(org.apache.drill.exec.physical.base.ScanStats) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) HashBasedTable(org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable) IOException(java.io.IOException) RelNode(org.apache.calcite.rel.RelNode) RelOptRuleCall(org.apache.calcite.plan.RelOptRuleCall) DirectGroupScan(org.apache.drill.exec.store.direct.DirectGroupScan) RelOptRule(org.apache.calcite.plan.RelOptRule) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) GroupScan(org.apache.drill.exec.physical.base.GroupScan) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils)
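
populateRecords walks every (file, row group) pair via Multimap.entries() and flattens each pair into one record of a Guava Table keyed by (field name, record index). A compact sketch of that combination follows; the paths and field names are hypothetical stand-ins for the Drill metadata types, and unshaded Guava is used so it compiles standalone.

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Table;
import java.util.Map;

public class RowGroupRecordsDemo {
    public static void main(String[] args) {
        // File path -> row group indices; one file usually holds several row groups.
        Multimap<String, Integer> rowGroupsMetadataMap = LinkedListMultimap.create();
        rowGroupsMetadataMap.put("/data/a.parquet", 0);
        rowGroupsMetadataMap.put("/data/a.parquet", 1);
        rowGroupsMetadataMap.put("/data/b.parquet", 0);

        // Flatten each (file, row group) entry into one record of the table,
        // mirroring recordsTable.put(fieldName, rowIndex, value) above.
        Table<String, Integer, Object> recordsTable = HashBasedTable.create();
        int rowIndex = 0;
        for (Map.Entry<String, Integer> entry : rowGroupsMetadataMap.entries()) {
            recordsTable.put("location", rowIndex, entry.getKey());
            recordsTable.put("rowGroupIndex", rowIndex, entry.getValue());
            rowIndex++;
        }

        // Three records were produced, one per row group.
        System.out.println(recordsTable.row("location"));
        System.out.println(recordsTable.row("rowGroupIndex"));
    }
}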

Aggregations

Multimap (org.apache.drill.shaded.guava.com.google.common.collect.Multimap): 7 usages
IOException (java.io.IOException): 6 usages
List (java.util.List): 6 usages
ArrayList (java.util.ArrayList): 5 usages
Map (java.util.Map): 5 usages
Collectors (java.util.stream.Collectors): 5 usages
SchemaPath (org.apache.drill.common.expression.SchemaPath): 5 usages
PlannerSettings (org.apache.drill.exec.planner.physical.PlannerSettings): 5 usages
MetadataType (org.apache.drill.metastore.metadata.MetadataType): 5 usages
ArrayListMultimap (org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap): 5 usages
Logger (org.slf4j.Logger): 5 usages
LoggerFactory (org.slf4j.LoggerFactory): 5 usages
Collection (java.util.Collection): 4 usages
Collections (java.util.Collections): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
HashSet (java.util.HashSet): 3 usages
Function (java.util.function.Function): 3 usages
TableStatisticsKind (org.apache.drill.metastore.statistics.TableStatisticsKind): 3 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 3 usages
Arrays (java.util.Arrays): 2 usages
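
The examples above use two Multimap implementations with different ordering guarantees: ArrayListMultimap (hash-ordered keys, insertion-ordered values per key) and LinkedListMultimap (globally insertion-ordered entries). The sketch below, again with unshaded Guava, shows the difference that motivates picking LinkedListMultimap when stable entry order matters, as in AbstractParquetGroupScan:

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.Multimap;
import java.util.List;

public class MultimapFlavorsDemo {
    public static void main(String[] args) {
        Multimap<String, Integer> hashBacked = ArrayListMultimap.create();
        Multimap<String, Integer> insertionOrdered = LinkedListMultimap.create();

        for (Multimap<String, Integer> multimap : List.of(hashBacked, insertionOrdered)) {
            multimap.put("b", 2);
            multimap.put("a", 1);
            multimap.put("b", 3);
            // LinkedListMultimap prints [b=2, a=1, b=3]; ArrayListMultimap
            // groups values per key and its key order is unspecified.
            System.out.println(multimap.getClass().getSimpleName() + ": " + multimap.entries());
        }
    }
}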