Example 16 with FormatSelection

use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.

the class ConvertMetadataAggregateToDirectScanRule method populateRecords.

/**
 * Populates records list with row group metadata.
 */
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns, Map<String, Class<?>> schema, DrillScanRel scan, ColumnNamesOptions columnNamesOptions) throws IOException {
    ParquetGroupScan parquetGroupScan = (ParquetGroupScan) scan.getGroupScan();
    DrillTable drillTable = Utilities.getDrillTable(scan.getTable());
    Multimap<Path, RowGroupMetadata> rowGroupsMetadataMap = parquetGroupScan.getMetadataProvider().getRowGroupsMetadataMap();
    Table<String, Integer, Object> recordsTable = HashBasedTable.create();
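    // recordsTable is keyed by (column name, row index); each row group below fills one row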
    FormatSelection selection = (FormatSelection) drillTable.getSelection();
    List<String> partitionColumnNames = ColumnExplorer.getPartitionColumnNames(selection.getSelection(), columnNamesOptions);
    FileSystem rawFs = selection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
    DrillFileSystem fileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), rawFs.getConf());
    int rowIndex = 0;
    for (Map.Entry<Path, RowGroupMetadata> rgEntry : rowGroupsMetadataMap.entries()) {
        Path path = rgEntry.getKey();
        RowGroupMetadata rowGroupMetadata = rgEntry.getValue();
        List<String> partitionValues = ColumnExplorer.listPartitionValues(path, selection.getSelection().getSelectionRoot(), false);
        for (int i = 0; i < partitionValues.size(); i++) {
            String partitionColumnName = partitionColumnNames.get(i);
            recordsTable.put(partitionColumnName, rowIndex, partitionValues.get(i));
        }
        recordsTable.put(MetastoreAnalyzeConstants.LOCATION_FIELD, rowIndex, ImplicitFileColumns.FQN.getValue(path));
        recordsTable.put(columnNamesOptions.rowGroupIndex(), rowIndex, String.valueOf(rowGroupMetadata.getRowGroupIndex()));
        if (interestingColumns == null) {
            interestingColumns = rowGroupMetadata.getColumnsStatistics().keySet();
        }
        // populates record list with row group column metadata
        for (SchemaPath schemaPath : interestingColumns) {
            ColumnStatistics<?> columnStatistics = rowGroupMetadata.getColumnsStatistics().get(schemaPath);
            // do not gather statistics for array columns as it is not supported by Metastore
            if (containsArrayColumn(rowGroupMetadata.getSchema(), schemaPath)) {
                continue;
            }
            if (IsPredicate.isNullOrEmpty(columnStatistics)) {
                logger.debug("Statistics for {} column wasn't found within {} row group.", schemaPath, path);
                return null;
            }
            for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
                Object statsValue;
                if (statisticsKind.getName().equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
                    statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata);
                } else if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
                    statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata) - ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStatistics);
                } else {
                    statsValue = columnStatistics.get(statisticsKind);
                }
                String columnStatisticsFieldName = AnalyzeColumnUtils.getColumnStatisticsFieldName(schemaPath.toExpr(), statisticsKind);
                if (statsValue != null) {
                    schema.putIfAbsent(columnStatisticsFieldName, statsValue.getClass());
                    recordsTable.put(columnStatisticsFieldName, rowIndex, statsValue);
                } else {
                    recordsTable.put(columnStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
                }
            }
        }
        // populates record list with row group metadata
        for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
            String metadataStatisticsFieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(statisticsKind);
            Object statisticsValue = rowGroupMetadata.getStatistic(statisticsKind);
            if (statisticsValue != null) {
                schema.putIfAbsent(metadataStatisticsFieldName, statisticsValue.getClass());
                recordsTable.put(metadataStatisticsFieldName, rowIndex, statisticsValue);
            } else {
                recordsTable.put(metadataStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
            }
        }
        // populates record list internal columns
        recordsTable.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, rowIndex, rowGroupMetadata.getSchema().jsonString());
        recordsTable.put(columnNamesOptions.rowGroupStart(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START)));
        recordsTable.put(columnNamesOptions.rowGroupLength(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
        recordsTable.put(columnNamesOptions.lastModifiedTime(), rowIndex, String.valueOf(fileSystem.getFileStatus(path).getModificationTime()));
        rowIndex++;
    }
    // DynamicPojoRecordReader requires a LinkedHashMap whose field order
    // corresponds to the value positions in the record list.
    LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
    for (String s : recordsTable.rowKeySet()) {
        Class<?> clazz = schema.get(s);
        if (clazz != null) {
            orderedSchema.put(s, clazz);
        } else {
            return null;
        }
    }
    IntFunction<List<Object>> collectRecord = currentIndex -> orderedSchema.keySet().stream().map(column -> recordsTable.get(column, currentIndex)).map(value -> value != BaseParquetMetadataProvider.NULL_VALUE ? value : null).collect(Collectors.toList());
    List<List<Object>> records = IntStream.range(0, rowIndex).mapToObj(collectRecord).collect(Collectors.toList());
    DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
    ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
    return new DirectGroupScan(reader, scanStats);
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) FileSystem(org.apache.hadoop.fs.FileSystem) IsPredicate(org.apache.drill.exec.expr.IsPredicate) LoggerFactory(org.slf4j.LoggerFactory) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) DictColumnMetadata(org.apache.drill.exec.record.metadata.DictColumnMetadata) PathSegment(org.apache.drill.common.expression.PathSegment) Utilities(org.apache.drill.exec.util.Utilities) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) BaseParquetMetadataProvider(org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) ImplicitFileColumns(org.apache.drill.exec.store.ColumnExplorer.ImplicitFileColumns) Collectors(java.util.stream.Collectors) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) List(java.util.List) MetadataAggregateContext(org.apache.drill.exec.metastore.analyze.MetadataAggregateContext) IntStream(java.util.stream.IntStream) Table(org.apache.drill.shaded.guava.com.google.common.collect.Table) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) Function(java.util.function.Function) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) LinkedHashMap(java.util.LinkedHashMap) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) ParquetGroupScan(org.apache.drill.exec.store.parquet.ParquetGroupScan) IntFunction(java.util.function.IntFunction) PrelUtil(org.apache.drill.exec.planner.physical.PrelUtil) Logger(org.slf4j.Logger) ScanStats(org.apache.drill.exec.physical.base.ScanStats) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) HashBasedTable(org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable) IOException(java.io.IOException) RelNode(org.apache.calcite.rel.RelNode) RelOptRuleCall(org.apache.calcite.plan.RelOptRuleCall) DirectGroupScan(org.apache.drill.exec.store.direct.DirectGroupScan) RelOptRule(org.apache.calcite.plan.RelOptRule) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) GroupScan(org.apache.drill.exec.physical.base.GroupScan) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils)
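For readers following along, the trickiest step above is the pivot from the (column name, row index) table into the ordered record list that DynamicPojoRecordReader consumes. Below is a minimal, self-contained sketch of just that pivot, using plain Guava rather than Drill's shaded copy; the column names and values are invented for illustration:

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.function.IntFunction;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class RecordsTableSketch {
    public static void main(String[] args) {
        // (column name, row index) -> cell value, as in populateRecords above
        Table<String, Integer, Object> recordsTable = HashBasedTable.create();
        recordsTable.put("location", 0, "/data/a.parquet");
        recordsTable.put("rowCount", 0, 100L);
        recordsTable.put("location", 1, "/data/b.parquet");
        recordsTable.put("rowCount", 1, 200L);
        // LinkedHashMap fixes the column order; value positions in each record follow it
        LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
        orderedSchema.put("location", String.class);
        orderedSchema.put("rowCount", Long.class);
        IntFunction<List<Object>> collectRecord = i -> orderedSchema.keySet().stream()
                .map(column -> recordsTable.get(column, i))
                .collect(Collectors.toList());
        List<List<Object>> records = IntStream.range(0, 2)
                .mapToObj(collectRecord)
                .collect(Collectors.toList());
        System.out.println(records); // [[/data/a.parquet, 100], [/data/b.parquet, 200]]
    }
}

The LinkedHashMap is essential here: DynamicPojoRecordReader maps each record's values to fields positionally, so the schema's iteration order must match the value order in every record.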

Example 17 with FormatSelection

use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.

the class ConvertCountToDirectScanRule method onMatch.

@Override
public void onMatch(RelOptRuleCall call) {
    final Aggregate agg = call.rel(0);
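    // call.rels holds (Aggregate, Project, TableScan) when a Project is present, otherwise (Aggregate, TableScan)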
    final TableScan scan = call.rel(call.rels.length - 1);
    final Project project = call.rels.length == 3 ? (Project) call.rel(1) : null;
    // The rule applies only when the aggregate has no GROUP BY keys and no DISTINCT
    // aggregate calls; additional checks are done further below.
    if (agg.getGroupCount() > 0 || agg.containsDistinctCall()) {
        return;
    }
    DrillTable drillTable = DrillRelOptUtil.getDrillTable(scan);
    if (drillTable == null) {
        logger.debug("Rule does not apply since an eligible drill table instance was not found.");
        return;
    }
    Object selection = drillTable.getSelection();
    if (!(selection instanceof FormatSelection)) {
        logger.debug("Rule does not apply since only Parquet file format is eligible.");
        return;
    }
    PlannerSettings settings = call.getPlanner().getContext().unwrap(PlannerSettings.class);
    // The rule is applicable only if the statistics for row count and null count are available from the metadata.
    FormatSelection formatSelection = (FormatSelection) selection;
    // The rule cannot be applied if the selection had a wildcard, since the total row count cannot be read from the parent directory.
    if (formatSelection.getSelection().hadWildcard()) {
        logger.debug("Rule does not apply when there is a wild card since the COUNT could not be determined from metadata.");
        return;
    }
    Pair<Boolean, Metadata_V4.MetadataSummary> status = checkMetadataForScanStats(settings, drillTable, formatSelection);
    if (!status.getLeft()) {
        logger.debug("Rule does not apply since MetadataSummary metadata was not found.");
        return;
    }
    Metadata_V4.MetadataSummary metadataSummary = status.getRight();
    Map<String, Long> result = collectCounts(settings, metadataSummary, agg, scan, project);
    logger.trace("Calculated the following aggregate counts: {}", result);
    // if counts could not be determined, rule won't be applied
    if (result.isEmpty()) {
        logger.debug("Rule does not apply since one or more COUNTs could not be determined from metadata.");
        return;
    }
    Path summaryFileName = Metadata.getSummaryFileName(formatSelection.getSelection().getSelectionRoot());
    final RelDataType scanRowType = CountToDirectScanUtils.constructDataType(agg, result.keySet());
    final DynamicPojoRecordReader<Long> reader = new DynamicPojoRecordReader<>(CountToDirectScanUtils.buildSchema(scanRowType.getFieldNames()), Collections.singletonList(new ArrayList<>(result.values())));
    final ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, 1, 1, scanRowType.getFieldCount());
    final MetadataDirectGroupScan directScan = new MetadataDirectGroupScan(reader, summaryFileName, 1, scanStats, true, false);
    final DrillDirectScanRel newScan = new DrillDirectScanRel(scan.getCluster(), scan.getTraitSet().plus(DrillRel.DRILL_LOGICAL), directScan, scanRowType);
    final DrillProjectRel newProject = new DrillProjectRel(agg.getCluster(), agg.getTraitSet().plus(DrillRel.DRILL_LOGICAL), newScan, CountToDirectScanUtils.prepareFieldExpressions(scanRowType), agg.getRowType());
    call.transformTo(newProject);
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) TableScan(org.apache.calcite.rel.core.TableScan) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) ArrayList(java.util.ArrayList) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) RelDataType(org.apache.calcite.rel.type.RelDataType) Project(org.apache.calcite.rel.core.Project) MetadataDirectGroupScan(org.apache.drill.exec.store.direct.MetadataDirectGroupScan) Metadata_V4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4) Aggregate(org.apache.calcite.rel.core.Aggregate) ScanStats(org.apache.drill.exec.physical.base.ScanStats)
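The effect of the rewrite is to replace the scan-plus-aggregate with a single-row reader carrying the pre-computed counts. Here is a minimal sketch of that packing, using only the DynamicPojoRecordReader constructor shape visible in these examples; the field names and counts are hypothetical, and Drill must be on the classpath:

import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import org.apache.drill.exec.store.pojo.DynamicPojoRecordReader;

public class CountReaderSketch {
    public static DynamicPojoRecordReader<Long> buildCountReader() {
        // one column per rewritten COUNT expression, all of type Long
        LinkedHashMap<String, Class<?>> schema = new LinkedHashMap<>();
        schema.put("count_star", Long.class);
        schema.put("count_name", Long.class);
        // a single record carrying the counts taken from the metadata summary
        List<List<Long>> records = Collections.singletonList(Arrays.asList(42L, 40L));
        return new DynamicPojoRecordReader<>(schema, records);
    }
}

In the rule itself, the schema is derived from the aggregate's COUNT expressions via CountToDirectScanUtils rather than hard-coded as in this sketch.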

Example 18 with FormatSelection

use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.

the class RefreshMetadataHandler method getPlan.

@Override
public PhysicalPlan getPlan(SqlNode sqlNode) throws ForemanSetupException {
    final SqlRefreshMetadata refreshTable = unwrap(sqlNode, SqlRefreshMetadata.class);
    try {
        final SchemaPlus schema = findSchema(config.getConverter().getDefaultSchema(), refreshTable.getSchemaPath());
        if (schema == null) {
            return direct(false, "Storage plugin or workspace does not exist [%s]", SchemaUtilites.SCHEMA_PATH_JOINER.join(refreshTable.getSchemaPath()));
        }
        final String tableName = refreshTable.getName();
        final SqlNodeList columnList = getColumnList(refreshTable);
        final Set<SchemaPath> columnSet = getColumnRootSegments(columnList);
        final SqlLiteral allColumns = refreshTable.getAllColumns();
        if (tableName.contains("*") || tableName.contains("?")) {
            return direct(false, "Glob path %s not supported for metadata refresh", tableName);
        }
        final Table table = schema.getTable(tableName);
        if (table == null) {
            return direct(false, "Table %s does not exist.", tableName);
        }
        if (!(table instanceof DrillTable)) {
            return notSupported(tableName);
        }
        final DrillTable drillTable = (DrillTable) table;
        final Object selection = drillTable.getSelection();
        if (selection instanceof FileSelection && ((FileSelection) selection).isEmptyDirectory()) {
            return direct(false, "Table %s is empty and doesn't contain any parquet files.", tableName);
        }
        if (!(selection instanceof FormatSelection)) {
            return notSupported(tableName);
        }
        final FormatSelection formatSelection = (FormatSelection) selection;
        FormatPluginConfig formatConfig = formatSelection.getFormat();
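        // Accept either a plain Parquet format config or a named format config that resolves to "parquet"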
        if (!((formatConfig instanceof ParquetFormatConfig) || ((formatConfig instanceof NamedFormatPluginConfig) && ((NamedFormatPluginConfig) formatConfig).getName().equals("parquet")))) {
            return notSupported(tableName);
        }
        // Always create filesystem object using process user, since it owns the metadata file
        final DrillFileSystem fs = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), drillTable.getPlugin().getFormatPlugin(formatConfig).getFsConf());
        final Path selectionRoot = formatSelection.getSelection().getSelectionRoot();
        if (!fs.getFileStatus(selectionRoot).isDirectory()) {
            return notSupported(tableName);
        }
        if (!(formatConfig instanceof ParquetFormatConfig)) {
            formatConfig = new ParquetFormatConfig();
        }
        final ParquetReaderConfig readerConfig = ParquetReaderConfig.builder().withFormatConfig((ParquetFormatConfig) formatConfig).withOptions(context.getOptions()).build();
        Metadata.createMeta(fs, selectionRoot, readerConfig, allColumns.booleanValue(), columnSet);
        return direct(true, "Successfully updated metadata for table %s.", tableName);
    } catch (Exception e) {
        logger.error("Failed to update metadata for table '{}'", refreshTable.getName(), e);
        return DirectPlan.createDirectPlan(context, false, String.format("Error: %s", e.getMessage()));
    }
}
Also used : FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) DrillTable(org.apache.drill.exec.planner.logical.DrillTable) Table(org.apache.calcite.schema.Table) SchemaPlus(org.apache.calcite.schema.SchemaPlus) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) SqlRefreshMetadata(org.apache.drill.exec.planner.sql.parser.SqlRefreshMetadata) ForemanSetupException(org.apache.drill.exec.work.foreman.ForemanSetupException) NamedFormatPluginConfig(org.apache.drill.exec.store.dfs.NamedFormatPluginConfig) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) FormatPluginConfig(org.apache.drill.common.logical.FormatPluginConfig) SqlNodeList(org.apache.calcite.sql.SqlNodeList) ParquetFormatConfig(org.apache.drill.exec.store.parquet.ParquetFormatConfig) SqlLiteral(org.apache.calcite.sql.SqlLiteral) ParquetReaderConfig(org.apache.drill.exec.store.parquet.ParquetReaderConfig)
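For context, this handler backs Drill's REFRESH TABLE METADATA command; running, say, REFRESH TABLE METADATA dfs.tmp.`sales` (a hypothetical table) walks this code path and regenerates the Parquet metadata cache under the table's selection root, while the COLUMNS clause restricts the refresh to the listed columns via the columnSet built above.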

Example 19 with FormatSelection

use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.

the class IcebergFormatMatcher method isReadable.

@Override
public DrillTable isReadable(DrillFileSystem fs, FileSelection selection, FileSystemPlugin fsPlugin, String storageEngineName, SchemaConfig schemaConfig) throws IOException {
    Path selectionRoot = selection.getSelectionRoot();
    Path metaDir = new Path(selectionRoot, METADATA_DIR_NAME);
    if (fs.isDirectory(selectionRoot) && fs.exists(metaDir) && fs.isDirectory(metaDir)) {
        FormatSelection formatSelection = new FormatSelection(formatPlugin.getConfig(), selection);
        return new PluginDrillTable(fsPlugin, storageEngineName, schemaConfig.getUserName(), formatSelection, formatPlugin.getConvention());
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) PluginDrillTable(org.apache.drill.exec.store.plan.rel.PluginDrillTable)
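Note the design choice: rather than matching on file extensions as most format matchers do, this matcher identifies an Iceberg table purely by layout, i.e. the selection root is a directory containing Iceberg's metadata subdirectory (METADATA_DIR_NAME). The whole directory is then claimed as a single table via the FormatSelection handed to PluginDrillTable.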

Example 20 with FormatSelection

use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.

the class AnalyzeFileInfoProvider method getSegmentColumns.

@Override
public List<SchemaPath> getSegmentColumns(DrillTable table, ColumnNamesOptions columnNamesOptions) throws IOException {
    FormatSelection selection = (FormatSelection) table.getSelection();
    FileSelection fileSelection = selection.getSelection();
    if (!fileSelection.isExpandedFully()) {
        fileSelection = FileMetadataInfoCollector.getExpandedFileSelection(fileSelection);
    }
    return ColumnExplorer.getPartitionColumnNames(fileSelection, columnNamesOptions).stream().map(SchemaPath::getSimplePath).collect(Collectors.toList());
}
Also used : FileSelection(org.apache.drill.exec.store.dfs.FileSelection) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection)
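For a concrete illustration: with a selection root of /data/sales and files under paths like /data/sales/2023/01/0_0_0.parquet (a hypothetical layout), the two directory levels surface as Drill's implicit partition columns, so this method would return the simple schema paths dir0 and dir1, assuming the default partition column label.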

Aggregations

FormatSelection (org.apache.drill.exec.store.dfs.FormatSelection) 24
Path (org.apache.hadoop.fs.Path) 14
SchemaPath (org.apache.drill.common.expression.SchemaPath) 12
FileSelection (org.apache.drill.exec.store.dfs.FileSelection) 12
PlannerSettings (org.apache.drill.exec.planner.physical.PlannerSettings) 8
DrillScanRel (org.apache.drill.exec.planner.logical.DrillScanRel) 7
DrillTable (org.apache.drill.exec.planner.logical.DrillTable) 7
DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem) 7
IOException (java.io.IOException) 6
ArrayList (java.util.ArrayList) 6
RelNode (org.apache.calcite.rel.RelNode) 6
DynamicDrillTable (org.apache.drill.exec.planner.logical.DynamicDrillTable) 6
Collection (java.util.Collection) 5
List (java.util.List) 5
Collectors (java.util.stream.Collectors) 5
MetadataType (org.apache.drill.metastore.metadata.MetadataType) 5
Multimap (org.apache.drill.shaded.guava.com.google.common.collect.Multimap) 5
Map (java.util.Map) 4
EnumerableTableScan (org.apache.calcite.adapter.enumerable.EnumerableTableScan) 4
FormatPluginConfig (org.apache.drill.common.logical.FormatPluginConfig) 4