Search in sources :

Example 6 with MetadataSummary

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary in project drill by apache.

the class ConvertCountToDirectScanRule method onMatch.

@Override
public void onMatch(RelOptRuleCall call) {
    final Aggregate agg = call.rel(0);
    final TableScan scan = call.rel(call.rels.length - 1);
    final Project project = call.rels.length == 3 ? (Project) call.rel(1) : null;
    // 3) Additional checks are done further below ..
    if (agg.getGroupCount() > 0 || agg.containsDistinctCall()) {
        return;
    }
    DrillTable drillTable = DrillRelOptUtil.getDrillTable(scan);
    if (drillTable == null) {
        logger.debug("Rule does not apply since an eligible drill table instance was not found.");
        return;
    }
    Object selection = drillTable.getSelection();
    if (!(selection instanceof FormatSelection)) {
        logger.debug("Rule does not apply since only Parquet file format is eligible.");
        return;
    }
    PlannerSettings settings = call.getPlanner().getContext().unwrap(PlannerSettings.class);
    // Rule is applicable only if the statistics for row count and null count are available from the metadata,
    FormatSelection formatSelection = (FormatSelection) selection;
    // Rule cannot be applied if the selection had wildcard since the totalrowcount cannot be read from the parent directory
    if (formatSelection.getSelection().hadWildcard()) {
        logger.debug("Rule does not apply when there is a wild card since the COUNT could not be determined from metadata.");
        return;
    }
    Pair<Boolean, Metadata_V4.MetadataSummary> status = checkMetadataForScanStats(settings, drillTable, formatSelection);
    if (!status.getLeft()) {
        logger.debug("Rule does not apply since MetadataSummary metadata was not found.");
        return;
    }
    Metadata_V4.MetadataSummary metadataSummary = status.getRight();
    Map<String, Long> result = collectCounts(settings, metadataSummary, agg, scan, project);
    logger.trace("Calculated the following aggregate counts: {}", result);
    // if counts could not be determined, rule won't be applied
    if (result.isEmpty()) {
        logger.debug("Rule does not apply since one or more COUNTs could not be determined from metadata.");
        return;
    }
    Path summaryFileName = Metadata.getSummaryFileName(formatSelection.getSelection().getSelectionRoot());
    final RelDataType scanRowType = CountToDirectScanUtils.constructDataType(agg, result.keySet());
    final DynamicPojoRecordReader<Long> reader = new DynamicPojoRecordReader<>(CountToDirectScanUtils.buildSchema(scanRowType.getFieldNames()), Collections.singletonList(new ArrayList<>(result.values())));
    final ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, 1, 1, scanRowType.getFieldCount());
    final MetadataDirectGroupScan directScan = new MetadataDirectGroupScan(reader, summaryFileName, 1, scanStats, true, false);
    final DrillDirectScanRel newScan = new DrillDirectScanRel(scan.getCluster(), scan.getTraitSet().plus(DrillRel.DRILL_LOGICAL), directScan, scanRowType);
    final DrillProjectRel newProject = new DrillProjectRel(agg.getCluster(), agg.getTraitSet().plus(DrillRel.DRILL_LOGICAL), newScan, CountToDirectScanUtils.prepareFieldExpressions(scanRowType), agg.getRowType());
    call.transformTo(newProject);
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) TableScan(org.apache.calcite.rel.core.TableScan) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) ArrayList(java.util.ArrayList) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) RelDataType(org.apache.calcite.rel.type.RelDataType) Project(org.apache.calcite.rel.core.Project) MetadataDirectGroupScan(org.apache.drill.exec.store.direct.MetadataDirectGroupScan) Metadata_V4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4) Aggregate(org.apache.calcite.rel.core.Aggregate) ScanStats(org.apache.drill.exec.physical.base.ScanStats)

Example 7 with MetadataSummary

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary in project drill by apache.

the class ConvertCountToDirectScanRule method checkMetadataForScanStats.

private Pair<Boolean, Metadata_V4.MetadataSummary> checkMetadataForScanStats(PlannerSettings settings, DrillTable drillTable, FormatSelection formatSelection) {
    // Currently only support metadata rowcount stats for Parquet tables
    FormatPluginConfig formatConfig = formatSelection.getFormat();
    if (!((formatConfig instanceof ParquetFormatConfig) || ((formatConfig instanceof NamedFormatPluginConfig) && ((NamedFormatPluginConfig) formatConfig).getName().equals("parquet")))) {
        return new ImmutablePair<>(false, null);
    }
    FileSystemPlugin plugin = (FileSystemPlugin) drillTable.getPlugin();
    DrillFileSystem fs;
    try {
        fs = new DrillFileSystem(plugin.getFormatPlugin(formatSelection.getFormat()).getFsConf());
    } catch (IOException e) {
        logger.warn("Unable to create the file system object for retrieving statistics from metadata cache file ", e);
        return new ImmutablePair<>(false, null);
    }
    // check if the cacheFileRoot has been set: this is needed because after directory pruning, the
    // cacheFileRoot could have been changed and not be the same as the original selectionRoot
    Path selectionRoot = formatSelection.getSelection().getCacheFileRoot() != null ? formatSelection.getSelection().getCacheFileRoot() : formatSelection.getSelection().getSelectionRoot();
    ParquetReaderConfig parquetReaderConfig = ParquetReaderConfig.builder().withFormatConfig((ParquetFormatConfig) formatConfig).withOptions(settings.getOptions()).build();
    Metadata_V4.MetadataSummary metadataSummary = Metadata.getSummary(fs, selectionRoot, false, parquetReaderConfig);
    return metadataSummary != null ? new ImmutablePair<>(true, metadataSummary) : new ImmutablePair<>(false, null);
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) FileSystemPlugin(org.apache.drill.exec.store.dfs.FileSystemPlugin) Metadata_V4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) FormatPluginConfig(org.apache.drill.common.logical.FormatPluginConfig) NamedFormatPluginConfig(org.apache.drill.exec.store.dfs.NamedFormatPluginConfig) IOException(java.io.IOException) ParquetFormatConfig(org.apache.drill.exec.store.parquet.ParquetFormatConfig) NamedFormatPluginConfig(org.apache.drill.exec.store.dfs.NamedFormatPluginConfig) ParquetReaderConfig(org.apache.drill.exec.store.parquet.ParquetReaderConfig)

Aggregations

SchemaPath (org.apache.drill.common.expression.SchemaPath)6 Path (org.apache.hadoop.fs.Path)5 MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary)4 ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4)4 IOException (java.io.IOException)3 Metadata_V4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4)3 ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4)3 ParquetFileAndRowCountMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata)3 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 SimpleModule (com.fasterxml.jackson.databind.module.SimpleModule)2 AfterburnerModule (com.fasterxml.jackson.module.afterburner.AfterburnerModule)2 InputStream (java.io.InputStream)2 ArrayList (java.util.ArrayList)2 LinkedHashMap (java.util.LinkedHashMap)2 ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)2 FileMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.FileMetadata)2 ParquetFileMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)2 Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 Aggregate (org.apache.calcite.rel.core.Aggregate)1