Search in sources :

Example 1 with TableMetadata

use of org.apache.drill.metastore.metadata.TableMetadata in project drill by apache.

The class BaseParquetMetadataProvider, method init.

/**
 * Initializes this provider's metadata, either by building it from scratch or by
 * reusing the relevant subset of metadata held by an existing provider.
 *
 * <p>Metadata is rebuilt via {@code initializeMetadata()} when {@code metadataProvider}
 * is {@code null}, or when its non-null {@code rowGroups} / {@code files} collections do
 * not contain every location returned by {@code getLocations()}. Otherwise the existing
 * provider's files and row groups are filtered down to those locations and reused.
 *
 * @param metadataProvider provider whose metadata may be reused; may be {@code null}
 * @throws IOException declared by this method; presumably raised while reading metadata
 *         in the called helpers (not visible here)
 */
protected void init(BaseParquetMetadataProvider metadataProvider) throws IOException {
    // Once deserialization for metadata is provided, initInternal() call should be removed
    // and only files list is deserialized based on specified locations
    initInternal();
    assert parquetTableMetadata != null;
    // Derive the file set from the parquet table metadata only if it was not supplied already.
    if (fileSet == null) {
        fileSet = new HashSet<>();
        fileSet.addAll(parquetTableMetadata.getFiles().stream().map(MetadataBase.ParquetFileMetadata::getPath).collect(Collectors.toSet()));
    }
    List<Path> fileLocations = getLocations();
    // obtains metadata from cache files or table footers
    if (metadataProvider == null || (metadataProvider.rowGroups != null && !metadataProvider.rowGroups.keySet().containsAll(fileLocations)) || (metadataProvider.files != null && !metadataProvider.files.keySet().containsAll(fileLocations))) {
        initializeMetadata();
    } else {
        // reuse metadata from existing TableMetadataProvider
        // NOTE(review): files.size() dereferences this provider's own 'files' field;
        // assumes it is non-null at this point - confirm against initInternal().
        if (metadataProvider.files != null && metadataProvider.files.size() != files.size()) {
            // Keep only the entries of the existing provider that belong to the current locations.
            files = metadataProvider.files.entrySet().stream().filter(entry -> fileLocations.contains(entry.getKey())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
        }
        if (metadataProvider.rowGroups != null) {
            // Copy the row groups of the current locations into a fresh multimap.
            rowGroups = LinkedListMultimap.create();
            metadataProvider.rowGroups.entries().stream().filter(entry -> fileLocations.contains(entry.getKey())).forEach(entry -> rowGroups.put(entry.getKey(), entry.getValue()));
        }
        // The getters below are invoked for their effect and their results are discarded;
        // presumably they populate lazily initialized fields before parquetTableMetadata
        // is cleared - TODO confirm against the getter implementations.
        TableMetadata tableMetadata = getTableMetadata();
        getSegmentsMetadataMap();
        getPartitionsMetadata();
        getRowGroupsMeta();
        getNonInterestingColumnsMetadata();
        // Replace the table metadata with a copy whose row count is derived from the row groups.
        // NOTE(review): getRowGroupsMeta() is called a second time here; looks like it relies
        // on the result being cached by the call above - confirm.
        this.tableMetadata = TableMetadataUtils.updateRowCount(tableMetadata, getRowGroupsMeta());
        // Drop the reference to the raw parquet metadata once the derived metadata has been built.
        parquetTableMetadata = null;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataType(org.apache.drill.metastore.metadata.MetadataType) LoggerFactory(org.slf4j.LoggerFactory) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) TupleSchema(org.apache.drill.exec.record.metadata.TupleSchema) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) SchemaProvider(org.apache.drill.exec.record.metadata.schema.SchemaProvider) HashMultimap(org.apache.drill.shaded.guava.com.google.common.collect.HashMultimap) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) Set(java.util.Set) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) TypeProtos(org.apache.drill.common.types.TypeProtos) List(java.util.List) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) TableMetadataUtils(org.apache.drill.metastore.util.TableMetadataUtils) TableInfo(org.apache.drill.metastore.metadata.TableInfo) Table(org.apache.drill.shaded.guava.com.google.common.collect.Table) HashMap(java.util.HashMap) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) 
LinkedHashMap(java.util.LinkedHashMap) NonInterestingColumnsMetadata(org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) PARQUET_COLUMN_STATISTICS(org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils.PARQUET_COLUMN_STATISTICS) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) SchemaPathUtils(org.apache.drill.metastore.util.SchemaPathUtils) Logger(org.slf4j.Logger) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) HashBasedTable(org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable) IOException(java.io.IOException) ParquetMetadataProvider(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProvider) LinkedListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap) Statistic(org.apache.drill.metastore.statistics.Statistic) LocationProvider(org.apache.drill.metastore.metadata.LocationProvider) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) ParquetMetadataProviderBuilder(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProviderBuilder) Collections(java.util.Collections) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata)

Example 2 with TableMetadata

use of org.apache.drill.metastore.metadata.TableMetadata in project drill by apache.

The class DrillRelMdDistinctRowCount, method getDistinctRowCountInternal.

/**
 * Estimates how many rows a GROUP BY over the columns selected by {@code groupKey}
 * would produce for the given scan.
 *
 * <p>The estimate is the product of the NDV (number of distinct values) statistics of the
 * grouping columns, each scaled by the selectivity of the predicate on that column when it
 * is positive, and capped by the number of rows that survive the predicate. If table
 * metadata cannot be read, or any grouping column lacks an NDV statistic, the method falls
 * back to 10% of the scan's estimated row count.
 *
 * @param scan      table scan being estimated
 * @param mq        metadata query used for selectivity and row count lookups
 * @param table     Drill table whose group scan supplies the table metadata
 * @param groupKey  bit set marking the grouping columns
 * @param type      row type used to resolve column names by index
 * @param predicate filter applied to the scan
 * @return estimated number of distinct rows
 */
private Double getDistinctRowCountInternal(TableScan scan, RelMetadataQuery mq, DrillTable table, ImmutableBitSet groupKey, RelDataType type, RexNode predicate) {
    // Selectivity of the predicate and the unfiltered row count; their product is the
    // number of rows left after filtering.
    double selectivity = mq.getSelectivity(scan, predicate);
    double rowCount = mq.getRowCount(scan);
    if (groupKey.length() == 0) {
        return selectivity * rowCount;
    }

    TableMetadata tableMetadata;
    try {
        tableMetadata = table.getGroupScan().getTableMetadata();
    } catch (IOException e) {
        // Statistics cannot be obtained, use default behaviour
        return scan.estimateRowCount(mq) * 0.1;
    }

    double estRowCnt = 1.0;
    String colName = "";
    boolean ndvMissing = false;
    for (int idx = 0; idx < groupKey.length(); idx++) {
        colName = type.getFieldNames().get(idx);
        if (!groupKey.get(idx)) {
            continue;
        }

        ColumnStatistics<?> columnStatistics = null;
        if (tableMetadata != null) {
            columnStatistics = tableMetadata.getColumnStatistics(SchemaPath.getSimplePath(colName));
        }
        Double ndv = null;
        if (columnStatistics != null) {
            ndv = ColumnStatisticsKind.NDV.getFrom(columnStatistics);
        }
        // Without an NDV for this column the product is meaningless; stop here
        if (ndv == null) {
            ndvMissing = true;
            break;
        }
        estRowCnt *= ndv;

        /* If predicate is on group-by column, scale down the NDV by selectivity. Consider the query
         * select a, b from t where a = 10 group by a, b. Here, NDV(a) will be scaled down by SEL(a)
         * whereas NDV(b) will not.
         */
        double gbyColPredSel = getPredSelectivityContainingInputRef(predicate, idx, mq, scan);
        if (gbyColPredSel > 0) {
            estRowCnt *= gbyColPredSel;
        }
    }

    if (ndvMissing) {
        if (logger.isDebugEnabled()) {
            logger.debug(String.format("NDV not available for %s(%s). Using default rowcount for group-by %s", (tableMetadata != null ? tableMetadata.getTableInfo().name() : ""), colName, groupKey.toString()));
        }
        // Could not get any NDV estimate from stats - probably stats not present for GBY cols. So Guess!
        return scan.estimateRowCount(mq) * 0.1;
    }

    // Estimated NDV should not exceed number of rows after applying the filters
    return Math.min(estRowCnt, selectivity * rowCount);
}
Also used : TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) IOException(java.io.IOException)

Example 3 with TableMetadata

use of org.apache.drill.metastore.metadata.TableMetadata in project drill by apache.

The class DrillRelMdSelectivity, method getScanSelectivity.

/**
 * Estimates the selectivity of {@code predicate} when applied to a scan node.
 *
 * <p>Resolution order:
 * <ol>
 *   <li>For a {@code DbGroupScan} with statistics enabled, the ratio of filtered rows to
 *       total rows (capped at 1.0), when both counts are known and the total is positive.</li>
 *   <li>For a {@code TableScan} whose table metadata reports descriptive statistics, the
 *       result of {@code getScanSelectivityInternal}.</li>
 *   <li>Otherwise the estimate of the parent implementation.</li>
 * </ol>
 *
 * @param rel       relational node to estimate selectivity for
 * @param mq        metadata query passed through to the parent implementation
 * @param predicate filter whose selectivity is estimated
 * @return estimated selectivity
 */
private Double getScanSelectivity(RelNode rel, RelMetadataQuery mq, RexNode predicate) {
    // Sentinel the row count lookups below are compared against to detect "unknown".
    final double rowCountUnknown = -1.0;
    GroupScan scan = null;
    PlannerSettings settings = PrelUtil.getPlannerSettings(rel.getCluster().getPlanner());
    final RexBuilder rexBuilder = rel.getCluster().getRexBuilder();
    if (rel instanceof DrillScanRel) {
        scan = ((DrillScanRel) rel).getGroupScan();
    } else if (rel instanceof ScanPrel) {
        scan = ((ScanPrel) rel).getGroupScan();
    }
    // instanceof is false for null, so no separate null check on scan is needed.
    if (scan instanceof DbGroupScan && settings.isStatisticsEnabled()) {
        DbGroupScan dbScan = (DbGroupScan) scan;
        double filterRows = dbScan.getRowCount(predicate, rel);
        double totalRows = dbScan.getRowCount(null, rel);
        if (filterRows != rowCountUnknown && totalRows != rowCountUnknown && totalRows > 0) {
            return Math.min(1.0, filterRows / totalRows);
        }
    }
    // Do not mess with statistics used for DBGroupScans.
    if (rel instanceof TableScan) {
        if (DrillRelOptUtil.guessRows(rel)) {
            return super.getSelectivity(rel, mq, predicate);
        }
        DrillTable table = Utilities.getDrillTable(rel.getTable());
        try {
            TableMetadata tableMetadata;
            if (table != null && (tableMetadata = table.getGroupScan().getTableMetadata()) != null && TableStatisticsKind.HAS_DESCRIPTIVE_STATISTICS.getValue(tableMetadata)) {
                List<SchemaPath> fieldNames;
                if (rel instanceof DrillScanRelBase) {
                    fieldNames = ((DrillScanRelBase) rel).getGroupScan().getColumns();
                } else {
                    fieldNames = rel.getRowType().getFieldNames().stream().map(SchemaPath::getSimplePath).collect(Collectors.toList());
                }
                return getScanSelectivityInternal(tableMetadata, predicate, fieldNames, rexBuilder);
            }
        } catch (IOException ignored) {
            // Table metadata could not be read. Fall through to the default estimate below
            // instead of computing it here and discarding the result.
        }
    }
    return super.getSelectivity(rel, mq, predicate);
}
Also used : TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) TableScan(org.apache.calcite.rel.core.TableScan) DrillScanRel(org.apache.drill.exec.planner.logical.DrillScanRel) ScanPrel(org.apache.drill.exec.planner.physical.ScanPrel) DrillTable(org.apache.drill.exec.planner.logical.DrillTable) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) IOException(java.io.IOException) DbGroupScan(org.apache.drill.exec.physical.base.DbGroupScan) GroupScan(org.apache.drill.exec.physical.base.GroupScan) SchemaPath(org.apache.drill.common.expression.SchemaPath) DbGroupScan(org.apache.drill.exec.physical.base.DbGroupScan) DrillScanRelBase(org.apache.drill.exec.planner.common.DrillScanRelBase) RexBuilder(org.apache.calcite.rex.RexBuilder)

Example 4 with TableMetadata

use of org.apache.drill.metastore.metadata.TableMetadata in project drill by apache.

The class BaseParquetMetadataProvider, method getPartitionsMetadata.

/**
 * Returns the partition metadata of the table, building and caching it on first use.
 *
 * <p>When {@code collectMetadata} is set, files are grouped by (partition column, partition
 * value) and one partition is produced per group from the grouped file metadata. Otherwise
 * one partition is produced per distinct value of every partition column, using the
 * value as both min and max and carrying no null count or row count statistics.
 *
 * @return list of partition metadata; the same list instance on subsequent calls
 */
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
    if (partitions != null) {
        return partitions;
    }
    // The field is assigned before it is filled, exactly as callers observed before.
    partitions = new ArrayList<>();

    if (collectMetadata) {
        Table<SchemaPath, Object, List<FileMetadata>> filesByColumnAndValue = HashBasedTable.create();
        Collection<FileMetadata> allFiles = getFilesMetadataMap().values();
        partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();
        for (FileMetadata file : allFiles) {
            for (SchemaPath column : partitionColumns) {
                Object rawValue = getParquetGroupScanStatistics().getPartitionValue(file.getPath(), column);
                // Table cannot contain nulls
                Object valueKey = rawValue != null ? rawValue : NULL_VALUE;
                List<FileMetadata> bucket = filesByColumnAndValue.get(column, valueKey);
                if (bucket == null) {
                    bucket = new ArrayList<>();
                    filesByColumnAndValue.put(column, valueKey, bucket);
                }
                bucket.add(file);
            }
        }
        // One partition per (column, value) group of files.
        for (SchemaPath column : filesByColumnAndValue.rowKeySet()) {
            for (List<FileMetadata> groupedFiles : filesByColumnAndValue.row(column).values()) {
                partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(column, groupedFiles));
            }
        }
        return partitions;
    }

    for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
        Map<Path, Object> valueByPath = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);
        // Invert path -> value into value -> paths.
        Multimap<Object, Path> pathsByValue = HashMultimap.create();
        for (Map.Entry<Path, Object> pathEntry : valueByPath.entrySet()) {
            pathsByValue.put(pathEntry.getValue(), pathEntry.getKey());
        }

        for (Map.Entry<Object, Collection<Path>> group : pathsByValue.asMap().entrySet()) {
            Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
            List<StatisticsHolder<?>> statistics = new ArrayList<>();
            // Translate the null placeholder back to a real null for the statistics.
            Object statValue = group.getKey() == NULL_VALUE ? null : group.getKey();
            statistics.add(new StatisticsHolder<>(statValue, ColumnStatisticsKind.MIN_VALUE));
            statistics.add(new StatisticsHolder<>(statValue, ColumnStatisticsKind.MAX_VALUE));
            statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
            statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));
            columnsStatistics.put(partitionColumn, new ColumnStatistics<>(statistics, getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));
            MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
            TableMetadata tableMetadata = getTableMetadata();
            PartitionMetadata partitionMetadata = PartitionMetadata.builder()
                    .tableInfo(tableMetadata.getTableInfo())
                    .metadataInfo(metadataInfo)
                    .column(partitionColumn)
                    .schema(tableMetadata.getSchema())
                    .columnsStatistics(columnsStatistics)
                    .metadataStatistics(statistics)
                    .partitionValues(Collections.emptyList())
                    .locations(new HashSet<>(group.getValue()))
                    .build();
            partitions.add(partitionMetadata);
        }
    }
    return partitions;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) ArrayList(java.util.ArrayList) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)

Aggregations

TableMetadata (org.apache.drill.metastore.metadata.TableMetadata)4 IOException (java.io.IOException)3 SchemaPath (org.apache.drill.common.expression.SchemaPath)3 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 LinkedHashMap (java.util.LinkedHashMap)2 List (java.util.List)2 ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath)2 BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata)2 FileMetadata (org.apache.drill.metastore.metadata.FileMetadata)2 Collection (java.util.Collection)1 Collections (java.util.Collections)1 Map (java.util.Map)1 Objects (java.util.Objects)1 Set (java.util.Set)1 Function (java.util.function.Function)1 Collectors (java.util.stream.Collectors)1 TableScan (org.apache.calcite.rel.core.TableScan)1 RexBuilder (org.apache.calcite.rex.RexBuilder)1