Search in sources :

Example 21 with MetadataInfo

use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.

the class ParquetTableMetadataUtils method getFileMetadata.

/**
 * Builds a single {@link FileMetadata} by merging the given {@link RowGroupMetadata} entries.
 *
 * <p>The row count and the per-column statistics are merged across all supplied row groups.
 * Table info, path, schema and the set of columns are taken from the first row group
 * returned by the collection's iterator.
 *
 * @param rowGroups collection of {@link RowGroupMetadata} to be merged
 * @return merged {@link FileMetadata} instance, or {@code null} when {@code rowGroups} is empty
 */
public static FileMetadata getFileMetadata(Collection<RowGroupMetadata> rowGroups) {
    if (rowGroups.isEmpty()) {
        return null;
    }

    // File-level statistics: only the merged row count is recorded here.
    List<StatisticsHolder<?>> mergedStatistics = new ArrayList<>();
    mergedStatistics.add(new StatisticsHolder<>(
            TableStatisticsKind.ROW_COUNT.mergeStatistics(rowGroups),
            TableStatisticsKind.ROW_COUNT));

    // The first row group supplies the attributes that are not merged.
    RowGroupMetadata firstRowGroup = rowGroups.iterator().next();
    TupleMetadata fileSchema = firstRowGroup.getSchema();
    Set<SchemaPath> columnPaths = firstRowGroup.getColumnsStatistics().keySet();
    MetadataInfo fileInfo = MetadataInfo.builder()
            .type(MetadataType.FILE)
            .build();

    return FileMetadata.builder()
            .tableInfo(firstRowGroup.getTableInfo())
            .metadataInfo(fileInfo)
            .path(firstRowGroup.getPath())
            .schema(fileSchema)
            .columnsStatistics(TableMetadataUtils.mergeColumnsStatistics(rowGroups, columnPaths, PARQUET_COLUMN_STATISTICS))
            .metadataStatistics(mergedStatistics)
            .build();
}
Also used : StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) SchemaPath(org.apache.drill.common.expression.SchemaPath) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) ArrayList(java.util.ArrayList) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)

Example 22 with MetadataInfo

use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.

the class BaseParquetMetadataProvider method getPartitionsMetadata.

/**
 * Returns partition metadata, building it on first use and storing it in the
 * {@code partitions} field; when that field is already set it is returned as is.
 *
 * <p>When {@code collectMetadata} is set, files are grouped by (partition column, partition value)
 * and each group is converted with {@code ParquetTableMetadataUtils.getPartitionMetadata}.
 * Otherwise one {@link PartitionMetadata} is built per (partition column, partition value) from
 * the partition paths, with the partition value used as both min and max and with
 * {@code Statistic.NO_COLUMN_STATS} used for the nulls count and row count.
 *
 * @return list of {@link PartitionMetadata}
 */
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
    if (partitions != null) {
        return partitions;
    }
    partitions = new ArrayList<>();

    if (!collectMetadata) {
        for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
            Map<Path, Object> valueByPath = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);

            // Invert path -> value into value -> paths.
            Multimap<Object, Path> pathsByValue = HashMultimap.create();
            for (Map.Entry<Path, Object> pathEntry : valueByPath.entrySet()) {
                pathsByValue.put(pathEntry.getValue(), pathEntry.getKey());
            }

            for (Map.Entry<Object, Collection<Path>> group : pathsByValue.asMap().entrySet()) {
                // The NULL_VALUE placeholder is turned back into a real null for the statistics.
                Object boundaryValue = group.getKey() == NULL_VALUE ? null : group.getKey();

                List<StatisticsHolder<?>> statistics = new ArrayList<>();
                statistics.add(new StatisticsHolder<>(boundaryValue, ColumnStatisticsKind.MIN_VALUE));
                statistics.add(new StatisticsHolder<>(boundaryValue, ColumnStatisticsKind.MAX_VALUE));
                statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
                statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));

                Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
                columnsStatistics.put(
                        partitionColumn,
                        new ColumnStatistics<>(
                                statistics,
                                getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));

                MetadataInfo partitionInfo = MetadataInfo.builder()
                        .type(MetadataType.PARTITION)
                        .build();
                TableMetadata table = getTableMetadata();
                partitions.add(PartitionMetadata.builder()
                        .tableInfo(table.getTableInfo())
                        .metadataInfo(partitionInfo)
                        .column(partitionColumn)
                        .schema(table.getSchema())
                        .columnsStatistics(columnsStatistics)
                        .metadataStatistics(statistics)
                        .partitionValues(Collections.emptyList())
                        .locations(new HashSet<>(group.getValue()))
                        .build());
            }
        }
    } else {
        // row key: partition column, column key: partition value, cell: files with that value
        Table<SchemaPath, Object, List<FileMetadata>> filesByColumnAndValue = HashBasedTable.create();
        Collection<FileMetadata> allFiles = getFilesMetadataMap().values();
        partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();

        for (FileMetadata file : allFiles) {
            for (SchemaPath column : partitionColumns) {
                Object rawValue = getParquetGroupScanStatistics().getPartitionValue(file.getPath(), column);
                // Table cannot contain nulls, so a placeholder stands in for a missing value
                Object tableKey = rawValue != null ? rawValue : NULL_VALUE;

                List<FileMetadata> bucket = filesByColumnAndValue.get(column, tableKey);
                if (bucket == null) {
                    bucket = new ArrayList<>();
                    filesByColumnAndValue.put(column, tableKey, bucket);
                }
                bucket.add(file);
            }
        }

        for (SchemaPath column : filesByColumnAndValue.rowKeySet()) {
            for (List<FileMetadata> filesForValue : filesByColumnAndValue.row(column).values()) {
                partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(column, filesForValue));
            }
        }
    }
    return partitions;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) ArrayList(java.util.ArrayList) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)

Example 23 with MetadataInfo

use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.

the class BaseParquetMetadataProvider method getTableMetadata.

/**
 * Returns table-level metadata, building it on first use and storing it in the
 * {@code tableMetadata} field; when that field is already set it is returned as is.
 *
 * <p>The schema is either created from the fields resolved from {@code parquetTableMetadata}
 * (when no schema is set yet) or extended with any resolved fields it does not already contain.
 * Column statistics are merged from file (or row group) metadata when {@code collectMetadata}
 * is set; otherwise they are assembled from the group scan statistics and the estimated
 * statistics of {@code statsTable}.
 *
 * @return {@link TableMetadata} instance
 */
@Override
public TableMetadata getTableMetadata() {
    if (tableMetadata != null) {
        return tableMetadata;
    }

    List<StatisticsHolder<?>> tableStatistics = new ArrayList<>(DrillStatsTable.getEstimatedTableStats(statsTable));
    Map<SchemaPath, TypeProtos.MajorType> fields = ParquetTableMetadataUtils.resolveFields(parquetTableMetadata);
    Map<SchemaPath, TypeProtos.MajorType> intermediateFields = ParquetTableMetadataUtils.resolveIntermediateFields(parquetTableMetadata);

    if (this.schema != null) {
        // merges specified schema with schema from table: only missing columns are added
        for (Map.Entry<SchemaPath, TypeProtos.MajorType> field : fields.entrySet()) {
            if (SchemaPathUtils.getColumnMetadata(field.getKey(), schema) == null) {
                SchemaPathUtils.addColumnMetadata(schema, field.getKey(), field.getValue(), intermediateFields);
            }
        }
    } else {
        schema = new TupleSchema();
        for (Map.Entry<SchemaPath, TypeProtos.MajorType> field : fields.entrySet()) {
            SchemaPathUtils.addColumnMetadata(schema, field.getKey(), field.getValue(), intermediateFields);
        }
    }

    Map<SchemaPath, ColumnStatistics<?>> columnsStatistics;
    if (!collectMetadata) {
        columnsStatistics = new HashMap<>();
        tableStatistics.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount(), TableStatisticsKind.ROW_COUNT));

        // Columns known to a materialized stats table; those also present in fields are removed below.
        Set<SchemaPath> statsOnlyColumns = new HashSet<>();
        if (statsTable != null && statsTable.isMaterialized()) {
            statsOnlyColumns.addAll(statsTable.getColumns());
        }

        for (Map.Entry<SchemaPath, TypeProtos.MajorType> field : fields.entrySet()) {
            SchemaPath columnPath = field.getKey();
            long columnValueCount = getParquetGroupScanStatistics().getColumnValueCount(columnPath);
            // Start from the estimated statistics, then append value count and nulls count
            List<StatisticsHolder<?>> columnStats = new ArrayList<>(DrillStatsTable.getEstimatedColumnStats(statsTable, columnPath));
            statsOnlyColumns.remove(columnPath);
            columnStats.add(new StatisticsHolder<>(columnValueCount, TableStatisticsKind.ROW_COUNT));
            columnStats.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount() - columnValueCount, ColumnStatisticsKind.NULLS_COUNT));
            columnsStatistics.put(columnPath, new ColumnStatistics<>(columnStats, field.getValue().getMinorType()));
        }

        // Remaining stats-table columns were not among the resolved fields: estimated statistics only.
        for (SchemaPath column : statsOnlyColumns) {
            columnsStatistics.put(column, new ColumnStatistics<>(DrillStatsTable.getEstimatedColumnStats(statsTable, column)));
        }
    } else {
        // Prefer file-level metadata; fall back to row groups when there is none.
        Collection<? extends BaseMetadata> metadata = getFilesMetadataMap().values();
        if (metadata.isEmpty()) {
            metadata = getRowGroupsMeta();
        }
        tableStatistics.add(new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(metadata), TableStatisticsKind.ROW_COUNT));
        columnsStatistics = TableMetadataUtils.mergeColumnsStatistics(metadata, fields.keySet(), PARQUET_COLUMN_STATISTICS);
    }

    MetadataInfo tableInfoMarker = MetadataInfo.builder()
            .type(MetadataType.TABLE)
            .build();
    tableMetadata = BaseTableMetadata.builder()
            .tableInfo(TableInfo.UNKNOWN_TABLE_INFO)
            .metadataInfo(tableInfoMarker)
            .location(tableLocation)
            .schema(schema)
            .columnsStatistics(columnsStatistics)
            .metadataStatistics(tableStatistics)
            .partitionKeys(Collections.emptyMap())
            .build();
    return tableMetadata;
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) ArrayList(java.util.ArrayList) TupleSchema(org.apache.drill.exec.record.metadata.TupleSchema) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) HashSet(java.util.HashSet)

Aggregations

MetadataInfo (org.apache.drill.metastore.metadata.MetadataInfo)23 SchemaPath (org.apache.drill.common.expression.SchemaPath)21 StatisticsHolder (org.apache.drill.metastore.statistics.StatisticsHolder)16 Path (org.apache.hadoop.fs.Path)16 TableInfo (org.apache.drill.metastore.metadata.TableInfo)15 ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics)14 HashMap (java.util.HashMap)13 HashSet (java.util.HashSet)13 List (java.util.List)13 BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata)13 FileMetadata (org.apache.drill.metastore.metadata.FileMetadata)13 ArrayList (java.util.ArrayList)12 Collections (java.util.Collections)12 Collectors (java.util.stream.Collectors)12 TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata)12 MetadataType (org.apache.drill.metastore.metadata.MetadataType)12 Map (java.util.Map)11 Set (java.util.Set)10 TypeProtos (org.apache.drill.common.types.TypeProtos)10 PlannerSettings (org.apache.drill.exec.planner.physical.PlannerSettings)10