Search in sources :

Example 6 with PartitionMetadata

Use of org.apache.drill.metastore.metadata.PartitionMetadata in the Apache Drill project.

From the class BaseParquetMetadataProvider, the method getPartitionsMetadata.

/**
 * Returns the partition metadata for this provider, building it on the first call and
 * returning the value cached in {@code partitions} on every later call.
 *
 * <p>When {@code collectMetadata} is set, the files from {@code getFilesMetadataMap()} are
 * grouped by (partition column, partition value) and every group is handed to
 * {@code ParquetTableMetadataUtils.getPartitionMetadata}. Otherwise one
 * {@link PartitionMetadata} is assembled per (partition column, partition value) from the
 * partition paths alone: min and max are both the partition value, while nulls count and
 * row count are set to {@code Statistic.NO_COLUMN_STATS}.
 *
 * @return the cached list of partition metadata
 */
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
    if (partitions != null) {
        return partitions;
    }
    partitions = new ArrayList<>();

    if (collectMetadata) {
        // Rows are partition columns, columns are partition values, cells are the files
        // that hold that value for that partition column.
        Table<SchemaPath, Object, List<FileMetadata>> filesByColumnAndValue = HashBasedTable.create();
        Collection<FileMetadata> allFiles = getFilesMetadataMap().values();
        partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();
        for (FileMetadata file : allFiles) {
            for (SchemaPath column : partitionColumns) {
                Object value = getParquetGroupScanStatistics().getPartitionValue(file.getPath(), column);
                if (value == null) {
                    // Table cannot contain nulls
                    value = NULL_VALUE;
                }
                List<FileMetadata> bucket = filesByColumnAndValue.get(column, value);
                if (bucket == null) {
                    bucket = new ArrayList<>();
                    filesByColumnAndValue.put(column, value, bucket);
                }
                bucket.add(file);
            }
        }
        for (SchemaPath column : filesByColumnAndValue.rowKeySet()) {
            for (List<FileMetadata> filesOfPartition : filesByColumnAndValue.row(column).values()) {
                partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(column, filesOfPartition));
            }
        }
        return partitions;
    }

    for (SchemaPath column : getParquetGroupScanStatistics().getPartitionColumns()) {
        Map<Path, Object> valueByPath = getParquetGroupScanStatistics().getPartitionPaths(column);

        // Invert path -> value into value -> paths.
        Multimap<Object, Path> pathsByValue = HashMultimap.create();
        for (Map.Entry<Path, Object> pathAndValue : valueByPath.entrySet()) {
            pathsByValue.put(pathAndValue.getValue(), pathAndValue.getKey());
        }

        for (Map.Entry<Object, Collection<Path>> group : pathsByValue.asMap().entrySet()) {
            // Translate the NULL_VALUE placeholder back to a real null before it is stored
            // as a statistic.
            Object partitionKey = group.getKey();
            if (partitionKey == NULL_VALUE) {
                partitionKey = null;
            }

            List<StatisticsHolder<?>> holders = new ArrayList<>();
            holders.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MIN_VALUE));
            holders.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MAX_VALUE));
            holders.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
            holders.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));

            Map<SchemaPath, ColumnStatistics<?>> statsByColumn = new HashMap<>();
            statsByColumn.put(
                column,
                new ColumnStatistics<>(holders, getParquetGroupScanStatistics().getTypeForColumn(column).getMinorType()));

            MetadataInfo partitionInfo = MetadataInfo.builder()
                .type(MetadataType.PARTITION)
                .build();
            TableMetadata table = getTableMetadata();

            partitions.add(PartitionMetadata.builder()
                .tableInfo(table.getTableInfo())
                .metadataInfo(partitionInfo)
                .column(column)
                .schema(table.getSchema())
                .columnsStatistics(statsByColumn)
                .metadataStatistics(holders)
                .partitionValues(Collections.emptyList())
                .locations(new HashSet<>(group.getValue()))
                .build());
        }
    }
    return partitions;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) ArrayList(java.util.ArrayList) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)

Aggregations

PartitionMetadata (org.apache.drill.metastore.metadata.PartitionMetadata)6 SchemaPath (org.apache.drill.common.expression.SchemaPath)5 FileMetadata (org.apache.drill.metastore.metadata.FileMetadata)5 SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata)5 Path (org.apache.hadoop.fs.Path)5 ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)4 HashSet (java.util.HashSet)4 List (java.util.List)4 BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata)4 MetadataType (org.apache.drill.metastore.metadata.MetadataType)4 RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata)4 Collections (java.util.Collections)3 Map (java.util.Map)3 Set (java.util.Set)3 Collectors (java.util.stream.Collectors)3 BaseMetadata (org.apache.drill.metastore.metadata.BaseMetadata)3 MetadataInfo (org.apache.drill.metastore.metadata.MetadataInfo)3 TableInfo (org.apache.drill.metastore.metadata.TableInfo)3 ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics)3