Search in sources :

Example 36 with ColumnStatistics

use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.

the class ParquetGroupScanStatistics method collect.

/**
 * Rebuilds the collected statistics from the given metadata entries.
 * <p>
 * The holders are reset first. Then, for every metadata entry and every column it has
 * statistics for, this method:
 * <ul>
 *   <li>adds the number of non-null values ({@code rowCount - nullsCount}) to the column's
 *       value count, or marks the count as {@code Statistic.NO_COLUMN_STATS} once the nulls
 *       count is missing or unknown for any entry;</li>
 *   <li>keeps the column in {@code partitionColTypeMap} only while it still qualifies as a
 *       partition column, recording its value per path in {@code partitionValueMap}.</li>
 * </ul>
 * The row count of every entry is added to the total row count.
 *
 * @param metadataList metadata entries to collect statistics from
 */
public void collect(Collection<T> metadataList) {
    resetHolders();
    boolean first = true;
    for (T metadata : metadataList) {
        long localRowCount = TableStatisticsKind.ROW_COUNT.getValue(metadata);
        for (Map.Entry<SchemaPath, ColumnStatistics<?>> columnsStatistics : metadata.getColumnsStatistics().entrySet()) {
            SchemaPath schemaPath = columnsStatistics.getKey();
            ColumnStatistics<?> statistics = columnsStatistics.getValue();
            // Registers a fresh counter for columns seen for the first time.
            MutableLong emptyCount = new MutableLong();
            MutableLong previousCount = columnValueCounts.putIfAbsent(schemaPath, emptyCount);
            if (previousCount == null) {
                previousCount = emptyCount;
            }
            Long nullsNum = ColumnStatisticsKind.NULLS_COUNT.getFrom(statistics);
            if (previousCount.longValue() != Statistic.NO_COLUMN_STATS && nullsNum != null && nullsNum != Statistic.NO_COLUMN_STATS) {
                previousCount.add(localRowCount - nullsNum);
            } else {
                // Once the nulls count is unknown for one entry, the total stays unknown.
                previousCount.setValue(Statistic.NO_COLUMN_STATS);
            }
            ColumnMetadata columnMetadata = SchemaPathUtils.getColumnMetadata(schemaPath, metadata.getSchema());
            // DRILL-7934: SchemaPathUtils.getColumnMetadata skips list schemas
            // (see metastore/metastore-api/src/main/java/org/apache/drill/metastore/util/SchemaPathUtils.java#145),
            // so the major type cannot always be resolved by schema path here.
            // A missing type is treated as "not a partition column" to avoid a NullPointerException.
            TypeProtos.MajorType majorType = columnMetadata != null ? columnMetadata.majorType() : null;
            boolean partitionColumn = majorType != null && checkForPartitionColumn(statistics, first, localRowCount, majorType, schemaPath);
            if (partitionColumn) {
                Object value = partitionValueMap.get(metadata.getPath(), schemaPath);
                Object currentValue = ColumnStatisticsKind.MAX_VALUE.getFrom(statistics);
                // NULL_VALUE is a sentinel, so identity comparison is intended for it.
                if (value != null && value != BaseParquetMetadataProvider.NULL_VALUE) {
                    // Compares by value rather than by reference: statistics values coming from
                    // different metadata entries are distinct object instances even when equal.
                    if (!value.equals(currentValue)) {
                        partitionColTypeMap.remove(schemaPath);
                    }
                } else {
                    // Checks that all values are really null and puts the null marker into the map.
                    // Reuses the nulls count fetched above; the null guard avoids unboxing
                    // a missing statistic.
                    if (nullsNum != null && localRowCount == nullsNum) {
                        partitionValueMap.put(metadata.getPath(), schemaPath, BaseParquetMetadataProvider.NULL_VALUE);
                    } else {
                        partitionValueMap.put(metadata.getPath(), schemaPath, currentValue);
                    }
                }
            } else {
                partitionColTypeMap.remove(schemaPath);
            }
        }
        this.rowCount += localRowCount;
        first = false;
    }
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) TypeProtos(org.apache.drill.common.types.TypeProtos) MutableLong(org.apache.commons.lang3.mutable.MutableLong) SchemaPath(org.apache.drill.common.expression.SchemaPath) MutableLong(org.apache.commons.lang3.mutable.MutableLong) HashMap(java.util.HashMap) Map(java.util.Map)

Example 37 with ColumnStatistics

use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.

the class ParquetTableMetadataUtils method getRowGroupColumnStatistics.

/**
 * Builds per-column statistics for a single row group.
 * <p>
 * For every column of the row group, the minimum value, the maximum value and the nulls count
 * are wrapped into a {@link ColumnStatistics} instance keyed by the column's {@link SchemaPath}.
 * When the column statistics are reported as invalid, the nulls count is replaced with
 * {@code Statistic.NO_COLUMN_STATS}.
 *
 * @param tableMetadata    table metadata used to resolve column types
 * @param rowGroupMetadata row group whose column metadata is converted
 * @return map of column path to the statistics built for that column
 */
public static Map<SchemaPath, ColumnStatistics<?>> getRowGroupColumnStatistics(MetadataBase.ParquetTableMetadataBase tableMetadata, MetadataBase.RowGroupMetadata rowGroupMetadata) {
    Map<SchemaPath, ColumnStatistics<?>> statisticsByColumn = new HashMap<>();
    for (MetadataBase.ColumnMetadata columnMeta : rowGroupMetadata.getColumns()) {
        SchemaPath columnPath = SchemaPath.getCompoundPath(columnMeta.getName());

        // The stored nulls count is only trusted when the column statistics are valid.
        Long nullsCount = columnMeta.getNulls();
        if (hasInvalidStatistics(columnMeta, tableMetadata)) {
            nullsCount = Statistic.NO_COLUMN_STATS;
        }

        PrimitiveType.PrimitiveTypeName primitiveType = getPrimitiveTypeName(tableMetadata, columnMeta);
        OriginalType originalType = getOriginalType(tableMetadata, columnMeta);
        TypeProtos.MinorType minorType = ParquetReaderUtility.getMinorType(primitiveType, originalType);

        StatisticsHolder<?> minHolder = new StatisticsHolder<>(
            getValue(columnMeta.getMinValue(), primitiveType, originalType), ColumnStatisticsKind.MIN_VALUE);
        StatisticsHolder<?> maxHolder = new StatisticsHolder<>(
            getValue(columnMeta.getMaxValue(), primitiveType, originalType), ColumnStatisticsKind.MAX_VALUE);
        StatisticsHolder<?> nullsHolder = new StatisticsHolder<>(nullsCount, ColumnStatisticsKind.NULLS_COUNT);

        List<StatisticsHolder<?>> holders = new ArrayList<>();
        holders.add(minHolder);
        holders.add(maxHolder);
        holders.add(nullsHolder);

        statisticsByColumn.put(columnPath, new ColumnStatistics<>(holders, minorType));
    }
    return statisticsByColumn;
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) TypeProtos(org.apache.drill.common.types.TypeProtos) OriginalType(org.apache.parquet.schema.OriginalType) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 38 with ColumnStatistics

use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.

the class ParquetTableMetadataUtils method getNonInterestingColumnsMeta.

/**
 * Collects metadata for the columns that are flagged as non-interesting.
 * <p>
 * Only {@code Metadata_V4.ParquetTableMetadata_v4} instances with a non-null column type info
 * map contribute entries; any other input yields metadata backed by an empty statistics map.
 * Each non-interesting column gets a single {@code NULLS_COUNT} statistic set to
 * {@code Statistic.NO_COLUMN_STATS}.
 *
 * @param parquetTableMetadata the source of column metadata for non-interesting column's statistics
 * @return non-interesting columns metadata
 */
public static NonInterestingColumnsMetadata getNonInterestingColumnsMeta(MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
    Map<SchemaPath, ColumnStatistics<?>> nonInterestingStats = new HashMap<>();

    if (!(parquetTableMetadata instanceof Metadata_V4.ParquetTableMetadata_v4)) {
        return new NonInterestingColumnsMetadata(nonInterestingStats);
    }

    Metadata_V4.ParquetTableMetadata_v4 metadataV4 = (Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata;
    Map<Metadata_V4.ColumnTypeMetadata_v4.Key, Metadata_V4.ColumnTypeMetadata_v4> typeInfoByKey = metadataV4.getColumnTypeInfoMap();

    if (typeInfoByKey != null) {
        for (Metadata_V4.ColumnTypeMetadata_v4 typeInfo : typeInfoByKey.values()) {
            if (typeInfo.isInteresting) {
                continue;
            }
            SchemaPath columnPath = SchemaPath.getCompoundPath(typeInfo.name);
            List<StatisticsHolder<?>> holders = new ArrayList<>();
            holders.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
            TypeProtos.MinorType minorType = ParquetReaderUtility.getMinorType(typeInfo.primitiveType, typeInfo.originalType);
            nonInterestingStats.put(columnPath, new ColumnStatistics<>(holders, minorType));
        }
    }
    return new NonInterestingColumnsMetadata(nonInterestingStats);
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) NonInterestingColumnsMetadata(org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) TypeProtos(org.apache.drill.common.types.TypeProtos) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) OriginalType(org.apache.parquet.schema.OriginalType) Metadata_V4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 39 with ColumnStatistics

use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.

the class BaseParquetMetadataProvider method getPartitionsMetadata.

/**
 * Returns partitions metadata, building and caching it in {@code partitions} on first use.
 * <p>
 * When {@code collectMetadata} is set, files are grouped by partition column and partition
 * value, and each group is converted with
 * {@code ParquetTableMetadataUtils.getPartitionMetadata}. Otherwise, partition metadata is
 * assembled from the partition paths known to the group scan statistics, with the nulls count
 * and the row count set to {@code Statistic.NO_COLUMN_STATS}.
 *
 * @return list of partitions metadata
 */
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
    if (partitions != null) {
        return partitions;
    }
    partitions = new ArrayList<>();

    if (collectMetadata) {
        Table<SchemaPath, Object, List<FileMetadata>> filesByColumnAndValue = HashBasedTable.create();
        Collection<FileMetadata> allFiles = getFilesMetadataMap().values();
        partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();
        for (FileMetadata file : allFiles) {
            for (SchemaPath column : partitionColumns) {
                Object rawValue = getParquetGroupScanStatistics().getPartitionValue(file.getPath(), column);
                // Table cannot contain nulls, so NULL_VALUE stands in for a missing value
                Object columnValue = rawValue == null ? NULL_VALUE : rawValue;
                List<FileMetadata> filesForValue = filesByColumnAndValue.get(column, columnValue);
                if (filesForValue == null) {
                    filesForValue = new ArrayList<>();
                    filesByColumnAndValue.put(column, columnValue, filesForValue);
                }
                filesForValue.add(file);
            }
        }
        for (SchemaPath column : filesByColumnAndValue.rowKeySet()) {
            for (List<FileMetadata> filesForValue : filesByColumnAndValue.row(column).values()) {
                partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(column, filesForValue));
            }
        }
        return partitions;
    }

    for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
        Map<Path, Object> valueByPath = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);

        // Inverts the path -> value mapping so that paths are grouped by partition value.
        Multimap<Object, Path> pathsByValue = HashMultimap.create();
        for (Map.Entry<Path, Object> pathAndValue : valueByPath.entrySet()) {
            pathsByValue.put(pathAndValue.getValue(), pathAndValue.getKey());
        }

        for (Map.Entry<Object, Collection<Path>> group : pathsByValue.asMap().entrySet()) {
            Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
            List<StatisticsHolder<?>> statistics = new ArrayList<>();

            // Translates the NULL_VALUE marker back to an actual null.
            Object partitionKey = group.getKey();
            if (partitionKey == NULL_VALUE) {
                partitionKey = null;
            }

            statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MIN_VALUE));
            statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MAX_VALUE));
            statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
            statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));
            columnsStatistics.put(partitionColumn,
                new ColumnStatistics<>(statistics, getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));

            MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
            TableMetadata tableMetadata = getTableMetadata();
            PartitionMetadata partitionMetadata = PartitionMetadata.builder()
                .tableInfo(tableMetadata.getTableInfo())
                .metadataInfo(metadataInfo)
                .column(partitionColumn)
                .schema(tableMetadata.getSchema())
                .columnsStatistics(columnsStatistics)
                .metadataStatistics(statistics)
                .partitionValues(Collections.emptyList())
                .locations(new HashSet<>(group.getValue()))
                .build();
            partitions.add(partitionMetadata);
        }
    }
    return partitions;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) ArrayList(java.util.ArrayList) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)

Example 40 with ColumnStatistics

use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.

the class BaseParquetMetadataProvider method getTableMetadata.

/**
 * Returns table metadata, building and caching it in {@code tableMetadata} on first use.
 * <p>
 * The schema is either created from the fields resolved from the parquet table metadata or,
 * when a schema was already specified, extended with the resolved fields it does not contain.
 * Column statistics are merged from files (or row groups) metadata when
 * {@code collectMetadata} is set; otherwise they are derived from the group scan statistics
 * and the estimated statistics of the stats table.
 *
 * @return table metadata
 */
@Override
public TableMetadata getTableMetadata() {
    if (tableMetadata != null) {
        return tableMetadata;
    }

    List<StatisticsHolder<?>> tableStatistics = new ArrayList<>(DrillStatsTable.getEstimatedTableStats(statsTable));
    Map<SchemaPath, TypeProtos.MajorType> fields = ParquetTableMetadataUtils.resolveFields(parquetTableMetadata);
    Map<SchemaPath, TypeProtos.MajorType> intermediateFields = ParquetTableMetadataUtils.resolveIntermediateFields(parquetTableMetadata);

    // With no schema specified, every resolved field is added to a fresh schema;
    // otherwise only the fields missing from the specified schema are merged in.
    boolean schemaSpecified = this.schema != null;
    if (!schemaSpecified) {
        schema = new TupleSchema();
    }
    for (Map.Entry<SchemaPath, TypeProtos.MajorType> field : fields.entrySet()) {
        if (!schemaSpecified || SchemaPathUtils.getColumnMetadata(field.getKey(), schema) == null) {
            SchemaPathUtils.addColumnMetadata(schema, field.getKey(), field.getValue(), intermediateFields);
        }
    }

    Map<SchemaPath, ColumnStatistics<?>> columnsStatistics;
    if (collectMetadata) {
        // Falls back to row groups metadata when there is no files metadata.
        Collection<? extends BaseMetadata> metadata = getFilesMetadataMap().values();
        if (metadata.isEmpty()) {
            metadata = getRowGroupsMeta();
        }
        tableStatistics.add(new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(metadata), TableStatisticsKind.ROW_COUNT));
        columnsStatistics = TableMetadataUtils.mergeColumnsStatistics(metadata, fields.keySet(), PARQUET_COLUMN_STATISTICS);
    } else {
        columnsStatistics = new HashMap<>();
        tableStatistics.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount(), TableStatisticsKind.ROW_COUNT));

        // Columns known to the stats table that are absent from the resolved fields.
        Set<SchemaPath> unhandledColumns = new HashSet<>();
        if (statsTable != null && statsTable.isMaterialized()) {
            unhandledColumns.addAll(statsTable.getColumns());
        }

        for (Map.Entry<SchemaPath, TypeProtos.MajorType> field : fields.entrySet()) {
            SchemaPath columnPath = field.getKey();
            long columnValueCount = getParquetGroupScanStatistics().getColumnValueCount(columnPath);
            // Starts from the estimated statistics, if any are available for the column
            List<StatisticsHolder<?>> stats = new ArrayList<>(DrillStatsTable.getEstimatedColumnStats(statsTable, columnPath));
            unhandledColumns.remove(columnPath);

            // Adds statistics for partition columns.
            // NOTE(review): if getColumnValueCount can return a "no stats" marker, the nulls
            // count computed below would not be meaningful - confirm against its contract.
            stats.add(new StatisticsHolder<>(columnValueCount, TableStatisticsKind.ROW_COUNT));
            stats.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount() - columnValueCount, ColumnStatisticsKind.NULLS_COUNT));
            columnsStatistics.put(columnPath, new ColumnStatistics<>(stats, field.getValue().getMinorType()));
        }

        for (SchemaPath column : unhandledColumns) {
            columnsStatistics.put(column, new ColumnStatistics<>(DrillStatsTable.getEstimatedColumnStats(statsTable, column)));
        }
    }

    MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.TABLE).build();
    tableMetadata = BaseTableMetadata.builder()
        .tableInfo(TableInfo.UNKNOWN_TABLE_INFO)
        .metadataInfo(metadataInfo)
        .location(tableLocation)
        .schema(schema)
        .columnsStatistics(columnsStatistics)
        .metadataStatistics(tableStatistics)
        .partitionKeys(Collections.emptyMap())
        .build();
    return tableMetadata;
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) ArrayList(java.util.ArrayList) TupleSchema(org.apache.drill.exec.record.metadata.TupleSchema) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) HashSet(java.util.HashSet)

Aggregations

ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics)40 SchemaPath (org.apache.drill.common.expression.SchemaPath)39 Path (org.apache.hadoop.fs.Path)30 BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata)29 StatisticsHolder (org.apache.drill.metastore.statistics.StatisticsHolder)27 MetastoreTableInfo (org.apache.drill.metastore.components.tables.MetastoreTableInfo)26 TableInfo (org.apache.drill.metastore.metadata.TableInfo)26 HashMap (java.util.HashMap)25 MetastoreTest (org.apache.drill.categories.MetastoreTest)21 ClusterTest (org.apache.drill.test.ClusterTest)21 Test (org.junit.Test)21 File (java.io.File)20 SlowTest (org.apache.drill.categories.SlowTest)20 TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata)20 FileMetadata (org.apache.drill.metastore.metadata.FileMetadata)17 ArrayList (java.util.ArrayList)15 SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata)14 CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString)14 RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata)12 Map (java.util.Map)11