use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class ParquetGroupScanStatistics method collect.
public void collect(Collection<T> metadataList) {
  resetHolders();
  boolean first = true;
  for (T metadata : metadataList) {
    long localRowCount = TableStatisticsKind.ROW_COUNT.getValue(metadata);
    for (Map.Entry<SchemaPath, ColumnStatistics<?>> columnsStatistics : metadata.getColumnsStatistics().entrySet()) {
      SchemaPath schemaPath = columnsStatistics.getKey();
      ColumnStatistics<?> statistics = columnsStatistics.getValue();
      MutableLong emptyCount = new MutableLong();
      MutableLong previousCount = columnValueCounts.putIfAbsent(schemaPath, emptyCount);
      if (previousCount == null) {
        previousCount = emptyCount;
      }
      Long nullsNum = ColumnStatisticsKind.NULLS_COUNT.getFrom(statistics);
      if (previousCount.longValue() != Statistic.NO_COLUMN_STATS && nullsNum != null && nullsNum != Statistic.NO_COLUMN_STATS) {
        previousCount.add(localRowCount - nullsNum);
      } else {
        previousCount.setValue(Statistic.NO_COLUMN_STATS);
      }
      ColumnMetadata columnMetadata = SchemaPathUtils.getColumnMetadata(schemaPath, metadata.getSchema());
      // DRILL-7934
      // Based on metastore/metastore-api/src/main/java/org/apache/drill/metastore/util/SchemaPathUtils.java#145,
      // list schemas are skipped, so Drill cannot resolve a majorType for such a schemaPath here.
      // Treat a null type as "not a partition column" to avoid a NullPointerException.
      TypeProtos.MajorType majorType = columnMetadata != null ? columnMetadata.majorType() : null;
      boolean partitionColumn = majorType != null && checkForPartitionColumn(statistics, first, localRowCount, majorType, schemaPath);
      if (partitionColumn) {
        Object value = partitionValueMap.get(metadata.getPath(), schemaPath);
        Object currentValue = ColumnStatisticsKind.MAX_VALUE.getFrom(statistics);
        if (value != null && value != BaseParquetMetadataProvider.NULL_VALUE) {
          if (value != currentValue) {
            partitionColTypeMap.remove(schemaPath);
          }
        } else {
          // Check whether the column really contains only nulls and, if so, record the null marker.
          if (localRowCount == ColumnStatisticsKind.NULLS_COUNT.getFrom(statistics)) {
            partitionValueMap.put(metadata.getPath(), schemaPath, BaseParquetMetadataProvider.NULL_VALUE);
          } else {
            partitionValueMap.put(metadata.getPath(), schemaPath, currentValue);
          }
        }
      } else {
        partitionColTypeMap.remove(schemaPath);
      }
    }
    this.rowCount += localRowCount;
    first = false;
  }
}
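The value-count accumulation in collect() can be shown in isolation. The following standalone sketch uses plain Java with String column names, Long counters, and a -1 sentinel standing in for Drill's SchemaPath keys, MutableLong values, and Statistic.NO_COLUMN_STATS; it is an illustration of the pattern, not Drill code. Per column, the non-null count (rowCount - nullsCount) is summed across metadata units, and any unit lacking null statistics collapses the running total to the sentinel.

import java.util.HashMap;
import java.util.Map;

public class ValueCountSketch {
  // Placeholder sentinel standing in for Statistic.NO_COLUMN_STATS.
  private static final long NO_STATS = -1;

  public static void main(String[] args) {
    Map<String, Long> columnValueCounts = new HashMap<>();
    accumulate(columnValueCounts, "a", 100, 10L);   // 100 rows, 10 nulls -> 90 non-null values
    accumulate(columnValueCounts, "a", 50, 0L);     // +50 -> 140
    accumulate(columnValueCounts, "b", 100, null);  // missing null statistics -> sentinel
    System.out.println(columnValueCounts);          // {a=140, b=-1}
  }

  // Mirrors the branch in collect(): add (rowCount - nullsCount) while statistics are usable,
  // otherwise pin the count to the "no statistics" sentinel.
  static void accumulate(Map<String, Long> counts, String column, long rowCount, Long nullsCount) {
    long previous = counts.getOrDefault(column, 0L);
    if (previous != NO_STATS && nullsCount != null && nullsCount != NO_STATS) {
      counts.put(column, previous + (rowCount - nullsCount));
    } else {
      counts.put(column, NO_STATS);
    }
  }
}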
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class ParquetTableMetadataUtils method getRowGroupColumnStatistics.
/**
 * Converts the specified {@link MetadataBase.RowGroupMetadata} into a map of {@link ColumnStatistics}
 * instances keyed by column name.
 *
 * @param tableMetadata the source of column types
 * @param rowGroupMetadata metadata to convert
 * @return map with converted row group metadata
 */
public static Map<SchemaPath, ColumnStatistics<?>> getRowGroupColumnStatistics(MetadataBase.ParquetTableMetadataBase tableMetadata, MetadataBase.RowGroupMetadata rowGroupMetadata) {
  Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
  for (MetadataBase.ColumnMetadata column : rowGroupMetadata.getColumns()) {
    SchemaPath colPath = SchemaPath.getCompoundPath(column.getName());
    Long nulls = column.getNulls();
    if (hasInvalidStatistics(column, tableMetadata)) {
      nulls = Statistic.NO_COLUMN_STATS;
    }
    PrimitiveType.PrimitiveTypeName primitiveType = getPrimitiveTypeName(tableMetadata, column);
    OriginalType originalType = getOriginalType(tableMetadata, column);
    TypeProtos.MinorType type = ParquetReaderUtility.getMinorType(primitiveType, originalType);
    List<StatisticsHolder<?>> statistics = new ArrayList<>();
    statistics.add(new StatisticsHolder<>(getValue(column.getMinValue(), primitiveType, originalType), ColumnStatisticsKind.MIN_VALUE));
    statistics.add(new StatisticsHolder<>(getValue(column.getMaxValue(), primitiveType, originalType), ColumnStatisticsKind.MAX_VALUE));
    statistics.add(new StatisticsHolder<>(nulls, ColumnStatisticsKind.NULLS_COUNT));
    columnsStatistics.put(colPath, new ColumnStatistics<>(statistics, type));
  }
  return columnsStatistics;
}
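A hedged usage sketch: the helper below converts a single row group and reads back only the statistics kinds populated above (MIN_VALUE, MAX_VALUE, NULLS_COUNT). The method name logRowGroupStats is illustrative and not part of Drill, and the tableMetadata and rowGroupMetadata arguments are assumed to come from the caller's existing metadata context.

static void logRowGroupStats(MetadataBase.ParquetTableMetadataBase tableMetadata,
                             MetadataBase.RowGroupMetadata rowGroupMetadata) {
  Map<SchemaPath, ColumnStatistics<?>> stats =
      ParquetTableMetadataUtils.getRowGroupColumnStatistics(tableMetadata, rowGroupMetadata);
  for (Map.Entry<SchemaPath, ColumnStatistics<?>> entry : stats.entrySet()) {
    ColumnStatistics<?> columnStats = entry.getValue();
    Object min = ColumnStatisticsKind.MIN_VALUE.getFrom(columnStats);
    Object max = ColumnStatisticsKind.MAX_VALUE.getFrom(columnStats);
    Long nulls = ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStats);
    // A nulls value equal to Statistic.NO_COLUMN_STATS marks statistics rejected as invalid above.
    System.out.println(entry.getKey() + ": min=" + min + ", max=" + max + ", nulls=" + nulls);
  }
}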
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class ParquetTableMetadataUtils method getNonInterestingColumnsMeta.
/**
 * Returns metadata for the non-interesting columns.
 *
 * @param parquetTableMetadata the source of column metadata for the non-interesting columns' statistics
 * @return the non-interesting columns' metadata
 */
public static NonInterestingColumnsMetadata getNonInterestingColumnsMeta(MetadataBase.ParquetTableMetadataBase parquetTableMetadata) {
  Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
  if (parquetTableMetadata instanceof Metadata_V4.ParquetTableMetadata_v4) {
    Map<Metadata_V4.ColumnTypeMetadata_v4.Key, Metadata_V4.ColumnTypeMetadata_v4> columnTypeInfoMap = ((Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata).getColumnTypeInfoMap();
    if (columnTypeInfoMap == null) {
      return new NonInterestingColumnsMetadata(columnsStatistics);
    }
    for (Metadata_V4.ColumnTypeMetadata_v4 columnTypeMetadata : columnTypeInfoMap.values()) {
      if (!columnTypeMetadata.isInteresting) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnTypeMetadata.name);
        List<StatisticsHolder<?>> statistics = new ArrayList<>();
        statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
        PrimitiveType.PrimitiveTypeName primitiveType = columnTypeMetadata.primitiveType;
        OriginalType originalType = columnTypeMetadata.originalType;
        TypeProtos.MinorType type = ParquetReaderUtility.getMinorType(primitiveType, originalType);
        columnsStatistics.put(schemaPath, new ColumnStatistics<>(statistics, type));
      }
    }
    return new NonInterestingColumnsMetadata(columnsStatistics);
  }
  return new NonInterestingColumnsMetadata(columnsStatistics);
}
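The selection logic above reduces to a flag filter. A minimal standalone sketch, assuming a plain boolean flag map and a -1 sentinel as simplified stand-ins for ColumnTypeMetadata_v4.isInteresting and Statistic.NO_COLUMN_STATS:

import java.util.LinkedHashMap;
import java.util.Map;

public class NonInterestingColumnsSketch {
  // Placeholder sentinel standing in for Statistic.NO_COLUMN_STATS.
  static final long NO_STATS = -1;

  public static void main(String[] args) {
    // Column name -> "interesting" flag, mimicking the V4 columnTypeInfo map.
    Map<String, Boolean> columnTypeInfo = new LinkedHashMap<>();
    columnTypeInfo.put("id", true);
    columnTypeInfo.put("payload", false);
    columnTypeInfo.put("audit_ts", false);

    // Only columns that are not interesting receive a placeholder statistics entry
    // whose null count is the "no statistics" sentinel.
    Map<String, Long> nonInterestingNullCounts = new LinkedHashMap<>();
    columnTypeInfo.forEach((name, interesting) -> {
      if (!interesting) {
        nonInterestingNullCounts.put(name, NO_STATS);
      }
    });
    System.out.println(nonInterestingNullCounts); // {payload=-1, audit_ts=-1}
  }
}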
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class BaseParquetMetadataProvider method getPartitionsMetadata.
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
  if (partitions == null) {
    partitions = new ArrayList<>();
    if (collectMetadata) {
      Table<SchemaPath, Object, List<FileMetadata>> colValFile = HashBasedTable.create();
      Collection<FileMetadata> filesMetadata = getFilesMetadataMap().values();
      partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();
      for (FileMetadata fileMetadata : filesMetadata) {
        for (SchemaPath partitionColumn : partitionColumns) {
          Object partitionValue = getParquetGroupScanStatistics().getPartitionValue(fileMetadata.getPath(), partitionColumn);
          // Table cannot contain nulls
          partitionValue = partitionValue == null ? NULL_VALUE : partitionValue;
          List<FileMetadata> partitionFiles = colValFile.get(partitionColumn, partitionValue);
          if (partitionFiles == null) {
            partitionFiles = new ArrayList<>();
            colValFile.put(partitionColumn, partitionValue, partitionFiles);
          }
          partitionFiles.add(fileMetadata);
        }
      }
      for (SchemaPath logicalExpressions : colValFile.rowKeySet()) {
        for (List<FileMetadata> partValues : colValFile.row(logicalExpressions).values()) {
          partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(logicalExpressions, partValues));
        }
      }
    } else {
      for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
        Map<Path, Object> partitionPaths = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);
        Multimap<Object, Path> partitionsForValue = HashMultimap.create();
        partitionPaths.forEach((path, value) -> partitionsForValue.put(value, path));
        partitionsForValue.asMap().forEach((partitionKey, value) -> {
          Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
          List<StatisticsHolder<?>> statistics = new ArrayList<>();
          partitionKey = partitionKey == NULL_VALUE ? null : partitionKey;
          statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MIN_VALUE));
          statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MAX_VALUE));
          statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
          statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));
          columnsStatistics.put(partitionColumn, new ColumnStatistics<>(statistics, getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));
          MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
          TableMetadata tableMetadata = getTableMetadata();
          PartitionMetadata partitionMetadata = PartitionMetadata.builder()
              .tableInfo(tableMetadata.getTableInfo())
              .metadataInfo(metadataInfo)
              .column(partitionColumn)
              .schema(tableMetadata.getSchema())
              .columnsStatistics(columnsStatistics)
              .metadataStatistics(statistics)
              .partitionValues(Collections.emptyList())
              .locations(new HashSet<>(value))
              .build();
          partitions.add(partitionMetadata);
        });
      }
    }
  }
  return partitions;
}
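The grouping step in the else-branch above can be sketched in isolation. This standalone example uses plain Java: String paths, a NULL_MARKER object standing in for BaseParquetMetadataProvider.NULL_VALUE, and an ordinary map in place of Guava's Multimap. Paths that share the same partition value end up in one partition.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class PartitionGroupingSketch {
  // Placeholder standing in for BaseParquetMetadataProvider.NULL_VALUE, since the
  // grouping map cannot hold null keys directly.
  private static final Object NULL_MARKER = new Object();

  public static void main(String[] args) {
    Map<String, Object> partitionPaths = new LinkedHashMap<>();
    partitionPaths.put("/data/dir0=2020/file1.parquet", 2020);
    partitionPaths.put("/data/dir0=2020/file2.parquet", 2020);
    partitionPaths.put("/data/dir0=2021/file3.parquet", 2021);
    partitionPaths.put("/data/unknown/file4.parquet", NULL_MARKER);

    // Invert path -> value into value -> paths, so files sharing a value form one partition.
    Map<Object, List<String>> partitionsForValue = new LinkedHashMap<>();
    partitionPaths.forEach((path, value) ->
        partitionsForValue.computeIfAbsent(value, v -> new ArrayList<>()).add(path));

    partitionsForValue.forEach((value, paths) -> {
      Object partitionKey = value == NULL_MARKER ? null : value;
      // The real code builds PartitionMetadata here, with MIN_VALUE/MAX_VALUE both set to
      // partitionKey and NULLS_COUNT/ROW_COUNT set to the "no statistics" sentinel.
      System.out.println("partition value " + partitionKey + " -> " + paths);
    });
  }
}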
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class BaseParquetMetadataProvider method getTableMetadata.
@Override
public TableMetadata getTableMetadata() {
  if (tableMetadata == null) {
    List<StatisticsHolder<?>> tableStatistics = new ArrayList<>(DrillStatsTable.getEstimatedTableStats(statsTable));
    Map<SchemaPath, TypeProtos.MajorType> fields = ParquetTableMetadataUtils.resolveFields(parquetTableMetadata);
    Map<SchemaPath, TypeProtos.MajorType> intermediateFields = ParquetTableMetadataUtils.resolveIntermediateFields(parquetTableMetadata);
    if (this.schema == null) {
      schema = new TupleSchema();
      fields.forEach((schemaPath, majorType) -> SchemaPathUtils.addColumnMetadata(schema, schemaPath, majorType, intermediateFields));
    } else {
      // merges the specified schema with the schema from the table
      fields.forEach((schemaPath, majorType) -> {
        if (SchemaPathUtils.getColumnMetadata(schemaPath, schema) == null) {
          SchemaPathUtils.addColumnMetadata(schema, schemaPath, majorType, intermediateFields);
        }
      });
    }
    Map<SchemaPath, ColumnStatistics<?>> columnsStatistics;
    if (collectMetadata) {
      Collection<? extends BaseMetadata> metadata = getFilesMetadataMap().values();
      if (metadata.isEmpty()) {
        metadata = getRowGroupsMeta();
      }
      tableStatistics.add(new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(metadata), TableStatisticsKind.ROW_COUNT));
      columnsStatistics = TableMetadataUtils.mergeColumnsStatistics(metadata, fields.keySet(), PARQUET_COLUMN_STATISTICS);
    } else {
      columnsStatistics = new HashMap<>();
      tableStatistics.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount(), TableStatisticsKind.ROW_COUNT));
      Set<SchemaPath> unhandledColumns = new HashSet<>();
      if (statsTable != null && statsTable.isMaterialized()) {
        unhandledColumns.addAll(statsTable.getColumns());
      }
      fields.forEach((columnPath, value) -> {
        long columnValueCount = getParquetGroupScanStatistics().getColumnValueCount(columnPath);
        // Adds the statistics values themselves if statistics are available
        List<StatisticsHolder<?>> stats = new ArrayList<>(DrillStatsTable.getEstimatedColumnStats(statsTable, columnPath));
        unhandledColumns.remove(columnPath);
        // adds statistics for partition columns
        stats.add(new StatisticsHolder<>(columnValueCount, TableStatisticsKind.ROW_COUNT));
        stats.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount() - columnValueCount, ColumnStatisticsKind.NULLS_COUNT));
        columnsStatistics.put(columnPath, new ColumnStatistics<>(stats, value.getMinorType()));
      });
      for (SchemaPath column : unhandledColumns) {
        columnsStatistics.put(column, new ColumnStatistics<>(DrillStatsTable.getEstimatedColumnStats(statsTable, column)));
      }
    }
    MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.TABLE).build();
    tableMetadata = BaseTableMetadata.builder()
        .tableInfo(TableInfo.UNKNOWN_TABLE_INFO)
        .metadataInfo(metadataInfo)
        .location(tableLocation)
        .schema(schema)
        .columnsStatistics(columnsStatistics)
        .metadataStatistics(tableStatistics)
        .partitionKeys(Collections.emptyMap())
        .build();
  }
  return tableMetadata;
}
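In the non-collectMetadata branch above, per-column null counts are not read from file metadata but derived from the group scan statistics as tableRowCount - columnValueCount. A standalone sketch of that derivation, with -1 as a placeholder for Statistic.NO_COLUMN_STATS marking columns whose derived value should not be trusted:

import java.util.HashMap;
import java.util.Map;

public class TableNullCountSketch {
  // Placeholder sentinel standing in for Statistic.NO_COLUMN_STATS.
  static final long NO_STATS = -1;

  public static void main(String[] args) {
    long tableRowCount = 1000;
    Map<String, Long> columnValueCounts = new HashMap<>();
    columnValueCounts.put("a", 1000L);     // no nulls
    columnValueCounts.put("b", 750L);      // 250 nulls
    columnValueCounts.put("c", NO_STATS);  // statistics unavailable

    columnValueCounts.forEach((column, valueCount) -> {
      if (valueCount == NO_STATS) {
        System.out.println(column + ": null count unknown (no column statistics)");
      } else {
        long nulls = tableRowCount - valueCount;
        System.out.println(column + ": " + nulls + " nulls out of " + tableRowCount + " rows");
      }
    });
  }
}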