Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
In class ParquetTableMetadataUtils, method getFileMetadata:
/**
 * Returns a {@link FileMetadata} instance obtained by merging the specified collection
 * of {@link RowGroupMetadata}.
 *
 * @param rowGroups collection of {@link RowGroupMetadata} to be merged
 * @return merged {@link FileMetadata} instance, or {@code null} if the collection is empty
 */
public static FileMetadata getFileMetadata(Collection<RowGroupMetadata> rowGroups) {
  if (rowGroups.isEmpty()) {
    return null;
  }
  List<StatisticsHolder<?>> fileStatistics = new ArrayList<>();
  fileStatistics.add(new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(rowGroups), TableStatisticsKind.ROW_COUNT));
  RowGroupMetadata rowGroupMetadata = rowGroups.iterator().next();
  TupleMetadata schema = rowGroupMetadata.getSchema();
  Set<SchemaPath> columns = rowGroupMetadata.getColumnsStatistics().keySet();
  MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.FILE).build();
  return FileMetadata.builder()
      .tableInfo(rowGroupMetadata.getTableInfo())
      .metadataInfo(metadataInfo)
      .path(rowGroupMetadata.getPath())
      .schema(schema)
      .columnsStatistics(TableMetadataUtils.mergeColumnsStatistics(rowGroups, columns, PARQUET_COLUMN_STATISTICS))
      .metadataStatistics(fileStatistics)
      .build();
}
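For orientation, a minimal usage sketch follows; it is not taken from Drill's sources. The row group collection is assumed to be supplied by the caller (for example, parsed from Parquet footers elsewhere in the scan), and only the getFileMetadata call itself comes from the snippet above.

// Minimal usage sketch (assumption: the caller already has RowGroupMetadata instances).
static FileMetadata mergeRowGroupsIntoFile(Collection<RowGroupMetadata> rowGroups) {
  // getFileMetadata returns null for an empty collection rather than an empty FileMetadata,
  // so callers should be prepared to handle that case.
  return ParquetTableMetadataUtils.getFileMetadata(rowGroups);
}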
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
In class BaseParquetMetadataProvider, method getPartitionsMetadata:
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
  if (partitions == null) {
    partitions = new ArrayList<>();
    if (collectMetadata) {
      // Group collected file metadata by (partition column, partition value).
      Table<SchemaPath, Object, List<FileMetadata>> colValFile = HashBasedTable.create();
      Collection<FileMetadata> filesMetadata = getFilesMetadataMap().values();
      partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();
      for (FileMetadata fileMetadata : filesMetadata) {
        for (SchemaPath partitionColumn : partitionColumns) {
          Object partitionValue = getParquetGroupScanStatistics().getPartitionValue(fileMetadata.getPath(), partitionColumn);
          // Guava Table cannot contain null values, so a sentinel is stored instead.
          partitionValue = partitionValue == null ? NULL_VALUE : partitionValue;
          List<FileMetadata> partitionFiles = colValFile.get(partitionColumn, partitionValue);
          if (partitionFiles == null) {
            partitionFiles = new ArrayList<>();
            colValFile.put(partitionColumn, partitionValue, partitionFiles);
          }
          partitionFiles.add(fileMetadata);
        }
      }
      for (SchemaPath logicalExpressions : colValFile.rowKeySet()) {
        for (List<FileMetadata> partValues : colValFile.row(logicalExpressions).values()) {
          partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(logicalExpressions, partValues));
        }
      }
    } else {
      // Without collected file metadata, derive partitions from the group scan statistics.
      for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
        Map<Path, Object> partitionPaths = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);
        Multimap<Object, Path> partitionsForValue = HashMultimap.create();
        partitionPaths.forEach((path, value) -> partitionsForValue.put(value, path));
        partitionsForValue.asMap().forEach((partitionKey, value) -> {
          Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
          List<StatisticsHolder<?>> statistics = new ArrayList<>();
          partitionKey = partitionKey == NULL_VALUE ? null : partitionKey;
          statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MIN_VALUE));
          statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MAX_VALUE));
          statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
          statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));
          columnsStatistics.put(partitionColumn,
              new ColumnStatistics<>(statistics, getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));
          MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
          TableMetadata tableMetadata = getTableMetadata();
          PartitionMetadata partitionMetadata = PartitionMetadata.builder()
              .tableInfo(tableMetadata.getTableInfo())
              .metadataInfo(metadataInfo)
              .column(partitionColumn)
              .schema(tableMetadata.getSchema())
              .columnsStatistics(columnsStatistics)
              .metadataStatistics(statistics)
              .partitionValues(Collections.emptyList())
              .locations(new HashSet<>(value))
              .build();
          partitions.add(partitionMetadata);
        });
      }
    }
  }
  return partitions;
}
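The else branch above relies on a small Guava idiom: the per-file map of partition values returned by getPartitionPaths(partitionColumn) is inverted into a Multimap keyed by partition value, so that all files sharing a value become one PartitionMetadata. A standalone sketch of that idiom, with types simplified to String purely for illustration:

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import java.util.Map;

public class PartitionGroupingSketch {
  public static void main(String[] args) {
    // Per-file partition values, analogous to getPartitionPaths(partitionColumn) above.
    Map<String, String> partitionPaths = Map.of(
        "/data/t/f1.parquet", "2021",
        "/data/t/f2.parquet", "2021",
        "/data/t/f3.parquet", "2022");
    // Invert the map: partition value -> all paths carrying that value.
    Multimap<String, String> partitionsForValue = HashMultimap.create();
    partitionPaths.forEach((path, value) -> partitionsForValue.put(value, path));
    // Each (value, paths) entry corresponds to one PartitionMetadata in the method above.
    partitionsForValue.asMap().forEach((value, paths) ->
        System.out.println(value + " -> " + paths));
  }
}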
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
In class BaseParquetMetadataProvider, method getTableMetadata:
@Override
public TableMetadata getTableMetadata() {
  if (tableMetadata == null) {
    List<StatisticsHolder<?>> tableStatistics = new ArrayList<>(DrillStatsTable.getEstimatedTableStats(statsTable));
    Map<SchemaPath, TypeProtos.MajorType> fields = ParquetTableMetadataUtils.resolveFields(parquetTableMetadata);
    Map<SchemaPath, TypeProtos.MajorType> intermediateFields = ParquetTableMetadataUtils.resolveIntermediateFields(parquetTableMetadata);
    if (this.schema == null) {
      schema = new TupleSchema();
      fields.forEach((schemaPath, majorType) -> SchemaPathUtils.addColumnMetadata(schema, schemaPath, majorType, intermediateFields));
    } else {
      // Merge the specified schema with the schema resolved from the table files.
      fields.forEach((schemaPath, majorType) -> {
        if (SchemaPathUtils.getColumnMetadata(schemaPath, schema) == null) {
          SchemaPathUtils.addColumnMetadata(schema, schemaPath, majorType, intermediateFields);
        }
      });
    }
    Map<SchemaPath, ColumnStatistics<?>> columnsStatistics;
    if (collectMetadata) {
      // Merge statistics from the collected file (or row group) metadata.
      Collection<? extends BaseMetadata> metadata = getFilesMetadataMap().values();
      if (metadata.isEmpty()) {
        metadata = getRowGroupsMeta();
      }
      tableStatistics.add(new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.mergeStatistics(metadata), TableStatisticsKind.ROW_COUNT));
      columnsStatistics = TableMetadataUtils.mergeColumnsStatistics(metadata, fields.keySet(), PARQUET_COLUMN_STATISTICS);
    } else {
      // Derive statistics from the group scan and, if materialized, the stats table.
      columnsStatistics = new HashMap<>();
      tableStatistics.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount(), TableStatisticsKind.ROW_COUNT));
      Set<SchemaPath> unhandledColumns = new HashSet<>();
      if (statsTable != null && statsTable.isMaterialized()) {
        unhandledColumns.addAll(statsTable.getColumns());
      }
      fields.forEach((columnPath, value) -> {
        long columnValueCount = getParquetGroupScanStatistics().getColumnValueCount(columnPath);
        // Add the column's estimated statistics if they are available in the stats table.
        List<StatisticsHolder<?>> stats = new ArrayList<>(DrillStatsTable.getEstimatedColumnStats(statsTable, columnPath));
        unhandledColumns.remove(columnPath);
        // Add row count and null count derived from the group scan statistics.
        stats.add(new StatisticsHolder<>(columnValueCount, TableStatisticsKind.ROW_COUNT));
        stats.add(new StatisticsHolder<>(getParquetGroupScanStatistics().getRowCount() - columnValueCount, ColumnStatisticsKind.NULLS_COUNT));
        columnsStatistics.put(columnPath, new ColumnStatistics<>(stats, value.getMinorType()));
      });
      // Columns present only in the stats table get their estimated statistics as-is.
      for (SchemaPath column : unhandledColumns) {
        columnsStatistics.put(column, new ColumnStatistics<>(DrillStatsTable.getEstimatedColumnStats(statsTable, column)));
      }
    }
    MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.TABLE).build();
    tableMetadata = BaseTableMetadata.builder()
        .tableInfo(TableInfo.UNKNOWN_TABLE_INFO)
        .metadataInfo(metadataInfo)
        .location(tableLocation)
        .schema(schema)
        .columnsStatistics(columnsStatistics)
        .metadataStatistics(tableStatistics)
        .partitionKeys(Collections.emptyMap())
        .build();
  }
  return tableMetadata;
}
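Across the three snippets, the same pattern ties them together: every metadata level is tagged with a MetadataInfo whose MetadataType names that level. A minimal sketch, using only builder calls that already appear above:

MetadataInfo fileInfo = MetadataInfo.builder().type(MetadataType.FILE).build();
MetadataInfo partitionInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
MetadataInfo tableInfo = MetadataInfo.builder().type(MetadataType.TABLE).build();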