use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class MetadataControllerBatch method getMetadataStatistics.
/**
 * Collects the metadata-level statistics found in the current row.
 *
 * <p>Columns recognized by {@link AnalyzeColumnUtils#isMetadataStatisticsField} are wrapped
 * together with the statistics kind derived from the column name, whatever value the reader
 * returns. Otherwise, a non-null column whose name equals the configured row group start or
 * row group length column is parsed as a {@code long} and stored under the
 * {@code ExactStatisticsConstants.START} / {@code LENGTH} kind respectively.
 * All other columns are ignored.
 *
 * @param reader         reader positioned on the row to inspect
 * @param columnMetadata schema whose columns are iterated
 * @return list of collected statistics holders, in schema iteration order
 */
private List<StatisticsHolder<?>> getMetadataStatistics(TupleReader reader, TupleMetadata columnMetadata) {
  final String rowGroupStartColumn = columnNamesOptions.rowGroupStart();
  final String rowGroupLengthColumn = columnNamesOptions.rowGroupLength();
  final List<StatisticsHolder<?>> holders = new ArrayList<>();

  for (ColumnMetadata field : columnMetadata) {
    final String name = field.name();
    final ObjectReader valueReader = reader.column(name);

    // Metadata statistics columns are taken as-is, even when the value is null.
    if (AnalyzeColumnUtils.isMetadataStatisticsField(name)) {
      holders.add(new StatisticsHolder<>(valueReader.getObject(), AnalyzeColumnUtils.getStatisticsKind(name)));
      continue;
    }

    // Remaining columns are only of interest when they actually carry a value.
    if (valueReader.isNull()) {
      continue;
    }

    if (name.equals(rowGroupStartColumn)) {
      long start = Long.parseLong(valueReader.scalar().getString());
      holders.add(new StatisticsHolder<>(start, new BaseStatisticsKind<>(ExactStatisticsConstants.START, true)));
    } else if (name.equals(rowGroupLengthColumn)) {
      long length = Long.parseLong(valueReader.scalar().getString());
      holders.add(new StatisticsHolder<>(length, new BaseStatisticsKind<>(ExactStatisticsConstants.LENGTH, true)));
    }
  }

  return holders;
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class MetadataControllerBatch method getTableMetadata.
/**
 * Builds table-level metadata from the current row.
 *
 * <p>The supplied metadata statistics are copied and extended with the analyze metadata level
 * taken from the operator config. When the {@code PlannerSettings.STATISTICS_USE} option is
 * enabled, the result is additionally cloned with column statistics and estimated table
 * statistics obtained from the statistics collector.
 *
 * @param reader             reader positioned on the table row; supplies last modified time and schema
 * @param metadataStatistics metadata-level statistics collected for the row (not modified)
 * @param columnStatistics   per-column statistics collected for the row
 * @return table metadata instance
 */
private BaseTableMetadata getTableMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics, Map<SchemaPath, ColumnStatistics<?>> columnStatistics) {
  // Work on a copy so the caller's list stays untouched.
  List<StatisticsHolder<?>> tableLevelStats = new ArrayList<>(metadataStatistics);
  tableLevelStats.add(new StatisticsHolder<>(popConfig.getContext().analyzeMetadataLevel(), TableStatisticsKind.ANALYZE_METADATA_LEVEL));

  MetadataInfo generalInfo = MetadataInfo.builder()
      .type(MetadataType.TABLE)
      .key(MetadataInfo.GENERAL_INFO_KEY)
      .build();

  BaseTableMetadata result = BaseTableMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(generalInfo)
      .columnsStatistics(columnStatistics)
      .metadataStatistics(tableLevelStats)
      .partitionKeys(Collections.emptyMap())
      .interestingColumns(popConfig.getContext().interestingColumns())
      .location(popConfig.getContext().location())
      .lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString()))
      .schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString()))
      .build();

  if (!context.getOptions().getOption(PlannerSettings.STATISTICS_USE)) {
    return result;
  }

  // Statistics usage is enabled: enrich the metadata with the collected statistics.
  DrillStatsTable collectedStats = new DrillStatsTable(statisticsCollector.getStatistics());
  Map<SchemaPath, ColumnStatistics<?>> statsByColumn = ParquetTableMetadataUtils.getColumnStatistics(result.getSchema(), collectedStats);
  return result.cloneWithStats(statsByColumn, DrillStatsTable.getEstimatedTableStats(collectedStats));
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class MetadataControllerBatch method getMetadataUnits.
/**
 * Converts the current row, and any rows nested in its {@code collected_map} column,
 * into metadata units.
 *
 * <p>Nested rows are processed first (recursively, with {@code nestingLevel + 1}), so their
 * units precede the unit produced for the current row in the returned list.
 *
 * @param reader       reader positioned on the row to convert
 * @param nestingLevel nesting depth of the current row, forwarded to the per-type converters
 * @return metadata units for the nested rows followed by the unit for the current row
 * @throws IllegalStateException         if the {@code collected_map} column is present but is not an array
 * @throws UnsupportedOperationException if the row's metadata type is not handled
 */
private List<TableMetadataUnit> getMetadataUnits(TupleReader reader, int nestingLevel) {
  final List<TableMetadataUnit> units = new ArrayList<>();
  final TupleMetadata rowSchema = reader.tupleSchema();

  final ObjectReader typeReader = reader.column(MetastoreAnalyzeConstants.METADATA_TYPE);
  Preconditions.checkNotNull(typeReader, "metadataType column wasn't found");

  final ObjectReader nestedReader = reader.column(MetastoreAnalyzeConstants.COLLECTED_MAP_FIELD);
  if (nestedReader != null) {
    if (!nestedReader.schema().isArray()) {
      throw new IllegalStateException("Incoming vector with name `collected_map` should be repeated map");
    }
    // current row contains information about underlying metadata
    ArrayReader nestedRows = nestedReader.array();
    while (nestedRows.next()) {
      units.addAll(getMetadataUnits(nestedRows.tuple(), nestingLevel + 1));
    }
  }

  final List<StatisticsHolder<?>> metadataStatistics = getMetadataStatistics(reader, rowSchema);

  // Take the value of the first ROW_COUNT holder, if any; it stays null when absent.
  Long rowCount = null;
  for (StatisticsHolder<?> holder : metadataStatistics) {
    if (holder.getStatisticsKind() == TableStatisticsKind.ROW_COUNT) {
      rowCount = (Long) holder.getStatisticsValue();
      break;
    }
  }

  final Map<SchemaPath, ColumnStatistics<?>> columnStatistics = getColumnStatistics(reader, rowSchema, rowCount);
  final MetadataType type = MetadataType.valueOf(typeReader.scalar().getString());

  final BaseMetadata metadata;
  switch (type) {
    case TABLE:
      metadata = getTableMetadata(reader, metadataStatistics, columnStatistics);
      break;
    case SEGMENT:
      metadata = getSegmentMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
      break;
    case PARTITION:
      metadata = getPartitionMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
      break;
    case FILE:
      metadata = getFileMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
      break;
    case ROW_GROUP:
      metadata = getRowGroupMetadata(reader, metadataStatistics, columnStatistics, nestingLevel);
      break;
    default:
      throw new UnsupportedOperationException("Unsupported metadata type: " + type);
  }

  units.add(metadata.toMetadataUnit());
  return units;
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class MetadataControllerBatch method getColumnStatistics.
/**
 * Collects per-column statistics from the current row.
 *
 * <p>Every schema column recognized by {@link AnalyzeColumnUtils#isColumnStatisticsField} is
 * converted into a statistics holder and grouped under the column name it describes. The
 * column type is remembered from the first MIN_VALUE or MAX_VALUE statistics column seen for
 * a field. When {@code rowCount} is known, a NULLS_COUNT statistic is derived for each field
 * that has a non-null NON_NULL_VALUES_COUNT value.
 *
 * @param reader         reader positioned on the row to inspect
 * @param columnMetadata schema whose columns are iterated
 * @param rowCount       row count for the current row, or {@code null} if unknown
 * @return column statistics keyed by the parsed schema path of the described column
 */
private Map<SchemaPath, ColumnStatistics<?>> getColumnStatistics(TupleReader reader, TupleMetadata columnMetadata, Long rowCount) {
  final Multimap<String, StatisticsHolder<?>> holdersByField = ArrayListMultimap.create();
  final Map<String, TypeProtos.MinorType> typesByField = new HashMap<>();

  for (ColumnMetadata field : columnMetadata) {
    final String rawName = field.name();
    if (!AnalyzeColumnUtils.isColumnStatisticsField(rawName)) {
      continue;
    }

    final String fieldName = AnalyzeColumnUtils.getColumnName(rawName);
    final StatisticsKind<?> kind = AnalyzeColumnUtils.getStatisticsKind(rawName);
    holdersByField.put(fieldName, new StatisticsHolder<>(getConvertedColumnValue(reader.column(rawName)), kind));

    // The type of a min/max statistics column is recorded as the type of the described field.
    if (kind.getName().equalsIgnoreCase(ColumnStatisticsKind.MIN_VALUE.getName())
        || kind.getName().equalsIgnoreCase(ColumnStatisticsKind.MAX_VALUE.getName())) {
      typesByField.putIfAbsent(fieldName, field.type());
    }
  }

  // Derives NULLS_COUNT (row count minus non-null count) so it can be used during filter pushdown.
  if (rowCount != null) {
    // Collected separately first: the multimap must not be modified while it is being iterated.
    final Map<String, StatisticsHolder<?>> nullsCounts = new HashMap<>();
    holdersByField.asMap().forEach((name, holders) -> {
      for (StatisticsHolder<?> holder : holders) {
        if (holder.getStatisticsKind() == ColumnStatisticsKind.NON_NULL_VALUES_COUNT) {
          Long nonNullCount = (Long) holder.getStatisticsValue();
          if (nonNullCount != null) {
            nullsCounts.put(name, new StatisticsHolder<>(rowCount - nonNullCount, ColumnStatisticsKind.NULLS_COUNT));
          }
          // Only the first matching holder is considered.
          break;
        }
      }
    });
    for (Map.Entry<String, StatisticsHolder<?>> entry : nullsCounts.entrySet()) {
      holdersByField.put(entry.getKey(), entry.getValue());
    }
  }

  final Map<SchemaPath, ColumnStatistics<?>> result = new HashMap<>();
  holdersByField.asMap().forEach((name, holders) -> {
    result.put(SchemaPath.parseFromString(name), new ColumnStatistics<>(holders, typesByField.get(name)));
  });
  return result;
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class ParquetTableMetadataUtils method getRowGroupColumnStatistics.
/**
 * Builds a {@link ColumnStatistics} instance for every column of the specified
 * {@link MetadataBase.RowGroupMetadata} and returns them keyed by column path.
 *
 * <p>Each entry holds the column's min value, max value and nulls count. The nulls count is
 * replaced with {@code Statistic.NO_COLUMN_STATS} when {@code hasInvalidStatistics} reports
 * the column statistics as invalid.
 *
 * @param tableMetadata    the source of column types
 * @param rowGroupMetadata metadata to convert
 * @return map with converted row group metadata
 */
public static Map<SchemaPath, ColumnStatistics<?>> getRowGroupColumnStatistics(MetadataBase.ParquetTableMetadataBase tableMetadata, MetadataBase.RowGroupMetadata rowGroupMetadata) {
  final Map<SchemaPath, ColumnStatistics<?>> statsByPath = new HashMap<>();

  for (MetadataBase.ColumnMetadata columnMeta : rowGroupMetadata.getColumns()) {
    final SchemaPath path = SchemaPath.getCompoundPath(columnMeta.getName());

    Long nullsCount = columnMeta.getNulls();
    if (hasInvalidStatistics(columnMeta, tableMetadata)) {
      nullsCount = Statistic.NO_COLUMN_STATS;
    }

    // Resolve the Parquet types once; they drive both value conversion and the minor type.
    final PrimitiveType.PrimitiveTypeName primitive = getPrimitiveTypeName(tableMetadata, columnMeta);
    final OriginalType original = getOriginalType(tableMetadata, columnMeta);
    final TypeProtos.MinorType minorType = ParquetReaderUtility.getMinorType(primitive, original);

    final List<StatisticsHolder<?>> holders = new ArrayList<>();
    holders.add(new StatisticsHolder<>(getValue(columnMeta.getMinValue(), primitive, original), ColumnStatisticsKind.MIN_VALUE));
    holders.add(new StatisticsHolder<>(getValue(columnMeta.getMaxValue(), primitive, original), ColumnStatisticsKind.MAX_VALUE));
    holders.add(new StatisticsHolder<>(nullsCount, ColumnStatisticsKind.NULLS_COUNT));

    statsByPath.put(path, new ColumnStatistics<>(holders, minorType));
  }

  return statsByPath;
}
Aggregations