Use of org.apache.drill.metastore.statistics.StatisticsKind in the Apache Drill project.
Class MetadataControllerBatch, method getColumnStatistics.
/**
 * Builds per-column statistics from the statistics fields described by {@code columnMetadata},
 * reading each field value from {@code reader}.
 *
 * @param reader         reader the statistics field values are taken from
 * @param columnMetadata metadata of the fields to inspect; fields which are not
 *                       column statistics fields are skipped
 * @param rowCount       row count used to derive {@code NULLS_COUNT} for columns that have
 *                       {@code NON_NULL_VALUES_COUNT}; when {@code null}, nothing is derived
 * @return column statistics keyed by the schema path parsed from the column name
 */
private Map<SchemaPath, ColumnStatistics<?>> getColumnStatistics(TupleReader reader, TupleMetadata columnMetadata, Long rowCount) {
  Multimap<String, StatisticsHolder<?>> holdersByColumn = ArrayListMultimap.create();
  Map<String, TypeProtos.MinorType> typesByColumn = new HashMap<>();

  for (ColumnMetadata field : columnMetadata) {
    String statisticsFieldName = field.name();
    if (!AnalyzeColumnUtils.isColumnStatisticsField(statisticsFieldName)) {
      continue;
    }
    String columnName = AnalyzeColumnUtils.getColumnName(statisticsFieldName);
    StatisticsKind<?> kind = AnalyzeColumnUtils.getStatisticsKind(statisticsFieldName);
    holdersByColumn.put(columnName, new StatisticsHolder<>(getConvertedColumnValue(reader.column(statisticsFieldName)), kind));

    // the column type is taken from the first min / max value field seen for the column
    String kindName = kind.getName();
    boolean isMinOrMax = kindName.equalsIgnoreCase(ColumnStatisticsKind.MIN_VALUE.getName())
        || kindName.equalsIgnoreCase(ColumnStatisticsKind.MAX_VALUE.getName());
    if (isMinOrMax) {
      typesByColumn.putIfAbsent(columnName, field.type());
    }
  }

  // derives NULLS_COUNT as (rowCount - NON_NULL_VALUES_COUNT) to use it during filter pushdown
  if (rowCount != null) {
    // collected separately so that the multimap is not modified while it is being iterated
    Map<String, StatisticsHolder<?>> derivedNullsCounts = new HashMap<>();
    for (String columnName : holdersByColumn.keySet()) {
      for (StatisticsHolder<?> holder : holdersByColumn.get(columnName)) {
        if (holder.getStatisticsKind() != ColumnStatisticsKind.NON_NULL_VALUES_COUNT) {
          continue;
        }
        Long nonNullCount = (Long) holder.getStatisticsValue();
        if (nonNullCount != null) {
          derivedNullsCounts.put(columnName, new StatisticsHolder<>(rowCount - nonNullCount, ColumnStatisticsKind.NULLS_COUNT));
        }
        // only the first matching holder of the column is considered
        break;
      }
    }
    for (Map.Entry<String, StatisticsHolder<?>> derived : derivedNullsCounts.entrySet()) {
      holdersByColumn.put(derived.getKey(), derived.getValue());
    }
  }

  Map<SchemaPath, ColumnStatistics<?>> statisticsByPath = new HashMap<>();
  for (String columnName : holdersByColumn.keySet()) {
    statisticsByPath.put(SchemaPath.parseFromString(columnName),
        new ColumnStatistics<>(holdersByColumn.get(columnName), typesByColumn.get(columnName)));
  }
  return statisticsByPath;
}
Use of org.apache.drill.metastore.statistics.StatisticsKind in the Apache Drill project.
Class ConvertMetadataAggregateToDirectScanRule, method populateRecords.
/**
 * Builds a direct group scan whose records are filled from row group metadata of the scanned
 * parquet table: one record is produced per row group entry of the metadata provider.
 * <p>
 * Each record holds partition values, file location, row group index, column statistics,
 * row group statistics, schema, row group start / length and file modification time.
 * Classes of non-null statistics values are registered in {@code schema} when absent.
 *
 * @param interestingColumns columns to collect statistics for; when {@code null}, the columns
 *                           of the first processed row group are used for all row groups
 * @param schema             field name to field class mapping; updated with classes of collected statistics
 * @param scan               scan rel whose group scan is a {@link ParquetGroupScan}
 * @param columnNamesOptions provides names of the implicit / internal fields
 * @return direct group scan over the collected records, or {@code null} if statistics of an
 *         interesting non-array column are missing or a collected field has no class in {@code schema}
 * @throws IOException declared for the file system calls made here
 */
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns, Map<String, Class<?>> schema, DrillScanRel scan, ColumnNamesOptions columnNamesOptions) throws IOException {
  ParquetGroupScan groupScan = (ParquetGroupScan) scan.getGroupScan();
  DrillTable scannedTable = Utilities.getDrillTable(scan.getTable());
  Multimap<Path, RowGroupMetadata> rowGroups = groupScan.getMetadataProvider().getRowGroupsMetadataMap();
  // first key is the field name, second key is the record index
  Table<String, Integer, Object> cells = HashBasedTable.create();
  FormatSelection formatSelection = (FormatSelection) scannedTable.getSelection();
  List<String> partitionNames = ColumnExplorer.getPartitionColumnNames(formatSelection.getSelection(), columnNamesOptions);
  FileSystem rawFs = formatSelection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
  DrillFileSystem fs = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), rawFs.getConf());

  int recordCount = 0;
  for (Map.Entry<Path, RowGroupMetadata> entry : rowGroups.entries()) {
    Path filePath = entry.getKey();
    RowGroupMetadata rowGroup = entry.getValue();

    // partition values
    List<String> partitionValues = ColumnExplorer.listPartitionValues(filePath, formatSelection.getSelection().getSelectionRoot(), false);
    for (int idx = 0; idx < partitionValues.size(); idx++) {
      cells.put(partitionNames.get(idx), recordCount, partitionValues.get(idx));
    }

    cells.put(MetastoreAnalyzeConstants.LOCATION_FIELD, recordCount, ImplicitFileColumns.FQN.getValue(filePath));
    cells.put(columnNamesOptions.rowGroupIndex(), recordCount, String.valueOf(rowGroup.getRowGroupIndex()));

    if (interestingColumns == null) {
      interestingColumns = rowGroup.getColumnsStatistics().keySet();
    }

    // row group column statistics
    for (SchemaPath column : interestingColumns) {
      ColumnStatistics<?> columnStats = rowGroup.getColumnsStatistics().get(column);
      // array columns are skipped since statistics for them is not supported by Metastore
      if (containsArrayColumn(rowGroup.getSchema(), column)) {
        continue;
      }
      if (IsPredicate.isNullOrEmpty(columnStats)) {
        logger.debug("Statistics for {} column wasn't found within {} row group.", column, filePath);
        return null;
      }
      for (StatisticsKind<?> kind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
        String kindName = kind.getName();
        Object value;
        if (kindName.equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
          value = TableStatisticsKind.ROW_COUNT.getValue(rowGroup);
        } else if (kindName.equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
          // non-null count is computed as row count minus nulls count
          value = TableStatisticsKind.ROW_COUNT.getValue(rowGroup) - ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStats);
        } else {
          value = columnStats.get(kind);
        }
        putStatisticsValue(cells, schema, AnalyzeColumnUtils.getColumnStatisticsFieldName(column.toExpr(), kind), recordCount, value);
      }
    }

    // row group level statistics
    for (StatisticsKind<?> kind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
      String fieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(kind);
      putStatisticsValue(cells, schema, fieldName, recordCount, rowGroup.getStatistic(kind));
    }

    // internal columns
    cells.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, recordCount, rowGroup.getSchema().jsonString());
    cells.put(columnNamesOptions.rowGroupStart(), recordCount, Long.toString(rowGroup.getStatistic(() -> ExactStatisticsConstants.START)));
    cells.put(columnNamesOptions.rowGroupLength(), recordCount, Long.toString(rowGroup.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
    cells.put(columnNamesOptions.lastModifiedTime(), recordCount, String.valueOf(fs.getFileStatus(filePath).getModificationTime()));

    recordCount++;
  }

  // DynamicPojoRecordReader requires LinkedHashMap with fields order
  // which corresponds to the value position in record list.
  LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
  for (String fieldName : cells.rowKeySet()) {
    Class<?> fieldClass = schema.get(fieldName);
    if (fieldClass == null) {
      // a collected field without a known class cannot be exposed through the reader
      return null;
    }
    orderedSchema.put(fieldName, fieldClass);
  }

  // builds a single record: values follow the order of orderedSchema, NULL_VALUE markers become nulls
  IntFunction<List<Object>> toRecord = recordIndex -> orderedSchema.keySet().stream()
      .map(name -> {
        Object cell = cells.get(name, recordIndex);
        return cell == BaseParquetMetadataProvider.NULL_VALUE ? null : cell;
      })
      .collect(Collectors.toList());
  List<List<Object>> records = IntStream.range(0, recordCount)
      .mapToObj(toRecord)
      .collect(Collectors.toList());

  DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
  ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
  return new DirectGroupScan(reader, scanStats);
}

/**
 * Stores a statistics value for the given field and record. A non-null value is stored as is
 * and its class is registered in {@code schema} if the field has no class yet;
 * for a {@code null} value the {@code NULL_VALUE} marker is stored and {@code schema} is left untouched.
 */
private void putStatisticsValue(Table<String, Integer, Object> cells, Map<String, Class<?>> schema, String fieldName, int recordIndex, Object value) {
  if (value == null) {
    cells.put(fieldName, recordIndex, BaseParquetMetadataProvider.NULL_VALUE);
    return;
  }
  schema.putIfAbsent(fieldName, value.getClass());
  cells.put(fieldName, recordIndex, value);
}
Aggregations