use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class FilterEvaluatorUtils method evalFilter.
@SuppressWarnings("RedundantTypeArguments")
public static RowsMatch evalFilter(LogicalExpression expr, MetadataBase.ParquetTableMetadataBase footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext) {
// Specifies type arguments explicitly to avoid compilation error caused by JDK-8066974
List<SchemaPath> schemaPathsInExpr = new ArrayList<>(expr.<Set<SchemaPath>, Void, RuntimeException>accept(FilterEvaluatorUtils.FieldReferenceFinder.INSTANCE, null));
RowGroupMetadata rowGroupMetadata = new ArrayList<>(ParquetTableMetadataUtils.getRowGroupsMetadata(footer).values()).get(rowGroupIndex);
NonInterestingColumnsMetadata nonInterestingColumnsMetadata = ParquetTableMetadataUtils.getNonInterestingColumnsMeta(footer);
Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = rowGroupMetadata.getColumnsStatistics();
// Add column statistics of non-interesting columns if there are any
columnsStatistics.putAll(nonInterestingColumnsMetadata.getColumnsStatistics());
columnsStatistics = ParquetTableMetadataUtils.addImplicitColumnsStatistics(columnsStatistics, schemaPathsInExpr, Collections.emptyList(), options, rowGroupMetadata.getPath(), true);
return matches(expr, columnsStatistics, rowGroupMetadata.getSchema(), TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata), fragmentContext, fragmentContext.getFunctionRegistry(), new HashSet<>(schemaPathsInExpr));
}
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class ParquetTableMetadataUtils method getRowGroupMetadata.
/**
* Returns {@link RowGroupMetadata} instance converted from specified parquet {@code rowGroupMetadata}.
*
* @param tableMetadata table metadata which contains row group metadata to convert
* @param rowGroupMetadata row group metadata to convert
* @param rgIndexInFile index of current row group within the file
* @param location location of file with current row group
* @return {@link RowGroupMetadata} instance converted from specified parquet {@code rowGroupMetadata}
*/
public static RowGroupMetadata getRowGroupMetadata(MetadataBase.ParquetTableMetadataBase tableMetadata, MetadataBase.RowGroupMetadata rowGroupMetadata, int rgIndexInFile, Path location) {
Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = getRowGroupColumnStatistics(tableMetadata, rowGroupMetadata);
List<StatisticsHolder<?>> rowGroupStatistics = new ArrayList<>();
rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getRowCount(), TableStatisticsKind.ROW_COUNT));
rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getStart(), new BaseStatisticsKind<>(ExactStatisticsConstants.START, true)));
rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getLength(), new BaseStatisticsKind<>(ExactStatisticsConstants.LENGTH, true)));
Map<SchemaPath, TypeProtos.MajorType> columns = getRowGroupFields(tableMetadata, rowGroupMetadata);
Map<SchemaPath, TypeProtos.MajorType> intermediateColumns = getIntermediateFields(tableMetadata, rowGroupMetadata);
TupleMetadata schema = new TupleSchema();
columns.forEach((schemaPath, majorType) -> SchemaPathUtils.addColumnMetadata(schema, schemaPath, majorType, intermediateColumns));
MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.ROW_GROUP).build();
return RowGroupMetadata.builder().tableInfo(TableInfo.UNKNOWN_TABLE_INFO).metadataInfo(metadataInfo).schema(schema).columnsStatistics(columnsStatistics).metadataStatistics(rowGroupStatistics).hostAffinity(rowGroupMetadata.getHostAffinity()).rowGroupIndex(rgIndexInFile).path(location).build();
}
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class MetadataControllerBatch method getRowGroupMetadata.
private RowGroupMetadata getRowGroupMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics, Map<SchemaPath, ColumnStatistics<?>> columnStatistics, int nestingLevel) {
List<String> segmentColumns = popConfig.getContext().segmentColumns();
String segmentKey = segmentColumns.size() > 0 ? reader.column(segmentColumns.iterator().next()).scalar().getString() : MetadataInfo.DEFAULT_SEGMENT_KEY;
List<String> partitionValues = segmentColumns.stream().limit(nestingLevel - 2).map(columnName -> reader.column(columnName).scalar().getString()).collect(Collectors.toList());
Path path = new Path(reader.column(MetastoreAnalyzeConstants.LOCATION_FIELD).scalar().getString());
int rowGroupIndex = Integer.parseInt(reader.column(columnNamesOptions.rowGroupIndex()).scalar().getString());
String metadataIdentifier = MetadataIdentifierUtils.getRowGroupMetadataIdentifier(partitionValues, path, rowGroupIndex);
MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.ROW_GROUP).key(segmentKey).identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null)).build();
return RowGroupMetadata.builder().tableInfo(tableInfo).metadataInfo(metadataInfo).columnsStatistics(columnStatistics).metadataStatistics(metadataStatistics).hostAffinity(Collections.emptyMap()).rowGroupIndex(rowGroupIndex).path(path).lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString())).schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString())).build();
}
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class MetadataControllerBatch method getSegmentMetadata.
private SegmentMetadata getSegmentMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics, Map<SchemaPath, ColumnStatistics<?>> columnStatistics, int nestingLevel) {
List<String> segmentColumns = popConfig.getContext().segmentColumns();
String segmentKey = segmentColumns.size() > 0 ? reader.column(segmentColumns.iterator().next()).scalar().getString() : MetadataInfo.DEFAULT_SEGMENT_KEY;
// and therefore all values should be used when forming metadata identifier
if (popConfig.getContext().multiValueSegments()) {
nestingLevel = segmentColumns.size();
}
List<String> allPartitionValues = segmentColumns.stream().limit(nestingLevel).map(columnName -> reader.column(columnName).scalar().getString()).collect(Collectors.toList());
String metadataIdentifier = MetadataIdentifierUtils.getMetadataIdentifierKey(allPartitionValues);
MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.SEGMENT).key(segmentKey).identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null)).build();
int segmentLevel = nestingLevel - 1;
// for the case of multi-value segments, there is no nesting,
// so all partition column values should be used
List<String> partitionValues = popConfig.getContext().multiValueSegments() ? allPartitionValues : Collections.singletonList(allPartitionValues.get(segmentLevel));
return SegmentMetadata.builder().tableInfo(tableInfo).metadataInfo(metadataInfo).columnsStatistics(columnStatistics).metadataStatistics(metadataStatistics).path(new Path(reader.column(MetastoreAnalyzeConstants.LOCATION_FIELD).scalar().getString())).locations(getIncomingLocations(reader)).column(segmentColumns.size() > 0 ? SchemaPath.getSimplePath(segmentColumns.get(segmentLevel)) : null).partitionValues(partitionValues).lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString())).schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString())).build();
}
use of org.apache.drill.metastore.statistics.ColumnStatistics in project drill by apache.
the class MetadataControllerBatch method getPartitionMetadata.
private PartitionMetadata getPartitionMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics, Map<SchemaPath, ColumnStatistics<?>> columnStatistics, int nestingLevel) {
List<String> segmentColumns = popConfig.getContext().segmentColumns();
String segmentKey = segmentColumns.size() > 0 ? reader.column(segmentColumns.iterator().next()).scalar().getString() : MetadataInfo.DEFAULT_SEGMENT_KEY;
List<String> partitionValues = segmentColumns.stream().limit(nestingLevel).map(columnName -> reader.column(columnName).scalar().getString()).collect(Collectors.toList());
String metadataIdentifier = MetadataIdentifierUtils.getMetadataIdentifierKey(partitionValues);
MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).key(segmentKey).identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null)).build();
return PartitionMetadata.builder().tableInfo(tableInfo).metadataInfo(metadataInfo).columnsStatistics(columnStatistics).metadataStatistics(metadataStatistics).locations(getIncomingLocations(reader)).lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString())).schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString())).build();
}
Aggregations