Usage of org.apache.drill.metastore.metadata.RowGroupMetadata in the Apache Drill project.
Class AbstractParquetGroupScan, method applyLimit.
// filter push down methods block end
// limit push down methods start
/**
 * Applies LIMIT push down by pruning the row groups (and the files that own
 * them) down to the minimal set needed to produce {@code maxRecords} rows.
 *
 * @param maxRecords maximum number of records required by the query; values
 *                   below 1 are clamped to 1 so at least one row group survives
 * @return a new group scan built over the pruned row groups, or {@code null}
 *         if the limit cannot reduce the number of row groups
 */
@Override
public GroupScan applyLimit(int maxRecords) {
  // Make sure we request at least 1 row -> 1 rowGroup.
  maxRecords = Math.max(maxRecords, 1);

  if (getTableMetadata() != null) {
    long tableRowCount = TableStatisticsKind.ROW_COUNT.getValue(getTableMetadata());
    // NO_COLUMN_STATS means the row count is unknown, so pruning would be unsafe.
    if (tableRowCount == Statistic.NO_COLUMN_STATS || tableRowCount <= maxRecords) {
      logger.debug("limit push down does not apply, since total number of rows [{}] is less or equal to the required [{}].", tableRowCount, maxRecords);
      return null;
    }
  }

  List<RowGroupMetadata> qualifiedRowGroups = limitMetadata(getRowGroupsMetadata().values(), maxRecords);
  if (qualifiedRowGroups == null || getRowGroupsMetadata().size() == qualifiedRowGroups.size()) {
    logger.debug("limit push down does not apply, since number of row groups was not reduced.");
    return null;
  }

  // Collect the file metadata of every file that still owns at least one
  // qualified row group. Several row groups typically map to the same file,
  // so toMap() needs an explicit merge function — without it, the duplicate
  // key would make Collectors.toMap() throw IllegalStateException.
  Map<Path, FileMetadata> filesMetadata = getFilesMetadata();
  Map<Path, FileMetadata> qualifiedFiles = qualifiedRowGroups.stream()
      .map(rowGroup -> filesMetadata.get(rowGroup.getPath()))
      .filter(Objects::nonNull)
      .collect(Collectors.toMap(FileMetadata::getPath, Function.identity(), (first, second) -> first));

  // Re-group the surviving row groups by their file path.
  Multimap<Path, RowGroupMetadata> prunedRowGroups = LinkedListMultimap.create();
  for (RowGroupMetadata qualifiedRowGroup : qualifiedRowGroups) {
    prunedRowGroups.put(qualifiedRowGroup.getPath(), qualifiedRowGroup);
  }

  return getFilterer()
      .rowGroups(prunedRowGroups)
      .table(tableMetadata)
      .partitions(partitions)
      .segments(segments)
      .files(qualifiedFiles)
      .nonInterestingColumns(nonInterestingColumnsMetadata)
      .matching(matchAllMetadata)
      .build();
}
Usage of org.apache.drill.metastore.metadata.RowGroupMetadata in the Apache Drill project.
Class AbstractParquetGroupScan, method modifyFileSelection.
// limit push down methods end
// helper method used for partition pruning and filter push down
/**
 * Narrows this scan's cached metadata (row groups, files, partitions,
 * segments and the table row count) to the files in the given selection.
 * Used by partition pruning and filter push down.
 *
 * @param selection the file selection to restrict this scan to
 */
@Override
public void modifyFileSelection(FileSelection selection) {
  super.modifyFileSelection(selection);

  List<Path> selectedFiles = selection.getFiles();
  fileSet = new HashSet<>(selectedFiles);
  entries = new ArrayList<>(selectedFiles.size());
  for (Path selectedFile : selectedFiles) {
    entries.add(new ReadEntryWithPath(selectedFile));
  }

  // Retain only the row groups that belong to one of the selected files,
  // preserving their original order.
  Multimap<Path, RowGroupMetadata> retainedRowGroups = LinkedListMultimap.create();
  if (!getRowGroupsMetadata().isEmpty()) {
    for (Map.Entry<Path, RowGroupMetadata> rowGroupEntry : getRowGroupsMetadata().entries()) {
      if (fileSet.contains(rowGroupEntry.getKey())) {
        retainedRowGroups.put(rowGroupEntry.getKey(), rowGroupEntry.getValue());
      }
    }
  }
  this.rowGroups = retainedRowGroups;

  // Row count must be recomputed AFTER the row groups were narrowed above.
  tableMetadata = TableMetadataUtils.updateRowCount(getTableMetadata(), getRowGroupsMetadata().values());

  if (getFilesMetadata().isEmpty()) {
    this.files = Collections.emptyMap();
  } else {
    Map<Path, FileMetadata> retainedFiles = new HashMap<>();
    for (Map.Entry<Path, FileMetadata> fileEntry : getFilesMetadata().entrySet()) {
      if (fileSet.contains(fileEntry.getKey())) {
        retainedFiles.put(fileEntry.getKey(), fileEntry.getValue());
      }
    }
    this.files = retainedFiles;
  }

  // A partition survives if any of its locations is among the selected files.
  List<PartitionMetadata> retainedPartitions = new ArrayList<>();
  if (!getPartitionsMetadata().isEmpty()) {
    for (PartitionMetadata partition : getPartitionsMetadata()) {
      for (Path partitionLocation : partition.getLocations()) {
        if (fileSet.contains(partitionLocation)) {
          retainedPartitions.add(partition);
          break;
        }
      }
    }
  }
  partitions = retainedPartitions;

  // NOTE: unlike files, segments are intentionally left untouched when empty.
  if (!getSegmentsMetadata().isEmpty()) {
    this.segments = getSegmentsMetadata().entrySet().stream()
        .filter(segmentEntry -> fileSet.contains(segmentEntry.getKey()))
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
  }

  // Invalidate the cached row group infos; they will be rebuilt on demand.
  rowGroupInfos = null;
}
Usage of org.apache.drill.metastore.metadata.RowGroupMetadata in the Apache Drill project.
Class FilterEvaluatorUtils, method evalFilter.
/**
 * Evaluates a filter expression against the statistics of a single row group
 * taken from the given Parquet footer.
 *
 * @param expr            the filter expression to evaluate
 * @param footer          Parquet table metadata holding the row groups
 * @param rowGroupIndex   index of the row group to evaluate against
 * @param options         option manager used to resolve implicit columns
 * @param fragmentContext fragment context supplying the function registry
 * @return whether the row group's rows match the filter
 */
@SuppressWarnings("RedundantTypeArguments")
public static RowsMatch evalFilter(LogicalExpression expr, MetadataBase.ParquetTableMetadataBase footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext) {
  // Explicit type arguments work around javac bug JDK-8066974.
  Set<SchemaPath> referencedPaths =
      expr.<Set<SchemaPath>, Void, RuntimeException>accept(FilterEvaluatorUtils.FieldReferenceFinder.INSTANCE, null);
  List<SchemaPath> schemaPathsInExpr = new ArrayList<>(referencedPaths);

  RowGroupMetadata rowGroupMetadata =
      new ArrayList<>(ParquetTableMetadataUtils.getRowGroupsMetadata(footer).values()).get(rowGroupIndex);
  NonInterestingColumnsMetadata nonInterestingColumnsMetadata =
      ParquetTableMetadataUtils.getNonInterestingColumnsMeta(footer);

  Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = rowGroupMetadata.getColumnsStatistics();
  // Merge in statistics of non-interesting columns, if there are any.
  columnsStatistics.putAll(nonInterestingColumnsMetadata.getColumnsStatistics());
  columnsStatistics = ParquetTableMetadataUtils.addImplicitColumnsStatistics(
      columnsStatistics, schemaPathsInExpr, Collections.emptyList(), options, rowGroupMetadata.getPath(), true);

  return matches(expr, columnsStatistics, rowGroupMetadata.getSchema(),
      TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata),
      fragmentContext, fragmentContext.getFunctionRegistry(), new HashSet<>(schemaPathsInExpr));
}
Usage of org.apache.drill.metastore.metadata.RowGroupMetadata in the Apache Drill project.
Class BasicTablesTransformer, method all.
/**
 * Partitions the given metadata units by their metadata type and materializes
 * each one into its concrete metadata class.
 *
 * <p>Units with an unknown or unsupported metadata type are silently skipped.
 *
 * @param units metadata units to transform
 * @return holder with the materialized tables, segments, files, row groups
 *         and partitions
 */
public static MetadataHolder all(List<TableMetadataUnit> units) {
  List<BaseTableMetadata> tables = new ArrayList<>();
  List<SegmentMetadata> segments = new ArrayList<>();
  List<FileMetadata> files = new ArrayList<>();
  List<RowGroupMetadata> rowGroups = new ArrayList<>();
  List<PartitionMetadata> partitions = new ArrayList<>();

  for (TableMetadataUnit unit : units) {
    MetadataType type = MetadataType.fromValue(unit.metadataType());
    if (type == null) {
      // Unknown type string — nothing to materialize.
      continue;
    }
    if (type == MetadataType.TABLE) {
      tables.add(BaseTableMetadata.builder().metadataUnit(unit).build());
    } else if (type == MetadataType.SEGMENT) {
      segments.add(SegmentMetadata.builder().metadataUnit(unit).build());
    } else if (type == MetadataType.FILE) {
      files.add(FileMetadata.builder().metadataUnit(unit).build());
    } else if (type == MetadataType.ROW_GROUP) {
      rowGroups.add(RowGroupMetadata.builder().metadataUnit(unit).build());
    } else if (type == MetadataType.PARTITION) {
      partitions.add(PartitionMetadata.builder().metadataUnit(unit).build());
    }
    // Any other type is intentionally ignored.
  }

  return new MetadataHolder(tables, segments, files, rowGroups, partitions);
}
Usage of org.apache.drill.metastore.metadata.RowGroupMetadata in the Apache Drill project.
Class MetadataControllerBatch, method getRowGroupMetadata.
/**
 * Builds {@link RowGroupMetadata} from the current row of the incoming batch.
 *
 * @param reader             reader positioned on the row describing the row group
 * @param metadataStatistics metadata-level statistics for the row group
 * @param columnStatistics   per-column statistics for the row group
 * @param nestingLevel       nesting depth; {@code nestingLevel - 2} segment
 *                           columns are used as partition values
 * @return the assembled row group metadata
 */
private RowGroupMetadata getRowGroupMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics, Map<SchemaPath, ColumnStatistics<?>> columnStatistics, int nestingLevel) {
  List<String> segmentColumns = popConfig.getContext().segmentColumns();

  // The first segment column (when present) supplies the segment key.
  String segmentKey = segmentColumns.isEmpty()
      ? MetadataInfo.DEFAULT_SEGMENT_KEY
      : reader.column(segmentColumns.get(0)).scalar().getString();

  // Top (nestingLevel - 2) segment column values act as partition values.
  List<String> partitionValues = segmentColumns.stream()
      .limit(nestingLevel - 2)
      .map(columnName -> reader.column(columnName).scalar().getString())
      .collect(Collectors.toList());

  Path path = new Path(reader.column(MetastoreAnalyzeConstants.LOCATION_FIELD).scalar().getString());
  int rowGroupIndex = Integer.parseInt(reader.column(columnNamesOptions.rowGroupIndex()).scalar().getString());

  String metadataIdentifier = MetadataIdentifierUtils.getRowGroupMetadataIdentifier(partitionValues, path, rowGroupIndex);
  MetadataInfo metadataInfo = MetadataInfo.builder()
      .type(MetadataType.ROW_GROUP)
      .key(segmentKey)
      .identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null))
      .build();

  long lastModifiedTime = Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString());
  TupleMetadata schema = TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString());

  return RowGroupMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(metadataInfo)
      .columnsStatistics(columnStatistics)
      .metadataStatistics(metadataStatistics)
      .hostAffinity(Collections.emptyMap())
      .rowGroupIndex(rowGroupIndex)
      .path(path)
      .lastModifiedTime(lastModifiedTime)
      .schema(schema)
      .build();
}
Aggregations