Example 1 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class AbstractParquetGroupScan, method applyLimit.

// filter push down methods block end
// limit push down methods start
@Override
public GroupScan applyLimit(int maxRecords) {
    // Make sure we request at least 1 row -> 1 row group.
    maxRecords = Math.max(maxRecords, 1);
    if (getTableMetadata() != null) {
        long tableRowCount = TableStatisticsKind.ROW_COUNT.getValue(getTableMetadata());
        if (tableRowCount == Statistic.NO_COLUMN_STATS || tableRowCount <= maxRecords) {
            logger.debug("limit push down does not apply, since total number of rows [{}] is less or equal to the required [{}].", tableRowCount, maxRecords);
            return null;
        }
    }
    List<RowGroupMetadata> qualifiedRowGroups = limitMetadata(getRowGroupsMetadata().values(), maxRecords);
    if (qualifiedRowGroups == null || getRowGroupsMetadata().size() == qualifiedRowGroups.size()) {
        logger.debug("limit push down does not apply, since number of row groups was not reduced.");
        return null;
    }
    Map<Path, FileMetadata> filesMetadata = getFilesMetadata();
    Map<Path, FileMetadata> qualifiedFiles = qualifiedRowGroups.stream()
        .map(rowGroup -> filesMetadata.get(rowGroup.getPath()))
        .filter(Objects::nonNull)
        .collect(Collectors.toMap(FileMetadata::getPath, Function.identity()));
    Multimap<Path, RowGroupMetadata> prunedRowGroups = LinkedListMultimap.create();
    for (RowGroupMetadata qualifiedRowGroup : qualifiedRowGroups) {
        prunedRowGroups.put(qualifiedRowGroup.getPath(), qualifiedRowGroup);
    }
    return getFilterer()
        .rowGroups(prunedRowGroups)
        .table(tableMetadata)
        .partitions(partitions)
        .segments(segments)
        .files(qualifiedFiles)
        .nonInterestingColumns(nonInterestingColumnsMetadata)
        .matching(matchAllMetadata)
        .build();
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)
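
The selection of qualifying row groups is delegated to limitMetadata(), inherited from AbstractGroupScanWithMetadata. As an illustration only, the following hypothetical standalone helper (not Drill's actual implementation) sketches the underlying idea: keep the shortest prefix of row groups whose combined row count covers the limit, and give up when a row count is unknown.

// Hypothetical helper sketching the idea behind limitMetadata(); not the actual Drill code.
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.drill.metastore.metadata.RowGroupMetadata;
import org.apache.drill.metastore.statistics.Statistic;
import org.apache.drill.metastore.statistics.TableStatisticsKind;

public class RowGroupLimitSketch {

    // Returns the shortest prefix of row groups whose cumulative row count reaches maxRecords,
    // or null when any row group lacks a row count (limit push down cannot be applied safely).
    public static List<RowGroupMetadata> limitRowGroups(Collection<RowGroupMetadata> rowGroups, int maxRecords) {
        List<RowGroupMetadata> qualified = new ArrayList<>();
        long remaining = maxRecords;
        for (RowGroupMetadata rowGroup : rowGroups) {
            long rowCount = TableStatisticsKind.ROW_COUNT.getValue(rowGroup);
            if (rowCount == Statistic.NO_COLUMN_STATS) {
                return null;
            }
            qualified.add(rowGroup);
            remaining -= rowCount;
            if (remaining <= 0) {
                break;
            }
        }
        return qualified;
    }
}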

Example 2 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class AbstractParquetGroupScan, method modifyFileSelection.

// limit push down methods end
// helper method used for partition pruning and filter push down
@Override
public void modifyFileSelection(FileSelection selection) {
    super.modifyFileSelection(selection);
    List<Path> files = selection.getFiles();
    fileSet = new HashSet<>(files);
    entries = new ArrayList<>(files.size());
    entries.addAll(files.stream().map(ReadEntryWithPath::new).collect(Collectors.toList()));
    Multimap<Path, RowGroupMetadata> newRowGroups = LinkedListMultimap.create();
    if (!getRowGroupsMetadata().isEmpty()) {
        getRowGroupsMetadata().entries().stream()
            .filter(entry -> fileSet.contains(entry.getKey()))
            .forEachOrdered(entry -> newRowGroups.put(entry.getKey(), entry.getValue()));
    }
    this.rowGroups = newRowGroups;
    tableMetadata = TableMetadataUtils.updateRowCount(getTableMetadata(), getRowGroupsMetadata().values());
    if (!getFilesMetadata().isEmpty()) {
        this.files = getFilesMetadata().entrySet().stream()
            .filter(entry -> fileSet.contains(entry.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    } else {
        this.files = Collections.emptyMap();
    }
    List<PartitionMetadata> newPartitions = new ArrayList<>();
    if (!getPartitionsMetadata().isEmpty()) {
        for (PartitionMetadata entry : getPartitionsMetadata()) {
            for (Path partLocation : entry.getLocations()) {
                if (fileSet.contains(partLocation)) {
                    newPartitions.add(entry);
                    break;
                }
            }
        }
    }
    partitions = newPartitions;
    if (!getSegmentsMetadata().isEmpty()) {
        this.segments = getSegmentsMetadata().entrySet().stream()
            .filter(entry -> fileSet.contains(entry.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }
    rowGroupInfos = null;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataType(org.apache.drill.metastore.metadata.MetadataType) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) LoggerFactory(org.slf4j.LoggerFactory) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) FunctionImplementationRegistry(org.apache.drill.exec.expr.fn.FunctionImplementationRegistry) ExpressionStringBuilder(org.apache.drill.common.expression.ExpressionStringBuilder) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) ListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap) UdfUtilities(org.apache.drill.exec.ops.UdfUtilities) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) Collection(java.util.Collection) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) Set(java.util.Set) LogicalExpression(org.apache.drill.common.expression.LogicalExpression) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) AbstractGroupScanWithMetadata(org.apache.drill.exec.physical.base.AbstractGroupScanWithMetadata) List(java.util.List) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) TableMetadataUtils(org.apache.drill.metastore.util.TableMetadataUtils) FilterPredicate(org.apache.drill.exec.expr.FilterPredicate) OptionManager(org.apache.drill.exec.server.options.OptionManager) HashMap(java.util.HashMap) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) CoordinationProtos(org.apache.drill.exec.proto.CoordinationProtos) AffinityCreator(org.apache.drill.exec.store.schedule.AffinityCreator) EndpointByteMapImpl(org.apache.drill.exec.store.schedule.EndpointByteMapImpl) CollectionUtils(org.apache.commons.collections.CollectionUtils) JsonIgnore(com.fasterxml.jackson.annotation.JsonIgnore) EndpointByteMap(org.apache.drill.exec.store.schedule.EndpointByteMap) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) MapUtils(org.apache.commons.collections.MapUtils) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) IOException(java.io.IOException) ParquetMetadataProvider(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProvider) LinkedListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) Statistic(org.apache.drill.metastore.statistics.Statistic) LocationProvider(org.apache.drill.metastore.metadata.LocationProvider) EndpointAffinity(org.apache.drill.exec.physical.EndpointAffinity) GroupScan(org.apache.drill.exec.physical.base.GroupScan) JsonInclude(com.fasterxml.jackson.annotation.JsonInclude) Collections(java.util.Collections) 
ParquetMetadataProviderBuilder(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProviderBuilder) AssignmentCreator(org.apache.drill.exec.store.schedule.AssignmentCreator) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ArrayList(java.util.ArrayList) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)
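
The essential step above is rebuilding the row-group multimap so that it only contains entries for files that survived the narrowed selection. A small hypothetical helper (names assumed, not part of Drill) isolating that pattern:

// Hypothetical helper isolating the pruning pattern used in modifyFileSelection().
import java.util.Set;

import org.apache.drill.metastore.metadata.RowGroupMetadata;
import org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap;
import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;
import org.apache.hadoop.fs.Path;

public class RowGroupPruningSketch {

    // Keeps only the row-group entries whose file path is still part of the selection.
    public static Multimap<Path, RowGroupMetadata> pruneByFiles(Multimap<Path, RowGroupMetadata> rowGroups,
                                                                Set<Path> selectedFiles) {
        Multimap<Path, RowGroupMetadata> pruned = LinkedListMultimap.create();
        rowGroups.entries().stream()
            .filter(entry -> selectedFiles.contains(entry.getKey()))
            .forEachOrdered(entry -> pruned.put(entry.getKey(), entry.getValue()));
        return pruned;
    }
}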

Example 3 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class FilterEvaluatorUtils, method evalFilter.

@SuppressWarnings("RedundantTypeArguments")
public static RowsMatch evalFilter(LogicalExpression expr, MetadataBase.ParquetTableMetadataBase footer, int rowGroupIndex, OptionManager options, FragmentContext fragmentContext) {
    // Specifies type arguments explicitly to avoid compilation error caused by JDK-8066974
    List<SchemaPath> schemaPathsInExpr = new ArrayList<>(
        expr.<Set<SchemaPath>, Void, RuntimeException>accept(FilterEvaluatorUtils.FieldReferenceFinder.INSTANCE, null));
    RowGroupMetadata rowGroupMetadata =
        new ArrayList<>(ParquetTableMetadataUtils.getRowGroupsMetadata(footer).values()).get(rowGroupIndex);
    NonInterestingColumnsMetadata nonInterestingColumnsMetadata = ParquetTableMetadataUtils.getNonInterestingColumnsMeta(footer);
    Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = rowGroupMetadata.getColumnsStatistics();
    // Add column statistics of non-interesting columns if there are any
    columnsStatistics.putAll(nonInterestingColumnsMetadata.getColumnsStatistics());
    columnsStatistics = ParquetTableMetadataUtils.addImplicitColumnsStatistics(
        columnsStatistics, schemaPathsInExpr, Collections.emptyList(), options, rowGroupMetadata.getPath(), true);
    return matches(expr, columnsStatistics, rowGroupMetadata.getSchema(),
        TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata),
        fragmentContext, fragmentContext.getFunctionRegistry(), new HashSet<>(schemaPathsInExpr));
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) NonInterestingColumnsMetadata(org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) ArrayList(java.util.ArrayList) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)
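
A typical caller uses the returned RowsMatch to decide whether a row group can be skipped. The following is a hypothetical sketch; the package locations of RowsMatch and FilterEvaluatorUtils, and the assumption that RowsMatch exposes a NONE constant, are assumptions rather than something shown on this page.

// Hypothetical caller sketch; class and package names are assumptions, not quoted from this page.
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.exec.expr.stat.RowsMatch;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.parquet.FilterEvaluatorUtils;
import org.apache.drill.exec.store.parquet.metadata.MetadataBase;

public class RowGroupFilterSketch {

    // A row group whose statistics prove that no rows can match the filter may be skipped entirely.
    public static boolean canSkipRowGroup(LogicalExpression filter, MetadataBase.ParquetTableMetadataBase footer,
                                          int rowGroupIndex, OptionManager options, FragmentContext context) {
        RowsMatch match = FilterEvaluatorUtils.evalFilter(filter, footer, rowGroupIndex, options, context);
        return match == RowsMatch.NONE;
    }
}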

Example 4 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class BasicTablesTransformer, method all.

public static MetadataHolder all(List<TableMetadataUnit> units) {
    List<BaseTableMetadata> tables = new ArrayList<>();
    List<SegmentMetadata> segments = new ArrayList<>();
    List<FileMetadata> files = new ArrayList<>();
    List<RowGroupMetadata> rowGroups = new ArrayList<>();
    List<PartitionMetadata> partitions = new ArrayList<>();
    for (TableMetadataUnit unit : units) {
        MetadataType metadataType = MetadataType.fromValue(unit.metadataType());
        if (metadataType == null) {
            continue;
        }
        switch(metadataType) {
            case TABLE:
                tables.add(BaseTableMetadata.builder().metadataUnit(unit).build());
                break;
            case SEGMENT:
                segments.add(SegmentMetadata.builder().metadataUnit(unit).build());
                break;
            case FILE:
                files.add(FileMetadata.builder().metadataUnit(unit).build());
                break;
            case ROW_GROUP:
                rowGroups.add(RowGroupMetadata.builder().metadataUnit(unit).build());
                break;
            case PARTITION:
                partitions.add(PartitionMetadata.builder().metadataUnit(unit).build());
                break;
            default:
                // Ignore unsupported type
                break;
        }
    }
    return new MetadataHolder(tables, segments, files, rowGroups, partitions);
}
Also used : ArrayList(java.util.ArrayList) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) MetadataType(org.apache.drill.metastore.metadata.MetadataType) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata)
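
A hedged usage sketch: given a flat list of TableMetadataUnit records (for example, as read back from the Metastore), the transformer turns them into typed metadata collections again. The rowGroups() accessor on MetadataHolder is an assumption here, not something shown on this page.

// Hypothetical usage sketch; MetadataHolder accessor names are assumptions.
import java.util.List;

import org.apache.drill.metastore.components.tables.BasicTablesTransformer;
import org.apache.drill.metastore.components.tables.TableMetadataUnit;
import org.apache.drill.metastore.metadata.RowGroupMetadata;

public class TransformerUsageSketch {

    // Categorizes generic metadata units and returns only the row-group entries.
    public static List<RowGroupMetadata> rowGroupsOf(List<TableMetadataUnit> units) {
        BasicTablesTransformer.MetadataHolder holder = BasicTablesTransformer.all(units);
        return holder.rowGroups();
    }
}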

Example 5 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class MetadataControllerBatch, method getRowGroupMetadata.

private RowGroupMetadata getRowGroupMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics, Map<SchemaPath, ColumnStatistics<?>> columnStatistics, int nestingLevel) {
    List<String> segmentColumns = popConfig.getContext().segmentColumns();
    String segmentKey = segmentColumns.size() > 0 ? reader.column(segmentColumns.iterator().next()).scalar().getString() : MetadataInfo.DEFAULT_SEGMENT_KEY;
    List<String> partitionValues = segmentColumns.stream()
        .limit(nestingLevel - 2)
        .map(columnName -> reader.column(columnName).scalar().getString())
        .collect(Collectors.toList());
    Path path = new Path(reader.column(MetastoreAnalyzeConstants.LOCATION_FIELD).scalar().getString());
    int rowGroupIndex = Integer.parseInt(reader.column(columnNamesOptions.rowGroupIndex()).scalar().getString());
    String metadataIdentifier = MetadataIdentifierUtils.getRowGroupMetadataIdentifier(partitionValues, path, rowGroupIndex);
    MetadataInfo metadataInfo = MetadataInfo.builder()
        .type(MetadataType.ROW_GROUP)
        .key(segmentKey)
        .identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null))
        .build();
    return RowGroupMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(metadataInfo)
        .columnsStatistics(columnStatistics)
        .metadataStatistics(metadataStatistics)
        .hostAffinity(Collections.emptyMap())
        .rowGroupIndex(rowGroupIndex)
        .path(path)
        .lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString()))
        .schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString()))
        .build();
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) BaseStatisticsKind(org.apache.drill.metastore.statistics.BaseStatisticsKind) MetastoreColumn(org.apache.drill.metastore.MetastoreColumn) UserException(org.apache.drill.common.exceptions.UserException) LoggerFactory(org.slf4j.LoggerFactory) Types(org.apache.drill.common.types.Types) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) RowSetReader(org.apache.drill.exec.physical.rowSet.RowSetReader) VectorContainer(org.apache.drill.exec.record.VectorContainer) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) StringUtils(org.apache.commons.lang3.StringUtils) ArrayReader(org.apache.drill.exec.vector.accessor.ArrayReader) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) StatisticsRecordWriterImpl(org.apache.drill.exec.store.StatisticsRecordWriterImpl) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) Map(java.util.Map) FieldConverter(org.apache.drill.exec.store.EventBasedRecordWriter.FieldConverter) Path(org.apache.hadoop.fs.Path) BatchSchema(org.apache.drill.exec.record.BatchSchema) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Delete(org.apache.drill.metastore.operate.Delete) TableMetadataUnit(org.apache.drill.metastore.components.tables.TableMetadataUnit) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordBatch(org.apache.drill.exec.record.RecordBatch) Set(java.util.Set) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) Collectors(java.util.stream.Collectors) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) FieldReader(org.apache.drill.exec.vector.complex.reader.FieldReader) TypeProtos(org.apache.drill.common.types.TypeProtos) List(java.util.List) AbstractBinaryRecordBatch(org.apache.drill.exec.record.AbstractBinaryRecordBatch) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) ObjectReader(org.apache.drill.exec.vector.accessor.ObjectReader) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetadataIdentifierUtils(org.apache.drill.exec.metastore.analyze.MetadataIdentifierUtils) TupleReader(org.apache.drill.exec.vector.accessor.TupleReader) Modify(org.apache.drill.metastore.operate.Modify) MetadataControllerContext(org.apache.drill.exec.metastore.analyze.MetadataControllerContext) HashMap(java.util.HashMap) BitVector(org.apache.drill.exec.vector.BitVector) Function(java.util.function.Function) VectorWrapper(org.apache.drill.exec.record.VectorWrapper) ArrayList(java.util.ArrayList) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) HashSet(java.util.HashSet) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) DirectRowSet(org.apache.drill.exec.physical.rowSet.DirectRowSet) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) WriterPrel(org.apache.drill.exec.planner.physical.WriterPrel) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FragmentContext(org.apache.drill.exec.ops.FragmentContext) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) 
StatisticsRecordCollector(org.apache.drill.exec.store.StatisticsRecordCollector) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) IOException(java.io.IOException) FilterExpression(org.apache.drill.metastore.expressions.FilterExpression) StatisticsCollectorImpl(org.apache.drill.exec.store.easy.json.StatisticsCollectorImpl) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) ParquetTableMetadataUtils(org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils) VarCharVector(org.apache.drill.exec.vector.VarCharVector) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataControllerPOP(org.apache.drill.exec.physical.config.MetadataControllerPOP) Tables(org.apache.drill.metastore.components.tables.Tables) Collections(java.util.Collections) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) ObjectType(org.apache.drill.exec.vector.accessor.ObjectType) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo)
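
For comparison, a minimal hypothetical assembly of a RowGroupMetadata instance using the same builder methods seen above, with the table info, statistics, schema, and path supplied by the caller instead of decoded from a TupleReader. The builder may require additional fields in practice; this is a sketch, not the definitive construction path.

// Minimal hypothetical sketch of building a RowGroupMetadata with the builder methods shown above.
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.metastore.metadata.MetadataInfo;
import org.apache.drill.metastore.metadata.MetadataType;
import org.apache.drill.metastore.metadata.RowGroupMetadata;
import org.apache.drill.metastore.metadata.TableInfo;
import org.apache.drill.metastore.statistics.ColumnStatistics;
import org.apache.drill.metastore.statistics.StatisticsHolder;
import org.apache.hadoop.fs.Path;

public class RowGroupMetadataSketch {

    // Builds row-group metadata for a single row group of a Parquet file.
    public static RowGroupMetadata build(TableInfo tableInfo,
                                         Map<SchemaPath, ColumnStatistics<?>> columnStatistics,
                                         List<StatisticsHolder<?>> metadataStatistics,
                                         TupleMetadata schema,
                                         Path path,
                                         int rowGroupIndex,
                                         long lastModifiedTime) {
        MetadataInfo metadataInfo = MetadataInfo.builder()
            .type(MetadataType.ROW_GROUP)
            .key(MetadataInfo.DEFAULT_SEGMENT_KEY)
            .build();
        return RowGroupMetadata.builder()
            .tableInfo(tableInfo)
            .metadataInfo(metadataInfo)
            .columnsStatistics(columnStatistics)
            .metadataStatistics(metadataStatistics)
            .hostAffinity(Collections.emptyMap())
            .rowGroupIndex(rowGroupIndex)
            .path(path)
            .lastModifiedTime(lastModifiedTime)
            .schema(schema)
            .build();
    }
}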

Aggregations

RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata): 24
SchemaPath (org.apache.drill.common.expression.SchemaPath): 18
Path (org.apache.hadoop.fs.Path): 16
FileMetadata (org.apache.drill.metastore.metadata.FileMetadata): 15
BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata): 13
TableInfo (org.apache.drill.metastore.metadata.TableInfo): 13
MetastoreTest (org.apache.drill.categories.MetastoreTest): 12
MetastoreTableInfo (org.apache.drill.metastore.components.tables.MetastoreTableInfo): 12
Test (org.junit.Test): 12
SlowTest (org.apache.drill.categories.SlowTest): 11
SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata): 11
ClusterTest (org.apache.drill.test.ClusterTest): 11
CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString): 11
File (java.io.File): 10
ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics): 9
ArrayList (java.util.ArrayList): 7
HashMap (java.util.HashMap): 7
TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata): 7
MetadataType (org.apache.drill.metastore.metadata.MetadataType): 7
List (java.util.List): 6