Search in sources :

Example 1 with Table

Use of org.apache.drill.shaded.guava.com.google.common.collect.Table in the Apache Drill project.

Class BaseParquetMetadataProvider, method init.

/**
 * Initializes the metadata held by this provider, either by loading it from scratch
 * or by reusing the metadata of an already existing provider.
 *
 * <p>Metadata is (re)initialized via {@code initializeMetadata()} when no provider is given,
 * or when the given provider's row groups or files do not cover every location returned by
 * {@code getLocations()}. Otherwise the given provider's files and row groups are filtered
 * down to those locations and reused, the table metadata row count is updated from the
 * row group metadata, and {@code parquetTableMetadata} is cleared.
 *
 * @param metadataProvider existing provider whose metadata may be reused; may be {@code null}
 * @throws IOException declared by the signature; presumably raised by the metadata
 *                     loading calls made here
 */
protected void init(BaseParquetMetadataProvider metadataProvider) throws IOException {
    // Once deserialization for metadata is provided, initInternal() call should be removed
    // and only files list is deserialized based on specified locations
    initInternal();
    assert parquetTableMetadata != null;
    if (fileSet == null) {
        fileSet = new HashSet<>();
        fileSet.addAll(parquetTableMetadata.getFiles().stream().map(MetadataBase.ParquetFileMetadata::getPath).collect(Collectors.toSet()));
    }
    List<Path> fileLocations = getLocations();
    // obtains metadata from cache files or table footers
    if (metadataProvider == null || (metadataProvider.rowGroups != null && !metadataProvider.rowGroups.keySet().containsAll(fileLocations)) || (metadataProvider.files != null && !metadataProvider.files.keySet().containsAll(fileLocations))) {
        initializeMetadata();
    } else {
        // reuse metadata from existing TableMetadataProvider
        boolean filterFiles = metadataProvider.files != null && metadataProvider.files.size() != files.size();
        boolean filterRowGroups = metadataProvider.rowGroups != null;
        if (filterFiles || filterRowGroups) {
            // Hashed copy of the locations: the filters below test membership once per entry,
            // which would otherwise be a linear scan of the list for every file / row group.
            Set<Path> fileLocationSet = new HashSet<>(fileLocations);
            if (filterFiles) {
                files = metadataProvider.files.entrySet().stream().filter(entry -> fileLocationSet.contains(entry.getKey())).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            }
            if (filterRowGroups) {
                rowGroups = LinkedListMultimap.create();
                metadataProvider.rowGroups.entries().stream().filter(entry -> fileLocationSet.contains(entry.getKey())).forEach(entry -> rowGroups.put(entry.getKey(), entry.getValue()));
            }
        }
        TableMetadata tableMetadata = getTableMetadata();
        // The results of the following getters are not used here; the calls are kept in their
        // original order (presumably they populate internal state — confirm against the getters).
        getSegmentsMetadataMap();
        getPartitionsMetadata();
        getRowGroupsMeta();
        getNonInterestingColumnsMetadata();
        this.tableMetadata = TableMetadataUtils.updateRowCount(tableMetadata, getRowGroupsMeta());
        parquetTableMetadata = null;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataType(org.apache.drill.metastore.metadata.MetadataType) LoggerFactory(org.slf4j.LoggerFactory) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) TupleSchema(org.apache.drill.exec.record.metadata.TupleSchema) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) SchemaProvider(org.apache.drill.exec.record.metadata.schema.SchemaProvider) HashMultimap(org.apache.drill.shaded.guava.com.google.common.collect.HashMultimap) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) Set(java.util.Set) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) TypeProtos(org.apache.drill.common.types.TypeProtos) List(java.util.List) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) TableMetadataUtils(org.apache.drill.metastore.util.TableMetadataUtils) TableInfo(org.apache.drill.metastore.metadata.TableInfo) Table(org.apache.drill.shaded.guava.com.google.common.collect.Table) HashMap(java.util.HashMap) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) 
LinkedHashMap(java.util.LinkedHashMap) NonInterestingColumnsMetadata(org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) PARQUET_COLUMN_STATISTICS(org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils.PARQUET_COLUMN_STATISTICS) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) SchemaPathUtils(org.apache.drill.metastore.util.SchemaPathUtils) Logger(org.slf4j.Logger) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) HashBasedTable(org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable) IOException(java.io.IOException) ParquetMetadataProvider(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProvider) LinkedListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap) Statistic(org.apache.drill.metastore.statistics.Statistic) LocationProvider(org.apache.drill.metastore.metadata.LocationProvider) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) ParquetMetadataProviderBuilder(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProviderBuilder) Collections(java.util.Collections) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata)

Example 2 with Table

Use of org.apache.drill.shaded.guava.com.google.common.collect.Table in the Apache Drill project.

Class ConvertMetadataAggregateToDirectScanRule, method populateRecords.

/**
 * Builds a {@link DirectGroupScan} whose records are assembled from row group metadata.
 *
 * <p>One record is produced per row group entry of the scan's metadata provider. Each record
 * holds partition values, the file location, the row group index, per-column statistics,
 * row-group-level statistics, the schema as JSON, the row group start / length and the file
 * modification time. The classes of non-null statistics values are registered in
 * {@code schema} as they are encountered.
 *
 * @param interestingColumns columns to collect statistics for; when {@code null}, the columns
 *                           of the first processed row group are used for all row groups
 * @param schema             field name to value class mapping; extended by this method
 * @param scan               scan whose group scan is a {@link ParquetGroupScan}
 * @param columnNamesOptions supplies the names of the implicit / internal columns
 * @return the direct scan, or {@code null} when statistics for a column are missing in some
 *         row group or when a populated field has no class registered in {@code schema}
 * @throws IOException if file system access fails
 */
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns, Map<String, Class<?>> schema, DrillScanRel scan, ColumnNamesOptions columnNamesOptions) throws IOException {
    ParquetGroupScan groupScan = (ParquetGroupScan) scan.getGroupScan();
    DrillTable table = Utilities.getDrillTable(scan.getTable());
    Multimap<Path, RowGroupMetadata> rowGroupsByPath = groupScan.getMetadataProvider().getRowGroupsMetadataMap();
    // cells are addressed as (field name, record index) -> value
    Table<String, Integer, Object> cells = HashBasedTable.create();
    FormatSelection formatSelection = (FormatSelection) table.getSelection();
    List<String> partitionNames = ColumnExplorer.getPartitionColumnNames(formatSelection.getSelection(), columnNamesOptions);
    FileSystem baseFs = formatSelection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
    DrillFileSystem dfs = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), baseFs.getConf());

    Collection<SchemaPath> columns = interestingColumns;
    int row = 0;
    for (Map.Entry<Path, RowGroupMetadata> rowGroupEntry : rowGroupsByPath.entries()) {
        Path file = rowGroupEntry.getKey();
        RowGroupMetadata rowGroup = rowGroupEntry.getValue();

        // partition values are matched to partition column names by position
        int partitionIdx = 0;
        for (String partitionValue : ColumnExplorer.listPartitionValues(file, formatSelection.getSelection().getSelectionRoot(), false)) {
            cells.put(partitionNames.get(partitionIdx), row, partitionValue);
            partitionIdx++;
        }

        cells.put(MetastoreAnalyzeConstants.LOCATION_FIELD, row, ImplicitFileColumns.FQN.getValue(file));
        cells.put(columnNamesOptions.rowGroupIndex(), row, String.valueOf(rowGroup.getRowGroupIndex()));

        // fall back to the columns of the first row group seen when none were requested
        if (columns == null) {
            columns = rowGroup.getColumnsStatistics().keySet();
        }

        // per-column statistics of the row group
        for (SchemaPath column : columns) {
            ColumnStatistics<?> stats = rowGroup.getColumnsStatistics().get(column);
            // do not gather statistics for array columns as it is not supported by Metastore
            if (containsArrayColumn(rowGroup.getSchema(), column)) {
                continue;
            }
            if (IsPredicate.isNullOrEmpty(stats)) {
                logger.debug("Statistics for {} column wasn't found within {} row group.", column, file);
                return null;
            }
            for (StatisticsKind<?> kind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
                Object value;
                if (kind.getName().equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
                    value = TableStatisticsKind.ROW_COUNT.getValue(rowGroup);
                } else if (kind.getName().equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
                    // non-null count is derived: row count minus nulls count
                    value = TableStatisticsKind.ROW_COUNT.getValue(rowGroup) - ColumnStatisticsKind.NULLS_COUNT.getFrom(stats);
                } else {
                    value = stats.get(kind);
                }
                String fieldName = AnalyzeColumnUtils.getColumnStatisticsFieldName(column.toExpr(), kind);
                if (value == null) {
                    // placeholder stored in place of a missing value; mapped back to null below
                    cells.put(fieldName, row, BaseParquetMetadataProvider.NULL_VALUE);
                } else {
                    schema.putIfAbsent(fieldName, value.getClass());
                    cells.put(fieldName, row, value);
                }
            }
        }

        // row-group-level statistics
        for (StatisticsKind<?> kind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
            String fieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(kind);
            Object value = rowGroup.getStatistic(kind);
            if (value == null) {
                cells.put(fieldName, row, BaseParquetMetadataProvider.NULL_VALUE);
            } else {
                schema.putIfAbsent(fieldName, value.getClass());
                cells.put(fieldName, row, value);
            }
        }

        // internal columns
        cells.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, row, rowGroup.getSchema().jsonString());
        cells.put(columnNamesOptions.rowGroupStart(), row, Long.toString(rowGroup.getStatistic(() -> ExactStatisticsConstants.START)));
        cells.put(columnNamesOptions.rowGroupLength(), row, Long.toString(rowGroup.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
        cells.put(columnNamesOptions.lastModifiedTime(), row, String.valueOf(dfs.getFileStatus(file).getModificationTime()));
        row++;
    }

    // DynamicPojoRecordReader requires LinkedHashMap with fields order
    // which corresponds to the value position in record list.
    LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
    for (String fieldName : cells.rowKeySet()) {
        Class<?> fieldClass = schema.get(fieldName);
        if (fieldClass == null) {
            // a populated field with no known class cannot be read back
            return null;
        }
        orderedSchema.put(fieldName, fieldClass);
    }

    // builds one record by reading the cells of a record index in schema order
    IntFunction<List<Object>> toRecord = recordIndex -> orderedSchema.keySet().stream()
        .map(fieldName -> {
            Object cell = cells.get(fieldName, recordIndex);
            return cell == BaseParquetMetadataProvider.NULL_VALUE ? null : cell;
        })
        .collect(Collectors.toList());
    List<List<Object>> records = IntStream.range(0, row).mapToObj(toRecord).collect(Collectors.toList());

    DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
    ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
    return new DirectGroupScan(reader, scanStats);
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) FileSystem(org.apache.hadoop.fs.FileSystem) IsPredicate(org.apache.drill.exec.expr.IsPredicate) LoggerFactory(org.slf4j.LoggerFactory) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) DictColumnMetadata(org.apache.drill.exec.record.metadata.DictColumnMetadata) PathSegment(org.apache.drill.common.expression.PathSegment) Utilities(org.apache.drill.exec.util.Utilities) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) BaseParquetMetadataProvider(org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) ImplicitFileColumns(org.apache.drill.exec.store.ColumnExplorer.ImplicitFileColumns) Collectors(java.util.stream.Collectors) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) List(java.util.List) MetadataAggregateContext(org.apache.drill.exec.metastore.analyze.MetadataAggregateContext) IntStream(java.util.stream.IntStream) Table(org.apache.drill.shaded.guava.com.google.common.collect.Table) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) Function(java.util.function.Function) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) LinkedHashMap(java.util.LinkedHashMap) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) 
TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) ParquetGroupScan(org.apache.drill.exec.store.parquet.ParquetGroupScan) IntFunction(java.util.function.IntFunction) PrelUtil(org.apache.drill.exec.planner.physical.PrelUtil) Logger(org.slf4j.Logger) ScanStats(org.apache.drill.exec.physical.base.ScanStats) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) HashBasedTable(org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable) IOException(java.io.IOException) RelNode(org.apache.calcite.rel.RelNode) RelOptRuleCall(org.apache.calcite.plan.RelOptRuleCall) DirectGroupScan(org.apache.drill.exec.store.direct.DirectGroupScan) RelOptRule(org.apache.calcite.plan.RelOptRule) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) GroupScan(org.apache.drill.exec.physical.base.GroupScan) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) Configuration(org.apache.hadoop.conf.Configuration) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) LinkedHashMap(java.util.LinkedHashMap) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) SchemaPath(org.apache.drill.common.expression.SchemaPath) FileSystem(org.apache.hadoop.fs.FileSystem) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) List(java.util.List) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) DirectGroupScan(org.apache.drill.exec.store.direct.DirectGroupScan) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) ParquetGroupScan(org.apache.drill.exec.store.parquet.ParquetGroupScan) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) 
ScanStats(org.apache.drill.exec.physical.base.ScanStats)

Aggregations

IOException (java.io.IOException)2 Collection (java.util.Collection)2 LinkedHashMap (java.util.LinkedHashMap)2 List (java.util.List)2 Map (java.util.Map)2 Function (java.util.function.Function)2 Collectors (java.util.stream.Collectors)2 SchemaPath (org.apache.drill.common.expression.SchemaPath)2 TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata)2 MetadataType (org.apache.drill.metastore.metadata.MetadataType)2 RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata)2 ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics)2 ColumnStatisticsKind (org.apache.drill.metastore.statistics.ColumnStatisticsKind)2 ArrayList (java.util.ArrayList)1 Collections (java.util.Collections)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 Objects (java.util.Objects)1 Set (java.util.Set)1 IntFunction (java.util.function.IntFunction)1