Search in sources:

Example 16 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class TestMetastoreCommands defines the method testIncrementalAnalyzeWithDifferentMetadataLevel().

@Test
public void testIncrementalAnalyzeWithDifferentMetadataLevel() throws Exception {
    String tableName = "multilevel/parquetDifferentMetadataLevel";
    File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
    Path tablePath = new Path(table.toURI().getPath());
    TableInfo tableInfo = getTableInfo(tableName, "tmp");
    BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(TABLE_META_INFO)
        .schema(SCHEMA)
        .location(tablePath)
        .columnsStatistics(TABLE_COLUMN_STATISTICS)
        .metadataStatistics(Arrays.asList(
            new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT),
            new StatisticsHolder<>(MetadataType.FILE, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
        .partitionKeys(Collections.emptyMap())
        .lastModifiedTime(getMaxLastModified(table))
        .build();
    try {
        testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA 'file' LEVEL", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
        BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
        List<RowGroupMetadata> rowGroupMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
        assertEquals(expectedTableMetadata, actualTableMetadata);
        assertTrue(rowGroupMetadata.isEmpty());
        // checks that analyze ran again because the requested metadata level is more specific
        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA 'row_group' LEVEL", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();
        actualTableMetadata = cluster.drillbit().getContext()
            .getMetastoreRegistry().get().tables().basicRequests()
            .tableMetadata(tableInfo);
        expectedTableMetadata = BaseTableMetadata.builder()
            .tableInfo(tableInfo)
            .metadataInfo(TABLE_META_INFO)
            .schema(SCHEMA)
            .location(tablePath)
            .columnsStatistics(TABLE_COLUMN_STATISTICS)
            .metadataStatistics(Arrays.asList(
                new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT),
                new StatisticsHolder<>(MetadataType.ROW_GROUP, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
            .partitionKeys(Collections.emptyMap())
            .lastModifiedTime(getMaxLastModified(table))
            .build();
        assertEquals(expectedTableMetadata, actualTableMetadata);
        rowGroupMetadata = cluster.drillbit().getContext()
            .getMetastoreRegistry().get().tables().basicRequests()
            .rowGroupsMetadata(tableInfo, (String) null, null);
        assertEquals(12, rowGroupMetadata.size());
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) File(java.io.File) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)
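
The same Metastore accessor chain recurs throughout these tests. A minimal helper sketch (hypothetical, not part of TestMetastoreCommands) that factors it out, assuming the chain's receiver is the DrillbitContext obtained via cluster.drillbit().getContext():

import java.util.List;

import org.apache.drill.exec.server.DrillbitContext;
import org.apache.drill.metastore.metadata.RowGroupMetadata;
import org.apache.drill.metastore.metadata.TableInfo;

class MetastoreRequestsSketch {
    // Fetches all row group metadata the Metastore holds for the given table;
    // passing null for the location and metadata keys requests the whole table.
    static List<RowGroupMetadata> rowGroups(DrillbitContext context, TableInfo tableInfo) {
        return context.getMetastoreRegistry().get()
            .tables()
            .basicRequests()
            .rowGroupsMetadata(tableInfo, (String) null, null);
    }
}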

Example 17 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class TestMetastoreCommands defines the method testAnalyzeNonEmptyTableWithEmptyFile().

@Test
public void testAnalyzeNonEmptyTableWithEmptyFile() throws Exception {
    String tableName = "parquet_with_empty_file";
    File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("parquet", "empty", "simple"), Paths.get(tableName));
    TableInfo tableInfo = getTableInfo(tableName, "tmp");
    TupleMetadata schema = new SchemaBuilder()
        .addNullable("id", TypeProtos.MinorType.BIGINT)
        .addNullable("name", TypeProtos.MinorType.VARCHAR)
        .build();
    Map<SchemaPath, ColumnStatistics<?>> columnStatistics = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder()
        .put(SchemaPath.getSimplePath("name"), getColumnStatistics("Tom", "Tom", 1L, TypeProtos.MinorType.VARCHAR))
        .put(SchemaPath.getSimplePath("id"), getColumnStatistics(2L, 2L, 1L, TypeProtos.MinorType.BIGINT))
        .build();
    BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(TABLE_META_INFO)
        .schema(schema)
        .location(new Path(table.toURI().getPath()))
        .columnsStatistics(columnStatistics)
        .metadataStatistics(Arrays.asList(
            new StatisticsHolder<>(1L, TableStatisticsKind.ROW_COUNT),
            new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
        .partitionKeys(Collections.emptyMap())
        .lastModifiedTime(getMaxLastModified(table))
        .build();
    try {
        testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
        MetastoreTableInfo metastoreTableInfo = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().metastoreTableInfo(tableInfo);
        assertTrue("table metadata wasn't found", metastoreTableInfo.isExists());
        BaseTableMetadata tableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
        assertEquals(expectedTableMetadata, tableMetadata);
        List<FileMetadata> filesMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().filesMetadata(tableInfo, null, null);
        assertEquals(2, filesMetadata.size());
        List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
        assertEquals(2, rowGroupsMetadata.size());
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
    }
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) SchemaBuilder(org.apache.drill.exec.record.metadata.SchemaBuilder) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) File(java.io.File) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)
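
Example 17's existence check and file lookups go through the same BasicTablesRequests entry point. A condensed sketch of that round trip (hypothetical helper, same DrillbitContext assumption as above):

import java.util.List;

import org.apache.drill.exec.server.DrillbitContext;
import org.apache.drill.metastore.components.tables.BasicTablesRequests;
import org.apache.drill.metastore.components.tables.MetastoreTableInfo;
import org.apache.drill.metastore.metadata.FileMetadata;
import org.apache.drill.metastore.metadata.TableInfo;

class FilesMetadataSketch {
    // Returns file metadata for the table, failing if the Metastore has no entry.
    static List<FileMetadata> requireFilesMetadata(DrillbitContext context, TableInfo tableInfo) {
        BasicTablesRequests requests = context.getMetastoreRegistry().get().tables().basicRequests();
        MetastoreTableInfo metastoreTableInfo = requests.metastoreTableInfo(tableInfo);
        if (!metastoreTableInfo.isExists()) {
            throw new IllegalStateException("table metadata wasn't found");
        }
        // null filters request metadata for every file of the table
        return requests.filesMetadata(tableInfo, null, null);
    }
}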

Example 18 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class MetadataHandlerBatch defines the method writeMetadata().

private <T extends BaseMetadata & LocationProvider> VectorContainer writeMetadata(List<T> metadataList) {
    BaseMetadata firstElement = metadataList.iterator().next();
    ResultSetLoader resultSetLoader = getResultSetLoaderForMetadata(firstElement);
    resultSetLoader.startBatch();
    RowSetLoader rowWriter = resultSetLoader.writer();
    Iterator<T> segmentsIterator = metadataList.iterator();
    while (!rowWriter.isFull() && segmentsIterator.hasNext()) {
        T metadata = segmentsIterator.next();
        metadataToHandle.remove(metadata.getMetadataInfo().identifier());
        List<Object> arguments = new ArrayList<>();
        // adds required segment names to the arguments
        arguments.add(metadata.getPath().toUri().getPath());
        Collections.addAll(arguments,
            Arrays.copyOf(
                MetadataIdentifierUtils.getValuesFromMetadataIdentifier(metadata.getMetadataInfo().identifier()),
                popConfig.getContext().segmentColumns().size()));
        // adds column statistics values, assuming they are sorted in alphabetical order
        // (see getResultSetLoaderForMetadata() method)
        metadata.getColumnsStatistics().entrySet().stream()
            .sorted(Comparator.comparing(e -> e.getKey().toExpr()))
            .map(Map.Entry::getValue)
            .flatMap(columnStatistics -> AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet().stream()
                .map(columnStatistics::get))
            .forEach(arguments::add);
        AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet().stream().map(metadata::getStatistic).forEach(arguments::add);
        // collectedMap field value
        arguments.add(new Object[] {});
        if (metadataType == MetadataType.SEGMENT) {
            arguments.add(((SegmentMetadata) metadata).getLocations().stream()
                .map(path -> path.toUri().getPath())
                .toArray(String[]::new));
        }
        if (metadataType == MetadataType.ROW_GROUP) {
            arguments.add(String.valueOf(((RowGroupMetadata) metadata).getRowGroupIndex()));
            arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.START)));
            arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
        }
        arguments.add(metadata.getSchema().jsonString());
        arguments.add(String.valueOf(metadata.getLastModifiedTime()));
        arguments.add(metadataType.name());
        rowWriter.addRow(arguments.toArray());
    }
    return resultSetLoader.harvest();
}
Also used : AbstractSingleRecordBatch(org.apache.drill.exec.record.AbstractSingleRecordBatch) MetadataType(org.apache.drill.metastore.metadata.MetadataType) Arrays(java.util.Arrays) LoggerFactory(org.slf4j.LoggerFactory) Types(org.apache.drill.common.types.Types) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) RowSetReader(org.apache.drill.exec.physical.rowSet.RowSetReader) VectorContainer(org.apache.drill.exec.record.VectorContainer) SchemaBuilder(org.apache.drill.exec.record.metadata.SchemaBuilder) ResultSetLoaderImpl(org.apache.drill.exec.physical.resultSet.impl.ResultSetLoaderImpl) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) BatchSchema(org.apache.drill.exec.record.BatchSchema) BasicTablesRequests(org.apache.drill.metastore.components.tables.BasicTablesRequests) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordBatch(org.apache.drill.exec.record.RecordBatch) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) Collectors(java.util.stream.Collectors) List(java.util.List) MinorType(org.apache.drill.common.types.TypeProtos.MinorType) ResultSetOptionBuilder(org.apache.drill.exec.physical.resultSet.impl.ResultSetOptionBuilder) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) MetadataIdentifierUtils(org.apache.drill.exec.metastore.analyze.MetadataIdentifierUtils) MaterializedField(org.apache.drill.exec.record.MaterializedField) VectorWrapper(org.apache.drill.exec.record.VectorWrapper) Function(java.util.function.Function) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) ArrayList(java.util.ArrayList) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) RowSetLoader(org.apache.drill.exec.physical.resultSet.RowSetLoader) DirectRowSet(org.apache.drill.exec.physical.rowSet.DirectRowSet) StreamSupport(java.util.stream.StreamSupport) NONE(org.apache.drill.exec.record.RecordBatch.IterOutcome.NONE) FragmentContext(org.apache.drill.exec.ops.FragmentContext) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) ResultSetLoader(org.apache.drill.exec.physical.resultSet.ResultSetLoader) Iterator(java.util.Iterator) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) MetadataHandlerPOP(org.apache.drill.exec.physical.config.MetadataHandlerPOP) LocationProvider(org.apache.drill.metastore.metadata.LocationProvider) VarCharVector(org.apache.drill.exec.vector.VarCharVector) Tables(org.apache.drill.metastore.components.tables.Tables) Comparator(java.util.Comparator) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) Collections(java.util.Collections) ArrayList(java.util.ArrayList) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) ResultSetLoader(org.apache.drill.exec.physical.resultSet.ResultSetLoader) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) RowSetLoader(org.apache.drill.exec.physical.resultSet.RowSetLoader)
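
The alphabetical-order contract called out in the comment inside writeMetadata() is the linchpin: statistics values must be emitted in the same order the loader schema was built. A self-contained sketch of the idiom, with plain strings standing in for the SchemaPath.toExpr() keys (an illustrative simplification):

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class StatsOrderSketch {
    public static void main(String[] args) {
        // Stand-ins for column expression strings such as those from SchemaPath.toExpr().
        Map<String, Object> statsByColumn = new LinkedHashMap<>();
        statsByColumn.put("`name`", "Tom"); // inserted first
        statsByColumn.put("`id`", 2L);      // inserted second
        // Same idiom as writeMetadata(): sort entries by key so the emitted values
        // line up with the alphabetically built result set schema.
        List<Object> ordered = statsByColumn.entrySet().stream()
            .sorted(Map.Entry.comparingByKey())
            .map(Map.Entry::getValue)
            .collect(Collectors.toList());
        System.out.println(ordered); // [2, Tom]: `id` sorts before `name`
    }
}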

Example 19 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class MetadataHandlerBatch defines the method writeMetadataUsingBatchSchema().

private <T extends BaseMetadata & LocationProvider> VectorContainer writeMetadataUsingBatchSchema(List<T> metadataList) {
    Preconditions.checkArgument(!metadataList.isEmpty(), "Metadata list shouldn't be empty.");
    ResultSetLoader resultSetLoader = getResultSetLoaderWithBatchSchema();
    resultSetLoader.startBatch();
    RowSetLoader rowWriter = resultSetLoader.writer();
    Iterator<T> segmentsIterator = metadataList.iterator();
    while (!rowWriter.isFull() && segmentsIterator.hasNext()) {
        T metadata = segmentsIterator.next();
        metadataToHandle.remove(metadata.getMetadataInfo().identifier());
        List<Object> arguments = new ArrayList<>();
        for (VectorWrapper<?> vectorWrapper : container) {
            String[] identifierValues = Arrays.copyOf(
                MetadataIdentifierUtils.getValuesFromMetadataIdentifier(metadata.getMetadataInfo().identifier()),
                popConfig.getContext().segmentColumns().size());
            MaterializedField field = vectorWrapper.getField();
            String fieldName = field.getName();
            if (fieldName.equals(MetastoreAnalyzeConstants.LOCATION_FIELD)) {
                arguments.add(metadata.getPath().toUri().getPath());
            } else if (fieldName.equals(MetastoreAnalyzeConstants.LOCATIONS_FIELD)) {
                if (metadataType == MetadataType.SEGMENT) {
                    arguments.add(((SegmentMetadata) metadata).getLocations().stream()
                        .map(path -> path.toUri().getPath())
                        .toArray(String[]::new));
                } else {
                    arguments.add(null);
                }
            } else if (popConfig.getContext().segmentColumns().contains(fieldName)) {
                arguments.add(identifierValues[popConfig.getContext().segmentColumns().indexOf(fieldName)]);
            } else if (AnalyzeColumnUtils.isColumnStatisticsField(fieldName)) {
                arguments.add(metadata
                    .getColumnStatistics(SchemaPath.parseFromString(AnalyzeColumnUtils.getColumnName(fieldName)))
                    .get(AnalyzeColumnUtils.getStatisticsKind(fieldName)));
            } else if (AnalyzeColumnUtils.isMetadataStatisticsField(fieldName)) {
                arguments.add(metadata.getStatistic(AnalyzeColumnUtils.getStatisticsKind(fieldName)));
            } else if (fieldName.equals(MetastoreAnalyzeConstants.COLLECTED_MAP_FIELD)) {
                // collectedMap field value
                arguments.add(new Object[] {});
            } else if (fieldName.equals(MetastoreAnalyzeConstants.SCHEMA_FIELD)) {
                arguments.add(metadata.getSchema().jsonString());
            } else if (fieldName.equals(columnNamesOptions.lastModifiedTime())) {
                arguments.add(String.valueOf(metadata.getLastModifiedTime()));
            } else if (fieldName.equals(columnNamesOptions.rowGroupIndex())) {
                arguments.add(String.valueOf(((RowGroupMetadata) metadata).getRowGroupIndex()));
            } else if (fieldName.equals(columnNamesOptions.rowGroupStart())) {
                arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.START)));
            } else if (fieldName.equals(columnNamesOptions.rowGroupLength())) {
                arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
            } else if (fieldName.equals(MetastoreAnalyzeConstants.METADATA_TYPE)) {
                arguments.add(metadataType.name());
            } else {
                throw new UnsupportedOperationException(String.format("Found unexpected field [%s] in incoming batch.", field));
            }
        }
        rowWriter.addRow(arguments.toArray());
    }
    return resultSetLoader.harvest();
}
Also used : AbstractSingleRecordBatch(org.apache.drill.exec.record.AbstractSingleRecordBatch) MetadataType(org.apache.drill.metastore.metadata.MetadataType) Arrays(java.util.Arrays) LoggerFactory(org.slf4j.LoggerFactory) Types(org.apache.drill.common.types.Types) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) RowSetReader(org.apache.drill.exec.physical.rowSet.RowSetReader) VectorContainer(org.apache.drill.exec.record.VectorContainer) SchemaBuilder(org.apache.drill.exec.record.metadata.SchemaBuilder) ResultSetLoaderImpl(org.apache.drill.exec.physical.resultSet.impl.ResultSetLoaderImpl) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) BatchSchema(org.apache.drill.exec.record.BatchSchema) BasicTablesRequests(org.apache.drill.metastore.components.tables.BasicTablesRequests) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) RecordBatch(org.apache.drill.exec.record.RecordBatch) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) Collectors(java.util.stream.Collectors) List(java.util.List) MinorType(org.apache.drill.common.types.TypeProtos.MinorType) ResultSetOptionBuilder(org.apache.drill.exec.physical.resultSet.impl.ResultSetOptionBuilder) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) MetadataIdentifierUtils(org.apache.drill.exec.metastore.analyze.MetadataIdentifierUtils) MaterializedField(org.apache.drill.exec.record.MaterializedField) VectorWrapper(org.apache.drill.exec.record.VectorWrapper) Function(java.util.function.Function) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) ArrayList(java.util.ArrayList) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) RowSetLoader(org.apache.drill.exec.physical.resultSet.RowSetLoader) DirectRowSet(org.apache.drill.exec.physical.rowSet.DirectRowSet) StreamSupport(java.util.stream.StreamSupport) NONE(org.apache.drill.exec.record.RecordBatch.IterOutcome.NONE) FragmentContext(org.apache.drill.exec.ops.FragmentContext) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) ResultSetLoader(org.apache.drill.exec.physical.resultSet.ResultSetLoader) Iterator(java.util.Iterator) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) MetadataHandlerPOP(org.apache.drill.exec.physical.config.MetadataHandlerPOP) LocationProvider(org.apache.drill.metastore.metadata.LocationProvider) VarCharVector(org.apache.drill.exec.vector.VarCharVector) Tables(org.apache.drill.metastore.components.tables.Tables) Comparator(java.util.Comparator) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) Collections(java.util.Collections) ArrayList(java.util.ArrayList) MaterializedField(org.apache.drill.exec.record.MaterializedField) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) ResultSetLoader(org.apache.drill.exec.physical.resultSet.ResultSetLoader) RowSetLoader(org.apache.drill.exec.physical.resultSet.RowSetLoader)
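
Both writeMetadata() and writeMetadataUsingBatchSchema() emit the same three row-group-specific values: the row group index plus its start offset and byte length, the latter two looked up through ad-hoc StatisticsKind lambdas. A sketch that factors the branch into a hypothetical helper (that the lambda satisfies StatisticsKind as a functional interface is inferred from the usage above, not a documented contract):

import org.apache.drill.metastore.metadata.RowGroupMetadata;
import org.apache.drill.metastore.statistics.ExactStatisticsConstants;

class RowGroupFieldsSketch {
    // Row group ordinal, start offset, and byte length, in the order the writers emit them.
    static String[] rowGroupFields(RowGroupMetadata rowGroup) {
        return new String[] {
            String.valueOf(rowGroup.getRowGroupIndex()),
            Long.toString(rowGroup.getStatistic(() -> ExactStatisticsConstants.START)),
            Long.toString(rowGroup.getStatistic(() -> ExactStatisticsConstants.LENGTH))
        };
    }
}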

Example 20 with RowGroupMetadata

Use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.

The class ConvertMetadataAggregateToDirectScanRule defines the method populateRecords().

/**
 * Populates the records list with row group metadata.
 */
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns,
    Map<String, Class<?>> schema, DrillScanRel scan,
    ColumnNamesOptions columnNamesOptions) throws IOException {
    ParquetGroupScan parquetGroupScan = (ParquetGroupScan) scan.getGroupScan();
    DrillTable drillTable = Utilities.getDrillTable(scan.getTable());
    Multimap<Path, RowGroupMetadata> rowGroupsMetadataMap = parquetGroupScan.getMetadataProvider().getRowGroupsMetadataMap();
    Table<String, Integer, Object> recordsTable = HashBasedTable.create();
    FormatSelection selection = (FormatSelection) drillTable.getSelection();
    List<String> partitionColumnNames = ColumnExplorer.getPartitionColumnNames(selection.getSelection(), columnNamesOptions);
    FileSystem rawFs = selection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
    DrillFileSystem fileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), rawFs.getConf());
    int rowIndex = 0;
    for (Map.Entry<Path, RowGroupMetadata> rgEntry : rowGroupsMetadataMap.entries()) {
        Path path = rgEntry.getKey();
        RowGroupMetadata rowGroupMetadata = rgEntry.getValue();
        List<String> partitionValues = ColumnExplorer.listPartitionValues(path, selection.getSelection().getSelectionRoot(), false);
        for (int i = 0; i < partitionValues.size(); i++) {
            String partitionColumnName = partitionColumnNames.get(i);
            recordsTable.put(partitionColumnName, rowIndex, partitionValues.get(i));
        }
        recordsTable.put(MetastoreAnalyzeConstants.LOCATION_FIELD, rowIndex, ImplicitFileColumns.FQN.getValue(path));
        recordsTable.put(columnNamesOptions.rowGroupIndex(), rowIndex, String.valueOf(rowGroupMetadata.getRowGroupIndex()));
        if (interestingColumns == null) {
            interestingColumns = rowGroupMetadata.getColumnsStatistics().keySet();
        }
        // populates record list with row group column metadata
        for (SchemaPath schemaPath : interestingColumns) {
            ColumnStatistics<?> columnStatistics = rowGroupMetadata.getColumnsStatistics().get(schemaPath);
            // do not gather statistics for array columns as it is not supported by Metastore
            if (containsArrayColumn(rowGroupMetadata.getSchema(), schemaPath)) {
                continue;
            }
            if (IsPredicate.isNullOrEmpty(columnStatistics)) {
                logger.debug("Statistics for {} column wasn't found within {} row group.", schemaPath, path);
                return null;
            }
            for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
                Object statsValue;
                if (statisticsKind.getName().equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
                    statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata);
                } else if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
                    statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata) - ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStatistics);
                } else {
                    statsValue = columnStatistics.get(statisticsKind);
                }
                String columnStatisticsFieldName = AnalyzeColumnUtils.getColumnStatisticsFieldName(schemaPath.toExpr(), statisticsKind);
                if (statsValue != null) {
                    schema.putIfAbsent(columnStatisticsFieldName, statsValue.getClass());
                    recordsTable.put(columnStatisticsFieldName, rowIndex, statsValue);
                } else {
                    recordsTable.put(columnStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
                }
            }
        }
        // populates record list with row group metadata
        for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
            String metadataStatisticsFieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(statisticsKind);
            Object statisticsValue = rowGroupMetadata.getStatistic(statisticsKind);
            if (statisticsValue != null) {
                schema.putIfAbsent(metadataStatisticsFieldName, statisticsValue.getClass());
                recordsTable.put(metadataStatisticsFieldName, rowIndex, statisticsValue);
            } else {
                recordsTable.put(metadataStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
            }
        }
        // populates record list with internal columns
        recordsTable.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, rowIndex, rowGroupMetadata.getSchema().jsonString());
        recordsTable.put(columnNamesOptions.rowGroupStart(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START)));
        recordsTable.put(columnNamesOptions.rowGroupLength(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
        recordsTable.put(columnNamesOptions.lastModifiedTime(), rowIndex, String.valueOf(fileSystem.getFileStatus(path).getModificationTime()));
        rowIndex++;
    }
    // DynamicPojoRecordReader requires a LinkedHashMap whose field order
    // corresponds to the value positions in the record list.
    LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
    for (String s : recordsTable.rowKeySet()) {
        Class<?> clazz = schema.get(s);
        if (clazz != null) {
            orderedSchema.put(s, clazz);
        } else {
            return null;
        }
    }
    IntFunction<List<Object>> collectRecord = currentIndex -> orderedSchema.keySet().stream()
        .map(column -> recordsTable.get(column, currentIndex))
        .map(value -> value != BaseParquetMetadataProvider.NULL_VALUE ? value : null)
        .collect(Collectors.toList());
    List<List<Object>> records = IntStream.range(0, rowIndex).mapToObj(collectRecord).collect(Collectors.toList());
    DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
    ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
    return new DirectGroupScan(reader, scanStats);
}
Also used : MetadataType(org.apache.drill.metastore.metadata.MetadataType) FileSystem(org.apache.hadoop.fs.FileSystem) IsPredicate(org.apache.drill.exec.expr.IsPredicate) LoggerFactory(org.slf4j.LoggerFactory) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) DictColumnMetadata(org.apache.drill.exec.record.metadata.DictColumnMetadata) PathSegment(org.apache.drill.common.expression.PathSegment) Utilities(org.apache.drill.exec.util.Utilities) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) ColumnMetadata(org.apache.drill.exec.record.metadata.ColumnMetadata) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) BaseParquetMetadataProvider(org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider) MetastoreAnalyzeConstants(org.apache.drill.exec.metastore.analyze.MetastoreAnalyzeConstants) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) ImplicitFileColumns(org.apache.drill.exec.store.ColumnExplorer.ImplicitFileColumns) Collectors(java.util.stream.Collectors) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) List(java.util.List) MetadataAggregateContext(org.apache.drill.exec.metastore.analyze.MetadataAggregateContext) IntStream(java.util.stream.IntStream) Table(org.apache.drill.shaded.guava.com.google.common.collect.Table) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) Function(java.util.function.Function) ColumnNamesOptions(org.apache.drill.exec.metastore.ColumnNamesOptions) LinkedHashMap(java.util.LinkedHashMap) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) ParquetGroupScan(org.apache.drill.exec.store.parquet.ParquetGroupScan) IntFunction(java.util.function.IntFunction) PrelUtil(org.apache.drill.exec.planner.physical.PrelUtil) Logger(org.slf4j.Logger) ScanStats(org.apache.drill.exec.physical.base.ScanStats) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) StatisticsKind(org.apache.drill.metastore.statistics.StatisticsKind) HashBasedTable(org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable) IOException(java.io.IOException) RelNode(org.apache.calcite.rel.RelNode) RelOptRuleCall(org.apache.calcite.plan.RelOptRuleCall) DirectGroupScan(org.apache.drill.exec.store.direct.DirectGroupScan) RelOptRule(org.apache.calcite.plan.RelOptRule) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) GroupScan(org.apache.drill.exec.physical.base.GroupScan) AnalyzeColumnUtils(org.apache.drill.exec.metastore.analyze.AnalyzeColumnUtils) Configuration(org.apache.hadoop.conf.Configuration) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) LinkedHashMap(java.util.LinkedHashMap) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) SchemaPath(org.apache.drill.common.expression.SchemaPath) FileSystem(org.apache.hadoop.fs.FileSystem) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) List(java.util.List) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) DynamicPojoRecordReader(org.apache.drill.exec.store.pojo.DynamicPojoRecordReader) DirectGroupScan(org.apache.drill.exec.store.direct.DirectGroupScan) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) ParquetGroupScan(org.apache.drill.exec.store.parquet.ParquetGroupScan) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) ScanStats(org.apache.drill.exec.physical.base.ScanStats)
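
populateRecords() accumulates values cell by cell in a Guava Table keyed by (field name, row index) and only afterwards flattens them into per-row records. A self-contained sketch of that transpose pattern (field names and values below are illustrative):

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable;
import org.apache.drill.shaded.guava.com.google.common.collect.Table;

public class TransposeSketch {
    public static void main(String[] args) {
        // Rows are field names, columns are record indexes: the same orientation
        // as recordsTable in populateRecords().
        Table<String, Integer, Object> cells = HashBasedTable.create();
        cells.put("location", 0, "/tmp/t/0_0_0.parquet");
        cells.put("rowGroupIndex", 0, "0");
        cells.put("location", 1, "/tmp/t/0_0_1.parquet");
        cells.put("rowGroupIndex", 1, "1");
        // Flatten column-wise: one record per index, field values in a fixed order.
        List<String> fields = new ArrayList<>(cells.rowKeySet());
        List<List<Object>> records = IntStream.range(0, 2)
            .mapToObj(i -> fields.stream()
                .map(f -> cells.get(f, i))
                .collect(Collectors.toList()))
            .collect(Collectors.toList());
        System.out.println(records);
    }
}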

Aggregations

RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata)24 SchemaPath (org.apache.drill.common.expression.SchemaPath)18 Path (org.apache.hadoop.fs.Path)16 FileMetadata (org.apache.drill.metastore.metadata.FileMetadata)15 BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata)13 TableInfo (org.apache.drill.metastore.metadata.TableInfo)13 MetastoreTest (org.apache.drill.categories.MetastoreTest)12 MetastoreTableInfo (org.apache.drill.metastore.components.tables.MetastoreTableInfo)12 Test (org.junit.Test)12 SlowTest (org.apache.drill.categories.SlowTest)11 SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata)11 ClusterTest (org.apache.drill.test.ClusterTest)11 CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString)11 File (java.io.File)10 ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics)9 ArrayList (java.util.ArrayList)7 HashMap (java.util.HashMap)7 TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata)7 MetadataType (org.apache.drill.metastore.metadata.MetadataType)7 List (java.util.List)6