Search in sources:

Example 26 with ColumnStatistics

Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.

Class TestMetastoreCommands, method testAnalyzeNonEmptyTableWithEmptyFile.

/**
 * Runs {@code ANALYZE TABLE ... REFRESH METADATA} against a table copied from the
 * {@code parquet/empty/simple} test resource and checks what ends up in the Metastore:
 * the table-level metadata must equal the expected one (row count 1, analyze level ALL),
 * and exactly two file metadata entries and two row group metadata entries must be stored.
 *
 * <p>NOTE(review): judging by the test name, the resource presumably holds one populated and
 * one empty parquet file - confirm against the test resources.
 *
 * @throws Exception if copying the resource, running a query or querying the Metastore fails
 */
@Test
public void testAnalyzeNonEmptyTableWithEmptyFile() throws Exception {
    String tableName = "parquet_with_empty_file";
    File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("parquet", "empty", "simple"), Paths.get(tableName));
    TableInfo tableInfo = getTableInfo(tableName, "tmp");
    TupleMetadata schema = new SchemaBuilder()
        .addNullable("id", TypeProtos.MinorType.BIGINT)
        .addNullable("name", TypeProtos.MinorType.VARCHAR)
        .build();
    // Expected column statistics: a single value per column, so min and max coincide.
    Map<SchemaPath, ColumnStatistics<?>> columnStatistics = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder()
        .put(SchemaPath.getSimplePath("name"), getColumnStatistics("Tom", "Tom", 1L, TypeProtos.MinorType.VARCHAR))
        .put(SchemaPath.getSimplePath("id"), getColumnStatistics(2L, 2L, 1L, TypeProtos.MinorType.BIGINT))
        .build();
    BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(TABLE_META_INFO)
        .schema(schema)
        .location(new Path(table.toURI().getPath()))
        .columnsStatistics(columnStatistics)
        .metadataStatistics(Arrays.asList(new StatisticsHolder<>(1L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
        .partitionKeys(Collections.emptyMap())
        .lastModifiedTime(getMaxLastModified(table))
        .build();
    try {
        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();
        MetastoreTableInfo metastoreTableInfo = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().metastoreTableInfo(tableInfo);
        assertTrue("table metadata wasn't found", metastoreTableInfo.isExists());
        BaseTableMetadata tableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
        assertEquals(expectedTableMetadata, tableMetadata);
        List<FileMetadata> filesMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().filesMetadata(tableInfo, null, null);
        assertEquals(2, filesMetadata.size());
        // The cast picks the String overload of rowGroupsMetadata for the null argument.
        List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
        assertEquals(2, rowGroupsMetadata.size());
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
        // Remove the copied table as the sibling incremental-analyze tests do, so that no
        // leftover files stay in the shared test tmp directory.
        FileUtils.deleteQuietly(table);
    }
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) SchemaBuilder(org.apache.drill.exec.record.metadata.SchemaBuilder) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) File(java.io.File) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)

Example 27 with ColumnStatistics

Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.

Class TestMetastoreCommands, method testIncrementalAnalyzeNewChildSegment.

/**
 * Analyzes a multilevel parquet table, adds a new child directory ({@code 1994/Q5}, copied from
 * {@code 1994/Q4}), analyzes the table again and checks that the table metadata and the number
 * of segments stored in the Metastore reflect the added segment.
 *
 * @throws Exception if copying resources, running a query or querying the Metastore fails
 */
@Test
public void testIncrementalAnalyzeNewChildSegment() throws Exception {
    String tableName = "multilevel/parquetNewChildSegment";
    File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
    Path tablePath = new Path(tableDir.toURI().getPath());
    TableInfo tableInfo = getTableInfo(tableName, "tmp");

    // Expected statistics after the second analyze: every column gets the new row and
    // non-null counts, and dir1 additionally gets a new maximum value.
    Map<SchemaPath, ColumnStatistics<?>> expectedColumnStats = new HashMap<>(TABLE_COLUMN_STATISTICS);
    expectedColumnStats.replaceAll((column, stats) ->
        stats.cloneWith(new ColumnStatistics<>(Arrays.asList(
            new StatisticsHolder<>(130L, TableStatisticsKind.ROW_COUNT),
            new StatisticsHolder<>(130L, ColumnStatisticsKind.NON_NULL_VALUES_COUNT)))));
    expectedColumnStats.computeIfPresent(SchemaPath.getSimplePath("dir1"), (column, stats) ->
        stats.cloneWith(new ColumnStatistics<>(Collections.singletonList(
            new StatisticsHolder<>("Q5", ColumnStatisticsKind.MAX_VALUE)))));

    BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(TABLE_META_INFO)
        .schema(SCHEMA)
        .location(tablePath)
        .columnsStatistics(expectedColumnStats)
        .metadataStatistics(Arrays.asList(new StatisticsHolder<>(130L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
        .partitionKeys(Collections.emptyMap())
        .lastModifiedTime(getMaxLastModified(tableDir))
        .build();

    try {
        // First analyze: metadata for the table as it was copied.
        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();

        List<SegmentMetadata> segments = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
        assertEquals(15, segments.size());

        // Add the new child segment and analyze once more.
        dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel", "parquet", "1994", "Q4"), Paths.get(tableName, "1994", "Q5"));
        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();

        BaseTableMetadata refreshedMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().tableMetadata(tableInfo);
        assertEquals(expectedTableMetadata, refreshedMetadata);

        segments = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
        assertEquals(16, segments.size());
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
        FileUtils.deleteQuietly(tableDir);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) HashMap(java.util.HashMap) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) File(java.io.File) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)

Example 28 with ColumnStatistics

Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.

Class TestMetastoreCommands, method testIncrementalAnalyzeNewParentSegment.

/**
 * Analyzes a multilevel parquet table, adds a new top-level directory ({@code 1993}, copied
 * from {@code 1994}), analyzes the table again and checks that the table metadata and the
 * number of segments stored in the Metastore reflect the added segment.
 *
 * @throws Exception if copying resources, running a query or querying the Metastore fails
 */
@Test
public void testIncrementalAnalyzeNewParentSegment() throws Exception {
    String tableName = "multilevel/parquetNewParentSegment";
    File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
    Path tablePath = new Path(tableDir.toURI().getPath());
    TableInfo tableInfo = getTableInfo(tableName, "tmp");

    // Expected statistics after the second analyze: every column gets the new row and
    // non-null counts, and dir0 additionally gets a new minimum value.
    Map<SchemaPath, ColumnStatistics<?>> expectedColumnStats = new HashMap<>(TABLE_COLUMN_STATISTICS);
    expectedColumnStats.replaceAll((column, stats) ->
        stats.cloneWith(new ColumnStatistics<>(Arrays.asList(
            new StatisticsHolder<>(160L, TableStatisticsKind.ROW_COUNT),
            new StatisticsHolder<>(160L, ColumnStatisticsKind.NON_NULL_VALUES_COUNT)))));
    expectedColumnStats.computeIfPresent(SchemaPath.getSimplePath("dir0"), (column, stats) ->
        stats.cloneWith(new ColumnStatistics<>(Collections.singletonList(
            new StatisticsHolder<>("1993", ColumnStatisticsKind.MIN_VALUE)))));

    BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(TABLE_META_INFO)
        .schema(SCHEMA)
        .location(tablePath)
        .columnsStatistics(expectedColumnStats)
        .metadataStatistics(Arrays.asList(new StatisticsHolder<>(160L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
        .partitionKeys(Collections.emptyMap())
        .lastModifiedTime(getMaxLastModified(tableDir))
        .build();

    try {
        // Nothing must be stored for the table before the first analyze.
        assertEquals(0, cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null).size());

        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();

        List<SegmentMetadata> segments = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
        assertEquals(15, segments.size());

        // Add the new parent segment and analyze once more.
        dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet", "1994"), Paths.get(tableName, "1993"));
        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();

        BaseTableMetadata refreshedMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().tableMetadata(tableInfo);
        assertEquals(expectedTableMetadata, refreshedMetadata);

        segments = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
        assertEquals(20, segments.size());
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
        FileUtils.deleteQuietly(tableDir);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) HashMap(java.util.HashMap) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) File(java.io.File) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)

Example 29 with ColumnStatistics

Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.

Class TestMetastoreCommands, method testAnalyzeWithColumns.

/**
 * Runs {@code ANALYZE TABLE ... columns(o_orderstatus) REFRESH METADATA 'row_group' LEVEL}
 * against a copy of the multilevel parquet table and checks the table metadata stored in the
 * Metastore: column statistics are expected only for {@code o_orderstatus}, {@code dir0} and
 * {@code dir1}, the analyze level is {@code ROW_GROUP}, and {@code o_orderstatus} is the only
 * interesting column.
 *
 * @throws Exception if copying the resource, running a query or querying the Metastore fails
 */
@Test
public void testAnalyzeWithColumns() throws Exception {
    String tableName = "multilevel/parquetColumns";
    File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
    Path tablePath = new Path(table.toURI().getPath());
    TableInfo tableInfo = getTableInfo(tableName, "tmp");
    // Expected statistics: the shared table-level values restricted to the three columns
    // the metadata is expected to contain.
    Map<SchemaPath, ColumnStatistics<?>> updatedTableColumnStatistics = new HashMap<>();
    SchemaPath orderStatusPath = SchemaPath.getSimplePath("o_orderstatus");
    SchemaPath dir0Path = SchemaPath.getSimplePath("dir0");
    SchemaPath dir1Path = SchemaPath.getSimplePath("dir1");
    updatedTableColumnStatistics.put(orderStatusPath, TABLE_COLUMN_STATISTICS.get(orderStatusPath));
    updatedTableColumnStatistics.put(dir0Path, TABLE_COLUMN_STATISTICS.get(dir0Path));
    updatedTableColumnStatistics.put(dir1Path, TABLE_COLUMN_STATISTICS.get(dir1Path));
    BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(TABLE_META_INFO)
        .schema(SCHEMA)
        .location(tablePath)
        .columnsStatistics(updatedTableColumnStatistics)
        .metadataStatistics(Arrays.asList(new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ROW_GROUP, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
        .partitionKeys(Collections.emptyMap())
        .lastModifiedTime(getMaxLastModified(table))
        .interestingColumns(Collections.singletonList(orderStatusPath))
        .build();
    try {
        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` columns(o_orderstatus) REFRESH METADATA 'row_group' LEVEL", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();
        BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
        assertEquals(expectedTableMetadata, actualTableMetadata);
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
        // Remove the copied table as the sibling incremental-analyze tests do, so that no
        // leftover files stay in the shared test tmp directory.
        FileUtils.deleteQuietly(table);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) HashMap(java.util.HashMap) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) File(java.io.File) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)

Example 30 with ColumnStatistics

Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.

Class MetastoreFileTableMetadataProvider, method getNonInterestingColumnsMetadata.

/**
 * Returns metadata with placeholder statistics for the columns of the table schema that are
 * either not among the interesting columns or are arrays. Each such column gets a single
 * {@code NULLS_COUNT} statistic set to {@link Statistic#NO_COLUMN_STATS}.
 *
 * <p>The result is built on the first call and reused afterwards; {@code throwIfChanged()} is
 * invoked on every call before the cached value is consulted.
 *
 * @return metadata for non-interesting columns
 */
@Override
public NonInterestingColumnsMetadata getNonInterestingColumnsMetadata() {
    throwIfChanged();
    if (nonInterestingColumnsMetadata != null) {
        return nonInterestingColumnsMetadata;
    }

    TupleMetadata tableSchema = getTableMetadata().getSchema();
    List<StatisticsHolder<?>> noStats = Collections.singletonList(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
    List<SchemaPath> allColumns = SchemaUtil.getSchemaPaths(tableSchema);
    List<SchemaPath> interesting = getInterestingColumns(allColumns);

    // Keep non-interesting columns and array columns (columns for which statistics
    // wasn't collected), and attach the placeholder statistics to each of them.
    Map<SchemaPath, ColumnStatistics<?>> placeholderStats = allColumns.stream()
        .filter(column -> {
            if (!interesting.contains(column)) {
                return true;
            }
            return SchemaPathUtils.getColumnMetadata(column, tableSchema).isArray();
        })
        .collect(Collectors.toMap(
            Function.identity(),
            column -> new ColumnStatistics<>(noStats, SchemaPathUtils.getColumnMetadata(column, tableSchema).type())));

    nonInterestingColumnsMetadata = new NonInterestingColumnsMetadata(placeholderStats);
    return nonInterestingColumnsMetadata;
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) TableInfo(org.apache.drill.metastore.metadata.TableInfo) LoggerFactory(org.slf4j.LoggerFactory) TableMetadataProvider(org.apache.drill.metastore.metadata.TableMetadataProvider) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) Function(java.util.function.Function) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) MetastoreMetadataProviderManager(org.apache.drill.exec.metastore.MetastoreMetadataProviderManager) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) NonInterestingColumnsMetadata(org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata) DrillStatsTable(org.apache.drill.exec.planner.common.DrillStatsTable) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) BasicTablesRequests(org.apache.drill.metastore.components.tables.BasicTablesRequests) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) ColumnStatisticsKind(org.apache.drill.metastore.statistics.ColumnStatisticsKind) SchemaPathUtils(org.apache.drill.metastore.util.SchemaPathUtils) Logger(org.slf4j.Logger) SchemaProvider(org.apache.drill.exec.record.metadata.schema.SchemaProvider) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) IOException(java.io.IOException) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) Collectors(java.util.stream.Collectors) DrillFileSystemUtil(org.apache.drill.exec.util.DrillFileSystemUtil) List(java.util.List) TableMetadataProviderBuilder(org.apache.drill.metastore.metadata.TableMetadataProviderBuilder) Statistic(org.apache.drill.metastore.statistics.Statistic) 
ParquetTableMetadataUtils(org.apache.drill.exec.store.parquet.ParquetTableMetadataUtils) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) MetadataException(org.apache.drill.exec.exception.MetadataException) SchemaUtil(org.apache.drill.exec.record.SchemaUtil) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) Collections(java.util.Collections) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) NonInterestingColumnsMetadata(org.apache.drill.metastore.metadata.NonInterestingColumnsMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata)

Aggregations

ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics)40 SchemaPath (org.apache.drill.common.expression.SchemaPath)39 Path (org.apache.hadoop.fs.Path)30 BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata)29 StatisticsHolder (org.apache.drill.metastore.statistics.StatisticsHolder)27 MetastoreTableInfo (org.apache.drill.metastore.components.tables.MetastoreTableInfo)26 TableInfo (org.apache.drill.metastore.metadata.TableInfo)26 HashMap (java.util.HashMap)25 MetastoreTest (org.apache.drill.categories.MetastoreTest)21 ClusterTest (org.apache.drill.test.ClusterTest)21 Test (org.junit.Test)21 File (java.io.File)20 SlowTest (org.apache.drill.categories.SlowTest)20 TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata)20 FileMetadata (org.apache.drill.metastore.metadata.FileMetadata)17 ArrayList (java.util.ArrayList)15 SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata)14 CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString)14 RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata)12 Map (java.util.Map)11