use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class TestMetastoreCommands method testIncrementalAnalyzeWithDifferentMetadataLevel.
@Test
public void testIncrementalAnalyzeWithDifferentMetadataLevel() throws Exception {
String tableName = "multilevel/parquetDifferentMetadataLevel";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
Path tablePath = new Path(table.toURI().getPath());
TableInfo tableInfo = getTableInfo(tableName, "tmp");
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(SCHEMA).location(tablePath).columnsStatistics(TABLE_COLUMN_STATISTICS).metadataStatistics(Arrays.asList(new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.FILE, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).build();
try {
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA 'file' LEVEL", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
List<RowGroupMetadata> rowGroupMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
assertEquals(expectedTableMetadata, actualTableMetadata);
assertTrue(rowGroupMetadata.isEmpty());
// checks that analyze was produced since metadata level more specific
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA 'row_group' LEVEL", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(SCHEMA).location(tablePath).columnsStatistics(TABLE_COLUMN_STATISTICS).metadataStatistics(Arrays.asList(new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ROW_GROUP, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).build();
assertEquals(expectedTableMetadata, actualTableMetadata);
rowGroupMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
assertEquals(12, rowGroupMetadata.size());
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
}
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class TestMetastoreCommands method testIncrementalAnalyzeNewChildSegment.
@Test
public void testIncrementalAnalyzeNewChildSegment() throws Exception {
String tableName = "multilevel/parquetNewChildSegment";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
Path tablePath = new Path(table.toURI().getPath());
TableInfo tableInfo = getTableInfo(tableName, "tmp");
// updates statistics values due to new segment
Map<SchemaPath, ColumnStatistics<?>> updatedStatistics = new HashMap<>(TABLE_COLUMN_STATISTICS);
updatedStatistics.replaceAll((logicalExpressions, columnStatistics) -> columnStatistics.cloneWith(new ColumnStatistics<>(Arrays.asList(new StatisticsHolder<>(130L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(130L, ColumnStatisticsKind.NON_NULL_VALUES_COUNT)))));
updatedStatistics.computeIfPresent(SchemaPath.getSimplePath("dir1"), (logicalExpressions, columnStatistics) -> columnStatistics.cloneWith(new ColumnStatistics<>(Collections.singletonList(new StatisticsHolder<>("Q5", ColumnStatisticsKind.MAX_VALUE)))));
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(SCHEMA).location(tablePath).columnsStatistics(updatedStatistics).metadataStatistics(Arrays.asList(new StatisticsHolder<>(130L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).build();
try {
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
List<SegmentMetadata> segmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
assertEquals(15, segmentMetadata.size());
dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel", "parquet", "1994", "Q4"), Paths.get(tableName, "1994", "Q5"));
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
assertEquals(expectedTableMetadata, actualTableMetadata);
segmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
assertEquals(16, segmentMetadata.size());
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
FileUtils.deleteQuietly(table);
}
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class TestMetastoreCommands method testIncrementalAnalyzeNewParentSegment.
@Test
public void testIncrementalAnalyzeNewParentSegment() throws Exception {
String tableName = "multilevel/parquetNewParentSegment";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
Path tablePath = new Path(table.toURI().getPath());
TableInfo tableInfo = getTableInfo(tableName, "tmp");
// updates statistics values due to new segment
Map<SchemaPath, ColumnStatistics<?>> updatedStatistics = new HashMap<>(TABLE_COLUMN_STATISTICS);
updatedStatistics.replaceAll((logicalExpressions, columnStatistics) -> columnStatistics.cloneWith(new ColumnStatistics<>(Arrays.asList(new StatisticsHolder<>(160L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(160L, ColumnStatisticsKind.NON_NULL_VALUES_COUNT)))));
updatedStatistics.computeIfPresent(SchemaPath.getSimplePath("dir0"), (logicalExpressions, columnStatistics) -> columnStatistics.cloneWith(new ColumnStatistics<>(Collections.singletonList(new StatisticsHolder<>("1993", ColumnStatisticsKind.MIN_VALUE)))));
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(SCHEMA).location(tablePath).columnsStatistics(updatedStatistics).metadataStatistics(Arrays.asList(new StatisticsHolder<>(160L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).build();
try {
assertEquals(0, cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null).size());
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
List<SegmentMetadata> segmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
assertEquals(15, segmentMetadata.size());
dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet", "1994"), Paths.get(tableName, "1993"));
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
assertEquals(expectedTableMetadata, actualTableMetadata);
segmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
assertEquals(20, segmentMetadata.size());
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
FileUtils.deleteQuietly(table);
}
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class TestMetastoreCommands method testAnalyzeWithColumns.
@Test
public void testAnalyzeWithColumns() throws Exception {
String tableName = "multilevel/parquetColumns";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
Path tablePath = new Path(table.toURI().getPath());
TableInfo tableInfo = getTableInfo(tableName, "tmp");
Map<SchemaPath, ColumnStatistics<?>> updatedTableColumnStatistics = new HashMap<>();
SchemaPath orderStatusPath = SchemaPath.getSimplePath("o_orderstatus");
SchemaPath dir0Path = SchemaPath.getSimplePath("dir0");
SchemaPath dir1Path = SchemaPath.getSimplePath("dir1");
updatedTableColumnStatistics.put(orderStatusPath, TABLE_COLUMN_STATISTICS.get(orderStatusPath));
updatedTableColumnStatistics.put(dir0Path, TABLE_COLUMN_STATISTICS.get(dir0Path));
updatedTableColumnStatistics.put(dir1Path, TABLE_COLUMN_STATISTICS.get(dir1Path));
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(SCHEMA).location(tablePath).columnsStatistics(updatedTableColumnStatistics).metadataStatistics(Arrays.asList(new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ROW_GROUP, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).interestingColumns(Collections.singletonList(orderStatusPath)).build();
try {
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` columns(o_orderstatus) REFRESH METADATA 'row_group' LEVEL", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
assertEquals(expectedTableMetadata, actualTableMetadata);
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
}
}
use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
the class MetastoreFileTableMetadataProvider method getNonInterestingColumnsMetadata.
@Override
public NonInterestingColumnsMetadata getNonInterestingColumnsMetadata() {
throwIfChanged();
if (nonInterestingColumnsMetadata == null) {
TupleMetadata schema = getTableMetadata().getSchema();
List<StatisticsHolder<?>> statistics = Collections.singletonList(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
List<SchemaPath> columnPaths = SchemaUtil.getSchemaPaths(schema);
List<SchemaPath> interestingColumns = getInterestingColumns(columnPaths);
// populates statistics for non-interesting columns and columns for which statistics wasn't collected
Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = columnPaths.stream().filter(schemaPath -> !interestingColumns.contains(schemaPath) || SchemaPathUtils.getColumnMetadata(schemaPath, schema).isArray()).collect(Collectors.toMap(Function.identity(), schemaPath -> new ColumnStatistics<>(statistics, SchemaPathUtils.getColumnMetadata(schemaPath, schema).type())));
nonInterestingColumnsMetadata = new NonInterestingColumnsMetadata(columnsStatistics);
}
return nonInterestingColumnsMetadata;
}
Aggregations