Use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
From the class TableMetadataUtils, method mergeColumnsStatistics.
/**
* Merges the specified list of metadata into a map of {@link ColumnStatistics} keyed by column.
*
* @param <T> type of metadata to collect
* @param metadataList list of metadata to be merged
* @param columns set of columns whose statistics should be merged
* @param statisticsToCollect kinds of statistics that should be collected
* @return map of merged {@link ColumnStatistics} keyed by column
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public static <T extends BaseMetadata> Map<SchemaPath, ColumnStatistics<?>> mergeColumnsStatistics(
    Collection<T> metadataList, Set<SchemaPath> columns, List<CollectableColumnStatisticsKind<?>> statisticsToCollect) {
  Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
  for (SchemaPath column : columns) {
    List<ColumnStatistics<?>> statisticsList = new ArrayList<>();
    for (T metadata : metadataList) {
      ColumnStatistics<?> statistics = metadata.getColumnsStatistics().get(column);
      if (statistics == null) {
        // a schema change happened, so substitute statistics that represent all nulls
        statistics = new ColumnStatistics(Collections.singletonList(
            new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.getValue(metadata), ColumnStatisticsKind.NULLS_COUNT)));
      }
      statisticsList.add(statistics);
    }
    List<StatisticsHolder<?>> statisticsHolders = new ArrayList<>();
    for (CollectableColumnStatisticsKind<?> statisticsKind : statisticsToCollect) {
      Object mergedStatistic = statisticsKind.mergeStatistics(statisticsList);
      statisticsHolders.add(new StatisticsHolder<>(mergedStatistic, statisticsKind));
    }
    Iterator<ColumnStatistics<?>> iterator = statisticsList.iterator();
    // use INT as the comparator type if no statistics were provided
    TypeProtos.MinorType comparatorType = iterator.hasNext() ? iterator.next().getComparatorType() : TypeProtos.MinorType.INT;
    columnsStatistics.put(column, new ColumnStatistics<>(statisticsHolders, comparatorType));
  }
  return columnsStatistics;
}
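A minimal caller sketch (not part of the Drill sources) may make the contract clearer: merge NULLS_COUNT, MIN_VALUE and MAX_VALUE for one column across a collection of file metadata. The variable filesMetadata is an assumed Collection<FileMetadata>, and the sketch assumes the ColumnStatisticsKind constants implement CollectableColumnStatisticsKind, as their use with mergeStatistics in this listing suggests.
// Hypothetical usage sketch; filesMetadata is assumed to be a Collection<FileMetadata> obtained elsewhere.
List<CollectableColumnStatisticsKind<?>> kinds = Arrays.asList(
    ColumnStatisticsKind.NULLS_COUNT,
    ColumnStatisticsKind.MIN_VALUE,
    ColumnStatisticsKind.MAX_VALUE);
Set<SchemaPath> columns = Collections.singleton(SchemaPath.getSimplePath("o_orderkey"));
Map<SchemaPath, ColumnStatistics<?>> merged =
    TableMetadataUtils.mergeColumnsStatistics(filesMetadata, columns, kinds);
// Each map value now carries one StatisticsHolder per requested kind for that column.
ColumnStatistics<?> orderKeyStats = merged.get(SchemaPath.getSimplePath("o_orderkey"));
Files that lack the column contribute an all-nulls placeholder, so the merged NULLS_COUNT stays consistent after a schema change.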
Use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
From the class TestMetastoreWithEasyFormatPlugin, method testAnalyzeOnJsonTable.
@Test
public void testAnalyzeOnJsonTable() throws Exception {
String tableName = "multilevel/json";
TableInfo tableInfo = getTableInfo(tableName, "default", "json");
File table = dirTestWatcher.copyResourceToRoot(Paths.get(tableName));
Path tablePath = new Path(table.toURI().getPath());
TupleMetadata schema = new SchemaBuilder()
    .addNullable("dir0", TypeProtos.MinorType.VARCHAR)
    .addNullable("dir1", TypeProtos.MinorType.VARCHAR)
    .addNullable("o_orderkey", TypeProtos.MinorType.BIGINT)
    .addNullable("o_custkey", TypeProtos.MinorType.BIGINT)
    .addNullable("o_orderstatus", TypeProtos.MinorType.VARCHAR)
    .addNullable("o_totalprice", TypeProtos.MinorType.FLOAT8)
    .addNullable("o_orderdate", TypeProtos.MinorType.VARCHAR)
    .addNullable("o_orderpriority", TypeProtos.MinorType.VARCHAR)
    .addNullable("o_clerk", TypeProtos.MinorType.VARCHAR)
    .addNullable("o_shippriority", TypeProtos.MinorType.BIGINT)
    .addNullable("o_comment", TypeProtos.MinorType.VARCHAR)
    .build();
Map<SchemaPath, ColumnStatistics<?>> tableColumnStatistics = new HashMap<>(TABLE_COLUMN_STATISTICS);
tableColumnStatistics.put(SchemaPath.getSimplePath("o_custkey"), getColumnStatistics(25L, 1498L, 120L, TypeProtos.MinorType.BIGINT));
tableColumnStatistics.put(SchemaPath.getSimplePath("o_orderdate"), getColumnStatistics("1994-01-01T00:00:00.000-08:00", "1996-12-19T00:00:00.000-08:00", 120L, TypeProtos.MinorType.VARCHAR));
tableColumnStatistics.put(SchemaPath.getSimplePath("o_orderkey"), getColumnStatistics(1L, 1319L, 120L, TypeProtos.MinorType.BIGINT));
tableColumnStatistics.put(SchemaPath.getSimplePath("o_shippriority"), getColumnStatistics(0L, 0L, 120L, TypeProtos.MinorType.BIGINT));
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
    .tableInfo(tableInfo)
    .metadataInfo(TABLE_META_INFO)
    .schema(schema)
    .location(new Path(table.toURI().getPath()))
    .columnsStatistics(tableColumnStatistics)
    .metadataStatistics(Arrays.asList(
        new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT),
        new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
    .partitionKeys(Collections.emptyMap())
    .lastModifiedTime(getMaxLastModified(table))
    .build();
TableInfo baseTableInfo = TableInfo.builder().name(tableName).storagePlugin("dfs").workspace("default").build();
Map<SchemaPath, ColumnStatistics<?>> dir0CSVStats = new HashMap<>(DIR0_1994_SEGMENT_COLUMN_STATISTICS);
dir0CSVStats.put(SchemaPath.getSimplePath("o_custkey"), getColumnStatistics(25L, 1469L, 40L, TypeProtos.MinorType.BIGINT));
dir0CSVStats.put(SchemaPath.getSimplePath("o_orderdate"), getColumnStatistics("1994-01-01T00:00:00.000-08:00", "1994-12-23T00:00:00.000-08:00", 40L, TypeProtos.MinorType.VARCHAR));
dir0CSVStats.put(SchemaPath.getSimplePath("o_orderkey"), getColumnStatistics(5L, 1031L, 40L, TypeProtos.MinorType.BIGINT));
dir0CSVStats.put(SchemaPath.getSimplePath("o_shippriority"), getColumnStatistics(0L, 0L, 40L, TypeProtos.MinorType.BIGINT));
SegmentMetadata dir0 = SegmentMetadata.builder()
    .tableInfo(baseTableInfo)
    .metadataInfo(MetadataInfo.builder().type(MetadataType.SEGMENT).identifier("1994").key("1994").build())
    .path(new Path(tablePath, "1994"))
    .schema(schema)
    .lastModifiedTime(getMaxLastModified(new File(table, "1994")))
    .column(SchemaPath.getSimplePath("dir0"))
    .columnsStatistics(dir0CSVStats)
    .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(40L, TableStatisticsKind.ROW_COUNT)))
    .locations(ImmutableSet.of(
        new Path(tablePath, "1994/Q1/orders_94_q1.json"),
        new Path(tablePath, "1994/Q2/orders_94_q2.json"),
        new Path(tablePath, "1994/Q3/orders_94_q3.json"),
        new Path(tablePath, "1994/Q4/orders_94_q4.json")))
    .partitionValues(Collections.singletonList("1994"))
    .build();
Set<Path> expectedTopLevelSegmentLocations = ImmutableSet.of(new Path(tablePath, "1994"), new Path(tablePath, "1995"), new Path(tablePath, "1996"));
Set<Set<Path>> expectedSegmentFilesLocations = new HashSet<>();
Set<Path> segmentFiles = ImmutableSet.of(new Path(tablePath, "1994/Q2/orders_94_q2.json"), new Path(tablePath, "1994/Q4/orders_94_q4.json"), new Path(tablePath, "1994/Q1/orders_94_q1.json"), new Path(tablePath, "1994/Q3/orders_94_q3.json"));
expectedSegmentFilesLocations.add(segmentFiles);
segmentFiles = ImmutableSet.of(new Path(tablePath, "1995/Q2/orders_95_q2.json"), new Path(tablePath, "1995/Q4/orders_95_q4.json"), new Path(tablePath, "1995/Q1/orders_95_q1.json"), new Path(tablePath, "1995/Q3/orders_95_q3.json"));
expectedSegmentFilesLocations.add(segmentFiles);
segmentFiles = ImmutableSet.of(new Path(tablePath, "1996/Q3/orders_96_q3.json"), new Path(tablePath, "1996/Q2/orders_96_q2.json"), new Path(tablePath, "1996/Q4/orders_96_q4.json"), new Path(tablePath, "1996/Q1/orders_96_q1.json"));
expectedSegmentFilesLocations.add(segmentFiles);
Map<SchemaPath, ColumnStatistics<?>> dir0q1Stats = new HashMap<>(DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS);
dir0q1Stats.put(SchemaPath.getSimplePath("o_custkey"), getColumnStatistics(392L, 1411L, 10L, TypeProtos.MinorType.BIGINT));
dir0q1Stats.put(SchemaPath.getSimplePath("o_orderdate"), getColumnStatistics("1994-01-01T00:00:00.000-08:00", "1994-03-26T00:00:00.000-08:00", 10L, TypeProtos.MinorType.VARCHAR));
dir0q1Stats.put(SchemaPath.getSimplePath("o_orderkey"), getColumnStatistics(66L, 833L, 10L, TypeProtos.MinorType.BIGINT));
dir0q1Stats.put(SchemaPath.getSimplePath("o_shippriority"), getColumnStatistics(0L, 0L, 10L, TypeProtos.MinorType.BIGINT));
long dir0q1lastModified = new File(new File(new File(table, "1994"), "Q1"), "orders_94_q1.json").lastModified();
FileMetadata dir01994q1File = FileMetadata.builder()
    .tableInfo(baseTableInfo)
    .metadataInfo(MetadataInfo.builder().type(MetadataType.FILE).identifier("1994/Q1/orders_94_q1.json").key("1994").build())
    .schema(schema)
    .lastModifiedTime(dir0q1lastModified)
    .columnsStatistics(dir0q1Stats)
    .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
    .path(new Path(tablePath, "1994/Q1/orders_94_q1.json"))
    .build();
try {
testBuilder()
    .sqlQuery("analyze table table(dfs.`%s`(schema=>%s)) refresh metadata", tableName, SCHEMA_STRING)
    .unOrdered()
    .baselineColumns("ok", "summary")
    .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.default.%s]", tableName))
    .go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
assertEquals(expectedTableMetadata, actualTableMetadata);
List<SegmentMetadata> topSegmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByColumn(tableInfo, null, "`dir0`");
SegmentMetadata actualDir0Metadata = topSegmentMetadata.stream().filter(unit -> unit.getMetadataInfo().identifier().equals("1994")).findAny().orElseThrow(() -> new AssertionError("Segment is absent"));
Set<Path> locations = actualDir0Metadata.getLocations();
actualDir0Metadata.toBuilder().locations(locations);
assertEquals(dir0, actualDir0Metadata);
Set<Path> topLevelSegmentLocations = topSegmentMetadata.stream().map(SegmentMetadata::getLocation).collect(Collectors.toSet());
// verify top segments locations
assertEquals(expectedTopLevelSegmentLocations, topLevelSegmentLocations);
Set<Set<Path>> segmentFilesLocations = topSegmentMetadata.stream().map(SegmentMetadata::getLocations).collect(Collectors.toSet());
assertEquals(expectedSegmentFilesLocations, segmentFilesLocations);
// verify nested segments
List<SegmentMetadata> nestedSegmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByColumn(tableInfo, null, "`dir1`");
assertEquals(12, nestedSegmentMetadata.size());
SegmentMetadata dir01994q1Segment = SegmentMetadata.builder()
    .tableInfo(baseTableInfo)
    .metadataInfo(MetadataInfo.builder().type(MetadataType.SEGMENT).identifier("1994/Q1").key("1994").build())
    .path(new Path(new Path(tablePath, "1994"), "Q1"))
    .schema(schema)
    .lastModifiedTime(getMaxLastModified(new File(new File(table, "1994"), "Q1")))
    .column(SchemaPath.getSimplePath("dir1"))
    .columnsStatistics(dir0q1Stats)
    .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
    .locations(ImmutableSet.of(new Path(tablePath, "1994/Q1/orders_94_q1.json")))
    .partitionValues(Collections.singletonList("Q1"))
    .build();
// verify the nested segment for 1994/Q1
assertEquals(dir01994q1Segment, nestedSegmentMetadata.stream().filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1")).findAny().orElse(null));
// verify files metadata
List<FileMetadata> filesMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().filesMetadata(tableInfo, null, null);
assertEquals(12, filesMetadata.size());
// verify first file metadata
assertEquals(dir01994q1File, filesMetadata.stream().filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1/orders_94_q1.json")).findAny().orElse(null));
} finally {
run("analyze table dfs.`%s` drop metadata if exists", tableName);
}
}
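The per-column baselines above come from a getColumnStatistics test helper that this excerpt does not show. A hedged reconstruction of what such a helper plausibly assembles from StatisticsHolder instances follows; the real utility in the Drill test sources may attach additional kinds.
// Hypothetical reconstruction of the helper used above; not copied from the Drill sources.
private static <T> ColumnStatistics<T> getColumnStatistics(T minValue, T maxValue,
    long rowCount, TypeProtos.MinorType minorType) {
  return new ColumnStatistics<>(
      Arrays.asList(
          new StatisticsHolder<>(minValue, ColumnStatisticsKind.MIN_VALUE),
          new StatisticsHolder<>(maxValue, ColumnStatisticsKind.MAX_VALUE),
          new StatisticsHolder<>(rowCount, TableStatisticsKind.ROW_COUNT),
          // assumed: baseline columns contain no nulls
          new StatisticsHolder<>(0L, ColumnStatisticsKind.NULLS_COUNT)),
      minorType);
}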
Use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
From the class TestInfoSchemaWithMetastore, method testTableWithStats.
@Test
public void testTableWithStats() throws Exception {
ZonedDateTime currentTime = currentUtcTime();
String tableName = "table_with_stats";
BaseTableMetadata table = BaseTableMetadata.builder()
    .tableInfo(TableInfo.builder().storagePlugin("dfs").workspace("tmp").name(tableName).type("PARQUET").build())
    .metadataInfo(MetadataInfo.builder().type(MetadataType.TABLE).key(MetadataInfo.GENERAL_INFO_KEY).build())
    .location(new Path("/tmp", tableName))
    .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(100L, TableStatisticsKind.ROW_COUNT)))
    .columnsStatistics(Collections.emptyMap())
    .partitionKeys(Collections.emptyMap())
    .lastModifiedTime(currentTime.toInstant().toEpochMilli())
    .build();
metastore.tables().modify().overwrite(table.toMetadataUnit()).execute();
client.testBuilder()
    .sqlQuery("select %s from information_schema.`tables` where table_name = '%s'", String.join(", ", TABLES_COLUMNS), tableName)
    .unOrdered()
    .baselineColumns(TABLES_COLUMNS.toArray(new String[0]))
    .baselineValues("DRILL", "dfs.tmp", tableName, "TABLE", table.getTableInfo().type(),
        table.getLocation().toUri().toString(), 100L, currentTime.toLocalDateTime())
    .go();
}
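To read the stored figure back from the metadata itself (rather than through INFORMATION_SCHEMA), the statistics kind doubles as the accessor, the same call mergeColumnsStatistics uses above. A short hedged sketch reusing the table object built in this test:
// Hedged sketch: read the row count written via StatisticsHolder back from the metadata.
// Assumes TableStatisticsKind.ROW_COUNT.getValue(..) returns the stored Long, as its use
// in mergeColumnsStatistics above suggests.
long rowCount = TableStatisticsKind.ROW_COUNT.getValue(table);
assertEquals(100L, rowCount);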
Use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
From the class TestMetastoreCommands, method testIncrementalAnalyzeUpdatedFile.
@Test
public void testIncrementalAnalyzeUpdatedFile() throws Exception {
String tableName = "multilevel/parquetUpdatedFile";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
TableInfo tableInfo = getTableInfo(tableName, "tmp");
try {
testBuilder()
    .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
    .unOrdered()
    .baselineColumns("ok", "summary")
    .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
    .go();
List<SegmentMetadata> segmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
assertEquals(15, segmentMetadata.size());
List<FileMetadata> filesMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().filesMetadata(tableInfo, null, null);
assertEquals(12, filesMetadata.size());
List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, null, (String) null);
assertEquals(12, rowGroupsMetadata.size());
File fileToUpdate = new File(new File(new File(table, "1994"), "Q4"), "orders_94_q4.parquet");
long lastModified = fileToUpdate.lastModified();
FileUtils.deleteQuietly(fileToUpdate);
// replaces original file
dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel", "parquet", "1994", "Q1", "orders_94_q1.parquet"), Paths.get(tableName, "1994", "Q4", "orders_94_q4.parquet"));
long newLastModified = lastModified + 1000;
assertTrue(fileToUpdate.setLastModified(newLastModified));
testBuilder()
    .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
    .unOrdered()
    .baselineColumns("ok", "summary")
    .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
    .go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
Map<SchemaPath, ColumnStatistics<?>> tableColumnStatistics = new HashMap<>(TABLE_COLUMN_STATISTICS);
tableColumnStatistics.computeIfPresent(SchemaPath.getSimplePath("o_clerk"),
    (logicalExpressions, columnStatistics) -> columnStatistics.cloneWith(new ColumnStatistics<>(
        Collections.singletonList(new StatisticsHolder<>("Clerk#000000006", ColumnStatisticsKind.MIN_VALUE)))));
tableColumnStatistics.computeIfPresent(SchemaPath.getSimplePath("o_totalprice"),
    (logicalExpressions, columnStatistics) -> columnStatistics.cloneWith(new ColumnStatistics<>(
        Collections.singletonList(new StatisticsHolder<>(328207.15, ColumnStatisticsKind.MAX_VALUE)))));
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
    .tableInfo(tableInfo)
    .metadataInfo(TABLE_META_INFO)
    .schema(SCHEMA)
    .location(new Path(table.toURI().getPath()))
    .columnsStatistics(tableColumnStatistics)
    .metadataStatistics(Arrays.asList(
        new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT),
        new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
    .partitionKeys(Collections.emptyMap())
    .lastModifiedTime(newLastModified)
    .build();
assertEquals(expectedTableMetadata, actualTableMetadata);
segmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
assertEquals(15, segmentMetadata.size());
filesMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().filesMetadata(tableInfo, null, null);
assertEquals(12, filesMetadata.size());
rowGroupsMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, null, (String) null);
assertEquals(12, rowGroupsMetadata.size());
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
FileUtils.deleteQuietly(table);
}
}
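The computeIfPresent calls in this test lean on ColumnStatistics.cloneWith to overlay one refreshed statistic on an existing entry. A stripped-down sketch of that pattern, with existing standing in for a ColumnStatistics<?> already stored for o_totalprice:
// Hedged sketch of the cloneWith overlay used above; 'existing' is an assumed variable.
ColumnStatistics<?> refreshed = existing.cloneWith(new ColumnStatistics<>(
    Collections.singletonList(new StatisticsHolder<>(328207.15, ColumnStatisticsKind.MAX_VALUE))));
// Only MAX_VALUE is replaced; the other kinds carried by 'existing' (min, nulls count, etc.)
// are expected to be preserved, which is why the test overrides just the changed statistic.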
Use of org.apache.drill.metastore.statistics.StatisticsHolder in project drill by apache.
From the class TestMetastoreCommands, method testIncrementalAnalyzeWithFewerColumns.
@Test
public void testIncrementalAnalyzeWithFewerColumns() throws Exception {
String tableName = "multilevel/parquetFewerColumns";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
Path tablePath = new Path(table.toURI().getPath());
TableInfo tableInfo = getTableInfo(tableName, "tmp");
Map<SchemaPath, ColumnStatistics<?>> updatedTableColumnStatistics = new HashMap<>();
SchemaPath orderStatusPath = SchemaPath.getSimplePath("o_orderstatus");
SchemaPath orderDatePath = SchemaPath.getSimplePath("o_orderdate");
SchemaPath dir0Path = SchemaPath.getSimplePath("dir0");
SchemaPath dir1Path = SchemaPath.getSimplePath("dir1");
updatedTableColumnStatistics.put(orderStatusPath, TABLE_COLUMN_STATISTICS.get(orderStatusPath));
updatedTableColumnStatistics.put(orderDatePath, TABLE_COLUMN_STATISTICS.get(orderDatePath));
updatedTableColumnStatistics.put(dir0Path, TABLE_COLUMN_STATISTICS.get(dir0Path));
updatedTableColumnStatistics.put(dir1Path, TABLE_COLUMN_STATISTICS.get(dir1Path));
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
    .tableInfo(tableInfo)
    .metadataInfo(TABLE_META_INFO)
    .schema(SCHEMA)
    .location(tablePath)
    .columnsStatistics(updatedTableColumnStatistics)
    .metadataStatistics(Arrays.asList(
        new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT),
        new StatisticsHolder<>(MetadataType.ROW_GROUP, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
    .partitionKeys(Collections.emptyMap())
    .lastModifiedTime(getMaxLastModified(table))
    .interestingColumns(Arrays.asList(orderStatusPath, orderDatePath))
    .build();
try {
testBuilder()
    .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` columns(o_orderstatus, o_orderdate) REFRESH METADATA 'row_group' LEVEL", tableName)
    .unOrdered()
    .baselineColumns("ok", "summary")
    .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
    .go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
assertEquals(expectedTableMetadata, actualTableMetadata);
// checks that the analyze is not re-run: the interesting columns list differs, but it is a subset of the columns analyzed previously
testBuilder()
    .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` columns(o_orderstatus) REFRESH METADATA 'row_group' LEVEL", tableName)
    .unOrdered()
    .baselineColumns("ok", "summary")
    .baselineValues(false, "Table metadata is up to date, analyze wasn't performed.")
    .go();
actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
assertEquals(expectedTableMetadata, actualTableMetadata);
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
}
}
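The second ANALYZE above is skipped because its column list is a subset of the interesting columns recorded by the first run. A hedged, illustrative sketch of that up-to-date check (the names below are illustrative, not Drill's actual implementation):
// Illustrative subset check mirroring what this test verifies; not Drill's real code path.
List<SchemaPath> storedInterestingColumns = Arrays.asList(orderStatusPath, orderDatePath);
List<SchemaPath> requestedColumns = Collections.singletonList(orderStatusPath);
boolean analyzeNeeded = !storedInterestingColumns.containsAll(requestedColumns);
// analyzeNeeded is false here, matching the "Table metadata is up to date" summary above.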