Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.
Class TestMetastoreCommands, method testAnalyzeWithNoColumns.
/**
 * Runs {@code ANALYZE TABLE ... columns NONE REFRESH METADATA 'row_group' LEVEL} on a copy of
 * the {@code multilevel/parquet} table and compares the table metadata read back from the
 * Metastore with an expected instance whose column statistics hold only the {@code dir0} and
 * {@code dir1} entries and whose interesting-columns list is empty.
 */
@Test
public void testAnalyzeWithNoColumns() throws Exception {
  String tableName = "multilevel/parquetNoColumns";
  File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
  Path tableLocation = new Path(tableDir.toURI().getPath());
  TableInfo tableInfo = getTableInfo(tableName, "tmp");

  // Expected column statistics: only the dir0 / dir1 entries taken from TABLE_COLUMN_STATISTICS.
  Map<SchemaPath, ColumnStatistics<?>> partitionColumnStatistics = new HashMap<>();
  for (String partitionColumn : Arrays.asList("dir0", "dir1")) {
    SchemaPath columnPath = SchemaPath.getSimplePath(partitionColumn);
    partitionColumnStatistics.put(columnPath, TABLE_COLUMN_STATISTICS.get(columnPath));
  }

  BaseTableMetadata expected = BaseTableMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(TABLE_META_INFO)
      .schema(SCHEMA)
      .location(tableLocation)
      .columnsStatistics(partitionColumnStatistics)
      .metadataStatistics(Arrays.asList(
          new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT),
          new StatisticsHolder<>(MetadataType.ROW_GROUP, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
      .partitionKeys(Collections.emptyMap())
      .lastModifiedTime(getMaxLastModified(tableDir))
      .interestingColumns(Collections.emptyList())
      .build();

  try {
    testBuilder()
        .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` columns NONE REFRESH METADATA 'row_group' LEVEL", tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
        .go();

    BaseTableMetadata actual = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .tableMetadata(tableInfo);

    assertEquals(expected, actual);
  } finally {
    // Always drop the collected metadata, even when an assertion above fails.
    run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
  }
}
Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.
Class TestMetastoreWithEasyFormatPlugin, method testAnalyzeOnTextTable.
/**
 * Runs {@code analyze table ... refresh metadata} on the {@code multilevel/csv} text table
 * (schema passed through the table function) and verifies the metadata read back from the
 * Metastore:
 * <ul>
 *   <li>the table metadata equals the expected base table metadata;</li>
 *   <li>the {@code 1994} top-level segment equals the expected segment, and the locations of
 *       all top-level segments and of their files match;</li>
 *   <li>there are 12 nested ({@code dir1}) segments and the {@code 1994/Q1} one equals the
 *       expected segment;</li>
 *   <li>there are 12 file metadata entries and the one for
 *       {@code 1994/Q1/orders_94_q1.csv} equals the expected file metadata.</li>
 * </ul>
 */
@Test
public void testAnalyzeOnTextTable() throws Exception {
  String tableName = "multilevel/csv";
  TableInfo tableInfo = getTableInfo(tableName, "default", "csv");
  File table = dirTestWatcher.copyResourceToRoot(Paths.get(tableName));
  Path tablePath = new Path(table.toURI().getPath());
  BaseTableMetadata expectedTableMetadata = getBaseTableMetadata(tableInfo, table, SCHEMA);

  // Table info used for the expected segment / file metadata (built without a table type).
  TableInfo baseTableInfo = TableInfo.builder()
      .name(tableName)
      .storagePlugin("dfs")
      .workspace("default")
      .build();

  // Expected statistics of the 1994 segment: the shared constants plus an o_comment entry.
  Map<SchemaPath, ColumnStatistics<?>> dir0CSVStats = new HashMap<>(DIR0_1994_SEGMENT_COLUMN_STATISTICS);
  dir0CSVStats.put(SchemaPath.getSimplePath("o_comment"),
      getColumnStatistics(" accounts nag slyly. ironic", "yly final requests over the furiously regula", 40L, TypeProtos.MinorType.VARCHAR));

  SegmentMetadata dir0 = SegmentMetadata.builder()
      .tableInfo(baseTableInfo)
      .metadataInfo(MetadataInfo.builder()
          .type(MetadataType.SEGMENT)
          .identifier("1994")
          .key("1994")
          .build())
      .path(new Path(tablePath, "1994"))
      .schema(SCHEMA)
      .lastModifiedTime(getMaxLastModified(new File(table, "1994")))
      .column(SchemaPath.getSimplePath("dir0"))
      .columnsStatistics(dir0CSVStats)
      .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(40L, TableStatisticsKind.ROW_COUNT)))
      .locations(ImmutableSet.of(
          new Path(tablePath, "1994/Q1/orders_94_q1.csv"),
          new Path(tablePath, "1994/Q2/orders_94_q2.csv"),
          new Path(tablePath, "1994/Q3/orders_94_q3.csv"),
          new Path(tablePath, "1994/Q4/orders_94_q4.csv")))
      .partitionValues(Collections.singletonList("1994"))
      .build();

  Set<Path> expectedTopLevelSegmentLocations = ImmutableSet.of(
      new Path(tablePath, "1994"),
      new Path(tablePath, "1995"),
      new Path(tablePath, "1996"));

  // One set of file locations per top-level segment.
  Set<Set<Path>> expectedSegmentFilesLocations = new HashSet<>();
  Set<Path> segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1994/Q2/orders_94_q2.csv"),
      new Path(tablePath, "1994/Q4/orders_94_q4.csv"),
      new Path(tablePath, "1994/Q1/orders_94_q1.csv"),
      new Path(tablePath, "1994/Q3/orders_94_q3.csv"));
  expectedSegmentFilesLocations.add(segmentFiles);
  segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1995/Q2/orders_95_q2.csv"),
      new Path(tablePath, "1995/Q4/orders_95_q4.csv"),
      new Path(tablePath, "1995/Q1/orders_95_q1.csv"),
      new Path(tablePath, "1995/Q3/orders_95_q3.csv"));
  expectedSegmentFilesLocations.add(segmentFiles);
  segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1996/Q3/orders_96_q3.csv"),
      new Path(tablePath, "1996/Q2/orders_96_q2.csv"),
      new Path(tablePath, "1996/Q4/orders_96_q4.csv"),
      new Path(tablePath, "1996/Q1/orders_96_q1.csv"));
  expectedSegmentFilesLocations.add(segmentFiles);

  long dir0q1lastModified = new File(new File(new File(table, "1994"), "Q1"), "orders_94_q1.csv").lastModified();
  FileMetadata dir01994q1File = FileMetadata.builder()
      .tableInfo(baseTableInfo)
      .metadataInfo(MetadataInfo.builder()
          .type(MetadataType.FILE)
          .identifier("1994/Q1/orders_94_q1.csv")
          .key("1994")
          .build())
      .schema(SCHEMA)
      .lastModifiedTime(dir0q1lastModified)
      .columnsStatistics(DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS)
      .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
      .path(new Path(tablePath, "1994/Q1/orders_94_q1.csv"))
      .build();

  try {
    testBuilder()
        .sqlQuery("analyze table table(dfs.`%s`(schema=>%s)) refresh metadata", tableName, SCHEMA_STRING)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.default.%s]", tableName))
        .go();

    BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .tableMetadata(tableInfo);
    assertEquals(expectedTableMetadata, actualTableMetadata);

    List<SegmentMetadata> topSegmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .segmentsMetadataByColumn(tableInfo, null, "`dir0`");

    SegmentMetadata actualDir0Metadata = topSegmentMetadata.stream()
        .filter(unit -> unit.getMetadataInfo().identifier().equals("1994"))
        .findAny()
        .orElseThrow(() -> new AssertionError("Segment is absent"));

    // The segment is compared as stored. An earlier statement built a copy via toBuilder()
    // and discarded it, which had no effect on this assertion, so it was removed.
    assertEquals(dir0, actualDir0Metadata);

    // verify top segments locations
    Set<Path> topLevelSegmentLocations = topSegmentMetadata.stream()
        .map(SegmentMetadata::getLocation)
        .collect(Collectors.toSet());
    assertEquals(expectedTopLevelSegmentLocations, topLevelSegmentLocations);

    Set<Set<Path>> segmentFilesLocations = topSegmentMetadata.stream()
        .map(SegmentMetadata::getLocations)
        .collect(Collectors.toSet());
    assertEquals(expectedSegmentFilesLocations, segmentFilesLocations);

    // verify nested segments
    List<SegmentMetadata> nestedSegmentMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .segmentsMetadataByColumn(tableInfo, null, "`dir1`");
    assertEquals(12, nestedSegmentMetadata.size());

    SegmentMetadata dir01994q1Segment = SegmentMetadata.builder()
        .tableInfo(baseTableInfo)
        .metadataInfo(MetadataInfo.builder()
            .type(MetadataType.SEGMENT)
            .identifier("1994/Q1")
            .key("1994")
            .build())
        .path(new Path(new Path(tablePath, "1994"), "Q1"))
        .schema(SCHEMA)
        .lastModifiedTime(getMaxLastModified(new File(new File(table, "1994"), "Q1")))
        .column(SchemaPath.getSimplePath("dir1"))
        .columnsStatistics(DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS)
        .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
        .locations(ImmutableSet.of(new Path(tablePath, "1994/Q1/orders_94_q1.csv")))
        .partitionValues(Collections.singletonList("Q1"))
        .build();

    // verify segment for 1994
    assertEquals(dir01994q1Segment,
        nestedSegmentMetadata.stream()
            .filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1"))
            .findAny()
            .orElse(null));

    // verify files metadata
    List<FileMetadata> filesMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .filesMetadata(tableInfo, null, null);
    assertEquals(12, filesMetadata.size());

    // verify first file metadata
    assertEquals(dir01994q1File,
        filesMetadata.stream()
            .filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1/orders_94_q1.csv"))
            .findAny()
            .orElse(null));
  } finally {
    // Always drop the collected metadata, even when an assertion above fails.
    run("analyze table dfs.`%s` drop metadata if exists", tableName);
  }
}
Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.
Class TestMetastoreWithEasyFormatPlugin, method testIncrementalAnalyzeNewFile.
/**
 * Verifies an incremental metadata refresh of a CSV table after one file is added.
 * <p>
 * The table is analyzed once (15 segments and 12 files are expected), a copy of
 * {@code orders_94_q4.csv} is placed next to the original as {@code orders_94_q4_1.csv},
 * and the table is analyzed again. After the second run the table metadata must equal the
 * expected metadata (row count 130), the segment count must still be 15 and the file count
 * must be 13.
 */
@Test
public void testIncrementalAnalyzeNewFile() throws Exception {
  String tableName = "multilevel/csvNewFile";
  File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/csv"), Paths.get(tableName));
  Path tableLocation = new Path(tableDir.toURI().getPath());
  TableInfo tableInfo = getTableInfo(tableName, "tmp", "csv");

  // Expected per-column statistics after the new file is picked up: every entry of
  // TABLE_COLUMN_STATISTICS is cloned with ROW_COUNT and NON_NULL_VALUES_COUNT of 130.
  Map<SchemaPath, ColumnStatistics<?>> expectedColumnStatistics = new HashMap<>(TABLE_COLUMN_STATISTICS);
  expectedColumnStatistics.replaceAll((column, statistics) ->
      statistics.cloneWith(new ColumnStatistics<>(Arrays.asList(
          new StatisticsHolder<>(130L, TableStatisticsKind.ROW_COUNT),
          new StatisticsHolder<>(130L, ColumnStatisticsKind.NON_NULL_VALUES_COUNT)))));

  BaseTableMetadata expected = BaseTableMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(TABLE_META_INFO)
      .schema(SCHEMA)
      .location(tableLocation)
      .columnsStatistics(expectedColumnStatistics)
      .metadataStatistics(Arrays.asList(
          new StatisticsHolder<>(130L, TableStatisticsKind.ROW_COUNT),
          new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
      .partitionKeys(Collections.emptyMap())
      .lastModifiedTime(getMaxLastModified(tableDir))
      .build();

  // The same statement and the same summary row are used for both analyze runs.
  String refreshQuery = "ANALYZE TABLE table(dfs.tmp.`%s` (schema=>%s)) REFRESH METADATA";
  String expectedSummary = String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName);

  try {
    // First run: collect metadata for the table as copied.
    testBuilder()
        .sqlQuery(refreshQuery, tableName, SCHEMA_STRING)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, expectedSummary)
        .go();

    List<SegmentMetadata> segments = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .segmentsMetadataByMetadataKey(tableInfo, null, null);
    assertEquals(15, segments.size());

    List<FileMetadata> files = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .filesMetadata(tableInfo, null, null);
    assertEquals(12, files.size());

    // Add one more file to the existing 1994/Q4 directory.
    dirTestWatcher.copyResourceToTestTmp(
        Paths.get("multilevel", "csv", "1994", "Q4", "orders_94_q4.csv"),
        Paths.get(tableName, "1994", "Q4", "orders_94_q4_1.csv"));

    // Second run: metadata is refreshed after the new file appeared.
    testBuilder()
        .sqlQuery(refreshQuery, tableName, SCHEMA_STRING)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, expectedSummary)
        .go();

    BaseTableMetadata actual = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .tableMetadata(tableInfo);
    assertEquals(expected, actual);

    segments = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .segmentsMetadataByMetadataKey(tableInfo, null, null);
    // verifies that segments count left unchanged
    assertEquals(15, segments.size());

    files = cluster.drillbit().getContext().getMetastoreRegistry().get()
        .tables()
        .basicRequests()
        .filesMetadata(tableInfo, null, null);
    assertEquals(13, files.size());
  } finally {
    // Drop the collected metadata and remove the copied table directory.
    run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
    FileUtils.deleteQuietly(tableDir);
  }
}
Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.
Class StatisticsProvider, method visitFunctionHolderExpression.
/**
 * Produces column statistics for a function call expression.
 * <p>
 * Statistics are produced only when the function holder is a {@link DrillSimpleFuncHolder}
 * whose first registered name is a cast function and the statistics obtained by visiting the
 * first argument are not null or empty; in that case the result of {@code evalCastFunc} is
 * returned.
 *
 * @param holderExpr function call expression to visit
 * @param value      visitor argument, not used by this method
 * @return statistics computed by {@code evalCastFunc}, or {@code null} in every other case
 */
@Override
@SuppressWarnings("unchecked")
public ColumnStatistics<?> visitFunctionHolderExpression(FunctionHolderExpression holderExpr, Void value) {
  FuncHolder holder = holderExpr.getHolder();
  // Only Drill function is allowed.
  if (!(holder instanceof DrillSimpleFuncHolder)) {
    return null;
  }

  DrillSimpleFuncHolder drillHolder = (DrillSimpleFuncHolder) holder;
  String registeredName = drillHolder.getRegisteredNames()[0];
  if (!FunctionReplacementUtils.isCastFunction(registeredName)) {
    return null;
  }

  // Statistics of the cast's operand come from visiting the first argument.
  ColumnStatistics<T> argumentStatistics = (ColumnStatistics<T>) holderExpr.args.get(0).accept(this, null);
  if (IsPredicate.isNullOrEmpty(argumentStatistics)) {
    return null;
  }
  return evalCastFunc(holderExpr, argumentStatistics);
}
Use of org.apache.drill.metastore.statistics.ColumnStatistics in the Apache Drill project.
Class TableMetadataUtils, method mergeColumnsStatistics.
/**
 * Merges the column statistics of the given metadata into a single {@link ColumnStatistics}
 * per column.
 * <p>
 * For every requested column the statistics of each metadata are gathered; a metadata that has
 * no statistics for the column contributes a placeholder holding only
 * {@link ColumnStatisticsKind#NULLS_COUNT} equal to that metadata's row count. Each kind from
 * {@code statisticsToCollect} is then merged over the gathered list. The comparator type of the
 * result is taken from the first gathered statistics, or is {@code INT} when
 * {@code metadataList} is empty.
 *
 * @param <T>                 type of metadata to collect
 * @param metadataList        metadata whose column statistics are merged
 * @param columns             columns whose statistics should be merged
 * @param statisticsToCollect kinds of statistics that should be collected
 * @return map from each column in {@code columns} to its merged statistics
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static <T extends BaseMetadata> Map<SchemaPath, ColumnStatistics<?>> mergeColumnsStatistics(Collection<T> metadataList, Set<SchemaPath> columns, List<CollectableColumnStatisticsKind<?>> statisticsToCollect) {
  Map<SchemaPath, ColumnStatistics<?>> mergedByColumn = new HashMap<>();

  for (SchemaPath column : columns) {
    // Step 1: gather this column's statistics from every metadata.
    List<ColumnStatistics<?>> gathered = new ArrayList<>();
    for (T metadata : metadataList) {
      ColumnStatistics<?> existing = metadata.getColumnsStatistics().get(column);
      if (existing != null) {
        gathered.add(existing);
      } else {
        // schema change happened, set statistics which represents all nulls
        gathered.add(new ColumnStatistics(Collections.singletonList(new StatisticsHolder<>(TableStatisticsKind.ROW_COUNT.getValue(metadata), ColumnStatisticsKind.NULLS_COUNT))));
      }
    }

    // Step 2: merge every requested kind over the gathered statistics.
    List<StatisticsHolder<?>> mergedHolders = new ArrayList<>();
    for (CollectableColumnStatisticsKind<?> kind : statisticsToCollect) {
      Object mergedValue = kind.mergeStatistics(gathered);
      mergedHolders.add(new StatisticsHolder<>(mergedValue, kind));
    }

    // Step 3: comparator type comes from the first gathered entry.
    // Use INT if statistics wasn't provided
    TypeProtos.MinorType comparatorType = gathered.isEmpty()
        ? TypeProtos.MinorType.INT
        : gathered.get(0).getComparatorType();

    mergedByColumn.put(column, new ColumnStatistics<>(mergedHolders, comparatorType));
  }

  return mergedByColumn;
}
Aggregations