Example use of org.apache.drill.metastore.metadata.FileMetadata in the Apache Drill project.
From class TestMetastoreCommands, method testIncrementalAnalyzeRemovedParentSegment.
/**
 * Verifies that an incremental metadata refresh drops the metadata that belonged to a
 * top-level directory removed between two {@code ANALYZE TABLE ... REFRESH METADATA} runs.
 *
 * <p>Flow: copy the {@code multilevel/parquet} resource, add an extra {@code 1993} directory
 * (a copy of {@code 1994}), analyze, check the metadata counts, delete {@code 1993}, analyze
 * again and check that the table metadata equals the one computed before {@code 1993} was
 * added and that the segment / file / row group counts shrank accordingly.
 *
 * @throws Exception if copying resources or running any of the queries fails
 */
@Test
public void testIncrementalAnalyzeRemovedParentSegment() throws Exception {
  String tableName = "multilevel/parquetRemovedParent";
  File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
  TableInfo tableInfo = getTableInfo(tableName, "tmp");

  // Captured before the extra "1993" directory exists, so it describes the table
  // as it should look again once that directory has been removed.
  BaseTableMetadata expectedTableMetadata = getBaseTableMetadata(tableInfo, tableDir);

  String refreshQuery = "ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA";
  String refreshSummary = String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName);

  try {
    // Add the directory that will later be removed.
    dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet", "1994"), Paths.get(tableName, "1993"));

    testBuilder()
        .sqlQuery(refreshQuery, tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, refreshSummary)
        .go();

    // Metadata counts while the extra directory is present.
    List<SegmentMetadata> segmentsWithExtraDir = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .segmentsMetadataByMetadataKey(tableInfo, null, null);
    assertEquals(20, segmentsWithExtraDir.size());

    List<FileMetadata> filesWithExtraDir = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .filesMetadata(tableInfo, null, null);
    assertEquals(16, filesWithExtraDir.size());

    List<RowGroupMetadata> rowGroupsWithExtraDir = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .rowGroupsMetadata(tableInfo, null, (String) null);
    assertEquals(16, rowGroupsWithExtraDir.size());

    // Remove the whole parent directory and refresh again.
    FileUtils.deleteQuietly(new File(tableDir, "1993"));

    testBuilder()
        .sqlQuery(refreshQuery, tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, refreshSummary)
        .go();

    BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .tableMetadata(tableInfo);
    assertEquals(expectedTableMetadata, actualTableMetadata);

    // Metadata counts after the directory is gone.
    List<SegmentMetadata> segmentsAfterRemoval = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .segmentsMetadataByMetadataKey(tableInfo, null, null);
    assertEquals(15, segmentsAfterRemoval.size());

    List<FileMetadata> filesAfterRemoval = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .filesMetadata(tableInfo, null, null);
    assertEquals(12, filesAfterRemoval.size());

    List<RowGroupMetadata> rowGroupsAfterRemoval = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .rowGroupsMetadata(tableInfo, null, (String) null);
    assertEquals(12, rowGroupsAfterRemoval.size());
  } finally {
    // Always drop the collected metadata and the copied table directory.
    run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
    FileUtils.deleteQuietly(tableDir);
  }
}
Example use of org.apache.drill.metastore.metadata.FileMetadata in the Apache Drill project.
From class TestMetastoreCommands, method testAnalyzeNonEmptyTableWithEmptyFile.
/**
 * Verifies the metadata collected by {@code ANALYZE TABLE ... REFRESH METADATA} for the
 * {@code parquet/empty/simple} test resource.
 *
 * <p>The test expects the table-level metadata to report a row count of 1 with statistics
 * for the {@code id} and {@code name} columns, and expects two file entries and two row
 * group entries in the Metastore (presumably one of the two files is the empty one the
 * test name refers to; verify against the resource directory).
 *
 * @throws Exception if copying the resource or running any of the queries fails
 */
@Test
public void testAnalyzeNonEmptyTableWithEmptyFile() throws Exception {
  String tableName = "parquet_with_empty_file";
  File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("parquet", "empty", "simple"), Paths.get(tableName));
  TableInfo tableInfo = getTableInfo(tableName, "tmp");

  TupleMetadata schema = new SchemaBuilder()
      .addNullable("id", TypeProtos.MinorType.BIGINT)
      .addNullable("name", TypeProtos.MinorType.VARCHAR)
      .build();

  Map<SchemaPath, ColumnStatistics<?>> columnStatistics = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder()
      .put(SchemaPath.getSimplePath("name"), getColumnStatistics("Tom", "Tom", 1L, TypeProtos.MinorType.VARCHAR))
      .put(SchemaPath.getSimplePath("id"), getColumnStatistics(2L, 2L, 1L, TypeProtos.MinorType.BIGINT))
      .build();

  // Table-level metadata the Metastore is expected to hold after the analyze.
  BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(TABLE_META_INFO)
      .schema(schema)
      .location(new Path(tableDir.toURI().getPath()))
      .columnsStatistics(columnStatistics)
      .metadataStatistics(Arrays.asList(
          new StatisticsHolder<>(1L, TableStatisticsKind.ROW_COUNT),
          new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
      .partitionKeys(Collections.emptyMap())
      .lastModifiedTime(getMaxLastModified(tableDir))
      .build();

  String refreshQuery = "ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA";
  String refreshSummary = String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName);

  try {
    testBuilder()
        .sqlQuery(refreshQuery, tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, refreshSummary)
        .go();

    MetastoreTableInfo metastoreTableInfo = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .metastoreTableInfo(tableInfo);
    assertTrue("table metadata wasn't found", metastoreTableInfo.isExists());

    BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .tableMetadata(tableInfo);
    assertEquals(expectedTableMetadata, actualTableMetadata);

    List<FileMetadata> storedFiles = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .filesMetadata(tableInfo, null, null);
    assertEquals(2, storedFiles.size());

    List<RowGroupMetadata> storedRowGroups = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .rowGroupsMetadata(tableInfo, (String) null, null);
    assertEquals(2, storedRowGroups.size());
  } finally {
    // Drop the collected metadata regardless of the test outcome.
    run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
  }
}
Example use of org.apache.drill.metastore.metadata.FileMetadata in the Apache Drill project.
From class BaseParquetMetadataProvider, method getPartitionsMetadata.
/**
 * Returns the partition metadata of the table, building it on the first call and
 * returning the already built {@code partitions} list on subsequent calls.
 *
 * <p>When {@code collectMetadata} is set, files are grouped by partition column and
 * partition value and every group is converted with
 * {@code ParquetTableMetadataUtils.getPartitionMetadata}. Otherwise, for every partition
 * column the file paths are grouped by partition value and a {@link PartitionMetadata}
 * is assembled whose column statistics hold the partition value as both min and max
 * and {@code Statistic.NO_COLUMN_STATS} for nulls count and row count.
 *
 * @return list of partition metadata; never {@code null}
 */
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
  if (partitions != null) {
    return partitions;
  }

  partitions = new ArrayList<>();

  if (collectMetadata) {
    // (partition column, partition value) -> files that have that value for that column
    Table<SchemaPath, Object, List<FileMetadata>> filesByColumnAndValue = HashBasedTable.create();
    Collection<FileMetadata> allFiles = getFilesMetadataMap().values();
    partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();

    for (FileMetadata file : allFiles) {
      for (SchemaPath column : partitionColumns) {
        Object value = getParquetGroupScanStatistics().getPartitionValue(file.getPath(), column);
        if (value == null) {
          // Table cannot contain nulls
          value = NULL_VALUE;
        }
        List<FileMetadata> group = filesByColumnAndValue.get(column, value);
        if (group == null) {
          group = new ArrayList<>();
          filesByColumnAndValue.put(column, value, group);
        }
        group.add(file);
      }
    }

    for (SchemaPath column : filesByColumnAndValue.rowKeySet()) {
      for (List<FileMetadata> group : filesByColumnAndValue.row(column).values()) {
        partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(column, group));
      }
    }
    return partitions;
  }

  for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
    Map<Path, Object> valueByPath = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);

    // Invert the mapping: partition value -> all paths that carry it.
    Multimap<Object, Path> pathsByValue = HashMultimap.create();
    for (Map.Entry<Path, Object> pathAndValue : valueByPath.entrySet()) {
      pathsByValue.put(pathAndValue.getValue(), pathAndValue.getKey());
    }

    for (Map.Entry<Object, Collection<Path>> valueAndPaths : pathsByValue.asMap().entrySet()) {
      Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
      List<StatisticsHolder<?>> statistics = new ArrayList<>();

      // Translate the NULL_VALUE placeholder back to a real null before it is stored in statistics.
      Object partitionKey = valueAndPaths.getKey();
      if (partitionKey == NULL_VALUE) {
        partitionKey = null;
      }

      statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MIN_VALUE));
      statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MAX_VALUE));
      statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
      statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));

      columnsStatistics.put(
          partitionColumn,
          new ColumnStatistics<>(
              statistics,
              getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));

      MetadataInfo metadataInfo = MetadataInfo.builder()
          .type(MetadataType.PARTITION)
          .build();
      TableMetadata tableMetadata = getTableMetadata();

      // The same statistics list is used for both the column statistics and the metadata statistics.
      PartitionMetadata partitionMetadata = PartitionMetadata.builder()
          .tableInfo(tableMetadata.getTableInfo())
          .metadataInfo(metadataInfo)
          .column(partitionColumn)
          .schema(tableMetadata.getSchema())
          .columnsStatistics(columnsStatistics)
          .metadataStatistics(statistics)
          .partitionValues(Collections.emptyList())
          .locations(new HashSet<>(valueAndPaths.getValue()))
          .build();
      partitions.add(partitionMetadata);
    }
  }
  return partitions;
}
Aggregations