Search in sources :

Example 21 with FileMetadata

use of org.apache.drill.metastore.metadata.FileMetadata in project drill by apache.

In class TestMetastoreCommands, method testIncrementalAnalyzeRemovedParentSegment.

/**
 * Checks incremental ANALYZE after a top-level directory of the table is removed.
 *
 * <p>The expected table metadata is captured right after the table is copied. An extra
 * "1993" directory (a copy of the "1994" resource) is then added and the table is analyzed;
 * the metastore must report 20 segments, 16 files and 16 row groups. After "1993" is deleted
 * and the table is analyzed again, the table metadata must equal the captured one and the
 * metastore must report 15 segments, 12 files and 12 row groups.
 *
 * <p>Metastore metadata and the copied table directory are removed in {@code finally}.
 */
@Test
public void testIncrementalAnalyzeRemovedParentSegment() throws Exception {
    String tableName = "multilevel/parquetRemovedParent";
    File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
    TableInfo tableInfo = getTableInfo(tableName, "tmp");
    // Snapshot taken before the extra directory is added; compared after it is removed again.
    BaseTableMetadata expectedTableMetadata = getBaseTableMetadata(tableInfo, tableDir);

    String analyzeQuery = "ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA";
    String expectedSummary = String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName);

    try {
        dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet", "1994"), Paths.get(tableName, "1993"));

        testBuilder()
            .sqlQuery(analyzeQuery, tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, expectedSummary)
            .go();

        // State with the additional "1993" directory present.
        List<SegmentMetadata> segmentsBefore = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
        assertEquals(20, segmentsBefore.size());

        List<FileMetadata> filesBefore = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().filesMetadata(tableInfo, null, null);
        assertEquals(16, filesBefore.size());

        List<RowGroupMetadata> rowGroupsBefore = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().rowGroupsMetadata(tableInfo, null, (String) null);
        assertEquals(16, rowGroupsBefore.size());

        FileUtils.deleteQuietly(new File(tableDir, "1993"));

        testBuilder()
            .sqlQuery(analyzeQuery, tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, expectedSummary)
            .go();

        // State after the "1993" directory has been removed and the table re-analyzed.
        BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().tableMetadata(tableInfo);
        assertEquals(expectedTableMetadata, actualTableMetadata);

        List<SegmentMetadata> segmentsAfter = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().segmentsMetadataByMetadataKey(tableInfo, null, null);
        assertEquals(15, segmentsAfter.size());

        List<FileMetadata> filesAfter = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().filesMetadata(tableInfo, null, null);
        assertEquals(12, filesAfter.size());

        List<RowGroupMetadata> rowGroupsAfter = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().rowGroupsMetadata(tableInfo, null, (String) null);
        assertEquals(12, rowGroupsAfter.size());
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
        FileUtils.deleteQuietly(tableDir);
    }
}
Also used : SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) File(java.io.File) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)

Example 22 with FileMetadata

use of org.apache.drill.metastore.metadata.FileMetadata in project drill by apache.

In class TestMetastoreCommands, method testAnalyzeNonEmptyTableWithEmptyFile.

/**
 * Checks ANALYZE on the "parquet/empty/simple" resource table copied as
 * "parquet_with_empty_file".
 *
 * <p>After ANALYZE the metastore must contain the table, its table-level metadata must equal
 * the expected one (nullable BIGINT "id" and nullable VARCHAR "name", row count 1), and it
 * must report 2 files and 2 row groups.
 *
 * <p>Metastore metadata for the table is dropped in {@code finally}.
 */
@Test
public void testAnalyzeNonEmptyTableWithEmptyFile() throws Exception {
    String tableName = "parquet_with_empty_file";
    File tableDir = dirTestWatcher.copyResourceToTestTmp(Paths.get("parquet", "empty", "simple"), Paths.get(tableName));
    TableInfo tableInfo = getTableInfo(tableName, "tmp");

    TupleMetadata expectedSchema = new SchemaBuilder()
        .addNullable("id", TypeProtos.MinorType.BIGINT)
        .addNullable("name", TypeProtos.MinorType.VARCHAR)
        .build();

    Map<SchemaPath, ColumnStatistics<?>> expectedColumnStats = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder()
        .put(SchemaPath.getSimplePath("name"), getColumnStatistics("Tom", "Tom", 1L, TypeProtos.MinorType.VARCHAR))
        .put(SchemaPath.getSimplePath("id"), getColumnStatistics(2L, 2L, 1L, TypeProtos.MinorType.BIGINT))
        .build();

    BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
        .tableInfo(tableInfo)
        .metadataInfo(TABLE_META_INFO)
        .schema(expectedSchema)
        .location(new Path(tableDir.toURI().getPath()))
        .columnsStatistics(expectedColumnStats)
        .metadataStatistics(Arrays.asList(new StatisticsHolder<>(1L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
        .partitionKeys(Collections.emptyMap())
        .lastModifiedTime(getMaxLastModified(tableDir))
        .build();

    try {
        testBuilder()
            .sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName)
            .unOrdered()
            .baselineColumns("ok", "summary")
            .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName))
            .go();

        MetastoreTableInfo storedTableInfo = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().metastoreTableInfo(tableInfo);
        assertTrue("table metadata wasn't found", storedTableInfo.isExists());

        BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().tableMetadata(tableInfo);
        assertEquals(expectedTableMetadata, actualTableMetadata);

        List<FileMetadata> storedFiles = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().filesMetadata(tableInfo, null, null);
        assertEquals(2, storedFiles.size());

        // The (String) cast on the second argument selects the intended rowGroupsMetadata overload.
        List<RowGroupMetadata> storedRowGroups = cluster.drillbit().getContext().getMetastoreRegistry().get()
            .tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
        assertEquals(2, storedRowGroups.size());
    } finally {
        run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
    }
}
Also used : ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) SchemaBuilder(org.apache.drill.exec.record.metadata.SchemaBuilder) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreTableInfo(org.apache.drill.metastore.components.tables.MetastoreTableInfo) File(java.io.File) ClusterTest(org.apache.drill.test.ClusterTest) SlowTest(org.apache.drill.categories.SlowTest) MetastoreTest(org.apache.drill.categories.MetastoreTest) Test(org.junit.Test)

Example 23 with FileMetadata

use of org.apache.drill.metastore.metadata.FileMetadata in project drill by apache.

In class BaseParquetMetadataProvider, method getPartitionsMetadata.

/**
 * Returns partition metadata, building the list on the first call and returning the stored
 * list on subsequent calls.
 *
 * <p>When {@code collectMetadata} is set, every file is grouped by (partition column,
 * partition value) and each group is converted via
 * {@code ParquetTableMetadataUtils.getPartitionMetadata}. Otherwise the partition metadata is
 * assembled from the partition paths reported by the group scan statistics: min and max are
 * the partition value, while nulls count and row count are {@code Statistic.NO_COLUMN_STATS}.
 *
 * @return list of partition metadata
 */
@Override
public List<PartitionMetadata> getPartitionsMetadata() {
    // Already built by an earlier call.
    if (partitions != null) {
        return partitions;
    }
    partitions = new ArrayList<>();

    if (!collectMetadata) {
        for (SchemaPath partitionColumn : getParquetGroupScanStatistics().getPartitionColumns()) {
            Map<Path, Object> valueByPath = getParquetGroupScanStatistics().getPartitionPaths(partitionColumn);

            // Invert path -> value into value -> paths.
            Multimap<Object, Path> pathsByValue = HashMultimap.create();
            for (Map.Entry<Path, Object> pathAndValue : valueByPath.entrySet()) {
                pathsByValue.put(pathAndValue.getValue(), pathAndValue.getKey());
            }

            for (Map.Entry<Object, Collection<Path>> group : pathsByValue.asMap().entrySet()) {
                // The NULL_VALUE placeholder is turned back into a real null for the statistics.
                Object partitionKey = group.getKey() == NULL_VALUE ? null : group.getKey();

                List<StatisticsHolder<?>> statistics = new ArrayList<>();
                statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MIN_VALUE));
                statistics.add(new StatisticsHolder<>(partitionKey, ColumnStatisticsKind.MAX_VALUE));
                statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, ColumnStatisticsKind.NULLS_COUNT));
                statistics.add(new StatisticsHolder<>(Statistic.NO_COLUMN_STATS, TableStatisticsKind.ROW_COUNT));

                Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
                columnsStatistics.put(partitionColumn, new ColumnStatistics<>(statistics, getParquetGroupScanStatistics().getTypeForColumn(partitionColumn).getMinorType()));

                MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
                TableMetadata tableMetadata = getTableMetadata();
                PartitionMetadata partitionMetadata = PartitionMetadata.builder()
                    .tableInfo(tableMetadata.getTableInfo())
                    .metadataInfo(metadataInfo)
                    .column(partitionColumn)
                    .schema(tableMetadata.getSchema())
                    .columnsStatistics(columnsStatistics)
                    .metadataStatistics(statistics)
                    .partitionValues(Collections.emptyList())
                    .locations(new HashSet<>(group.getValue()))
                    .build();
                partitions.add(partitionMetadata);
            }
        }
        return partitions;
    }

    // Rows are partition columns, columns are partition values, cells are the matching files.
    Table<SchemaPath, Object, List<FileMetadata>> filesByColumnAndValue = HashBasedTable.create();
    Collection<FileMetadata> allFiles = getFilesMetadataMap().values();
    partitionColumns = getParquetGroupScanStatistics().getPartitionColumns();
    for (FileMetadata file : allFiles) {
        for (SchemaPath column : partitionColumns) {
            Object rawValue = getParquetGroupScanStatistics().getPartitionValue(file.getPath(), column);
            // Table cannot contain nulls
            Object cellKey = rawValue != null ? rawValue : NULL_VALUE;
            List<FileMetadata> bucket = filesByColumnAndValue.get(column, cellKey);
            if (bucket == null) {
                bucket = new ArrayList<>();
                filesByColumnAndValue.put(column, cellKey, bucket);
            }
            bucket.add(file);
        }
    }

    for (Map.Entry<SchemaPath, Map<Object, List<FileMetadata>>> columnRow : filesByColumnAndValue.rowMap().entrySet()) {
        for (List<FileMetadata> filesForValue : columnRow.getValue().values()) {
            partitions.add(ParquetTableMetadataUtils.getPartitionMetadata(columnRow.getKey(), filesForValue));
        }
    }
    return partitions;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ColumnStatistics(org.apache.drill.metastore.statistics.ColumnStatistics) BaseTableMetadata(org.apache.drill.metastore.metadata.BaseTableMetadata) TableMetadata(org.apache.drill.metastore.metadata.TableMetadata) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) ArrayList(java.util.ArrayList) StatisticsHolder(org.apache.drill.metastore.statistics.StatisticsHolder) SchemaPath(org.apache.drill.common.expression.SchemaPath) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)

Aggregations

FileMetadata (org.apache.drill.metastore.metadata.FileMetadata): 23; MetastoreTest (org.apache.drill.categories.MetastoreTest): 17; SchemaPath (org.apache.drill.common.expression.SchemaPath): 17; BaseTableMetadata (org.apache.drill.metastore.metadata.BaseTableMetadata): 17; Path (org.apache.hadoop.fs.Path): 17; Test (org.junit.Test): 17; TableInfo (org.apache.drill.metastore.metadata.TableInfo): 16; MetastoreTableInfo (org.apache.drill.metastore.components.tables.MetastoreTableInfo): 15; File (java.io.File): 14; SlowTest (org.apache.drill.categories.SlowTest): 14; ClusterTest (org.apache.drill.test.ClusterTest): 14; ColumnStatistics (org.apache.drill.metastore.statistics.ColumnStatistics): 13; RowGroupMetadata (org.apache.drill.metastore.metadata.RowGroupMetadata): 12; SegmentMetadata (org.apache.drill.metastore.metadata.SegmentMetadata): 12; TupleMetadata (org.apache.drill.exec.record.metadata.TupleMetadata): 9; CoreMatchers.containsString (org.hamcrest.CoreMatchers.containsString): 9; HashMap (java.util.HashMap): 8; SchemaBuilder (org.apache.drill.exec.record.metadata.SchemaBuilder): 8; StatisticsHolder (org.apache.drill.metastore.statistics.StatisticsHolder): 8; MetadataInfo (org.apache.drill.metastore.metadata.MetadataInfo): 7