Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class TestTableMetadataUnitConversion, method testSegmentMetadata.
@Test
public void testSegmentMetadata() {
  TableInfo tableInfo = data.basicTableInfo;
  MetadataInfo metadataInfo = MetadataInfo.builder()
      .type(MetadataType.SEGMENT)
      .key("part_int=3")
      .identifier("part_int=3/part_varchar=g")
      .build();
  Path path = new Path("/tmp/nation");
  String unitPath = path.toUri().getPath();
  Set<Path> locations = new HashSet<>();
  locations.add(new Path("part_int=3/part_varchar=g/0_0_0.parquet"));
  locations.add(new Path("part_int=3/part_varchar=g/0_0_1.parquet"));
  List<String> unitLocations = locations.stream()
      .map(location -> location.toUri().getPath())
      .collect(Collectors.toList());

  // check required fields
  SegmentMetadata requiredFieldsMetadata = SegmentMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(metadataInfo)
      .columnsStatistics(data.columnsStatistics)
      .metadataStatistics(data.metadataStatistics)
      .path(path)
      .locations(locations)
      .build();
  TableMetadataUnit requiredFieldsExpectedUnit = TableMetadataUnit.builder()
      .storagePlugin(tableInfo.storagePlugin())
      .workspace(tableInfo.workspace())
      .tableName(tableInfo.name())
      .metadataType(metadataInfo.type().name())
      .metadataKey(metadataInfo.key())
      .metadataIdentifier(metadataInfo.identifier())
      .columnsStatistics(data.unitColumnsStatistics)
      .metadataStatistics(data.unitMetadataStatistics)
      .lastModifiedTime(BaseMetadata.UNDEFINED_TIME)
      .path(path.toUri().getPath())
      .location(unitPath)
      .locations(unitLocations)
      .build();
  TableMetadataUnit requiredFieldsUnit = requiredFieldsMetadata.toMetadataUnit();
  assertEquals(requiredFieldsExpectedUnit, requiredFieldsUnit);
  assertNotNull(SegmentMetadata.builder().metadataUnit(requiredFieldsUnit).build());

  SchemaPath column = SchemaPath.getSimplePath("dir1");
  List<String> partitionValues = Collections.singletonList("part_varchar=g");
  SegmentMetadata allFieldsMetadata = SegmentMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(metadataInfo)
      .schema(data.schema)
      .columnsStatistics(data.columnsStatistics)
      .metadataStatistics(data.metadataStatistics)
      .lastModifiedTime(data.lastModifiedTime)
      .path(path)
      .locations(locations)
      .column(column)
      .partitionValues(partitionValues)
      .build();
  TableMetadataUnit allFieldsExpectedUnit = TableMetadataUnit.builder()
      .storagePlugin(tableInfo.storagePlugin())
      .workspace(tableInfo.workspace())
      .tableName(tableInfo.name())
      .metadataType(metadataInfo.type().name())
      .metadataKey(metadataInfo.key())
      .metadataIdentifier(metadataInfo.identifier())
      .schema(data.unitSchema)
      .columnsStatistics(data.unitColumnsStatistics)
      .metadataStatistics(data.unitMetadataStatistics)
      .lastModifiedTime(data.lastModifiedTime)
      .path(path.toUri().getPath())
      .location(unitPath)
      .locations(unitLocations)
      .column(column.toString())
      .partitionValues(partitionValues)
      .build();
  TableMetadataUnit allFieldsUnit = allFieldsMetadata.toMetadataUnit();
  assertEquals(allFieldsExpectedUnit, allFieldsUnit);
  assertNotNull(SegmentMetadata.builder().metadataUnit(allFieldsUnit).build());
}
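Distilled from the test above, the MetadataInfo builder identifies a segment by its type, its key (the top-level segment directory) and its identifier (the full nested path); a minimal sketch of just that call chain, using the same values as the test:

MetadataInfo segmentInfo = MetadataInfo.builder()
    .type(MetadataType.SEGMENT)
    .key("part_int=3")                       // top-level segment directory
    .identifier("part_int=3/part_varchar=g") // full path of the nested segment
    .build();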
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class TestMetastoreWithEasyFormatPlugin, method testAnalyzeOnJsonTable.
@Test
public void testAnalyzeOnJsonTable() throws Exception {
  String tableName = "multilevel/json";
  TableInfo tableInfo = getTableInfo(tableName, "default", "json");
  File table = dirTestWatcher.copyResourceToRoot(Paths.get(tableName));
  Path tablePath = new Path(table.toURI().getPath());
  TupleMetadata schema = new SchemaBuilder()
      .addNullable("dir0", TypeProtos.MinorType.VARCHAR)
      .addNullable("dir1", TypeProtos.MinorType.VARCHAR)
      .addNullable("o_orderkey", TypeProtos.MinorType.BIGINT)
      .addNullable("o_custkey", TypeProtos.MinorType.BIGINT)
      .addNullable("o_orderstatus", TypeProtos.MinorType.VARCHAR)
      .addNullable("o_totalprice", TypeProtos.MinorType.FLOAT8)
      .addNullable("o_orderdate", TypeProtos.MinorType.VARCHAR)
      .addNullable("o_orderpriority", TypeProtos.MinorType.VARCHAR)
      .addNullable("o_clerk", TypeProtos.MinorType.VARCHAR)
      .addNullable("o_shippriority", TypeProtos.MinorType.BIGINT)
      .addNullable("o_comment", TypeProtos.MinorType.VARCHAR)
      .build();
  Map<SchemaPath, ColumnStatistics<?>> tableColumnStatistics = new HashMap<>(TABLE_COLUMN_STATISTICS);
  tableColumnStatistics.put(SchemaPath.getSimplePath("o_custkey"),
      getColumnStatistics(25L, 1498L, 120L, TypeProtos.MinorType.BIGINT));
  tableColumnStatistics.put(SchemaPath.getSimplePath("o_orderdate"),
      getColumnStatistics("1994-01-01T00:00:00.000-08:00", "1996-12-19T00:00:00.000-08:00", 120L, TypeProtos.MinorType.VARCHAR));
  tableColumnStatistics.put(SchemaPath.getSimplePath("o_orderkey"),
      getColumnStatistics(1L, 1319L, 120L, TypeProtos.MinorType.BIGINT));
  tableColumnStatistics.put(SchemaPath.getSimplePath("o_shippriority"),
      getColumnStatistics(0L, 0L, 120L, TypeProtos.MinorType.BIGINT));
  BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(TABLE_META_INFO)
      .schema(schema)
      .location(new Path(table.toURI().getPath()))
      .columnsStatistics(tableColumnStatistics)
      .metadataStatistics(Arrays.asList(
          new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT),
          new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL)))
      .partitionKeys(Collections.emptyMap())
      .lastModifiedTime(getMaxLastModified(table))
      .build();
  TableInfo baseTableInfo = TableInfo.builder()
      .name(tableName)
      .storagePlugin("dfs")
      .workspace("default")
      .build();
  Map<SchemaPath, ColumnStatistics<?>> dir0CSVStats = new HashMap<>(DIR0_1994_SEGMENT_COLUMN_STATISTICS);
  dir0CSVStats.put(SchemaPath.getSimplePath("o_custkey"),
      getColumnStatistics(25L, 1469L, 40L, TypeProtos.MinorType.BIGINT));
  dir0CSVStats.put(SchemaPath.getSimplePath("o_orderdate"),
      getColumnStatistics("1994-01-01T00:00:00.000-08:00", "1994-12-23T00:00:00.000-08:00", 40L, TypeProtos.MinorType.VARCHAR));
  dir0CSVStats.put(SchemaPath.getSimplePath("o_orderkey"),
      getColumnStatistics(5L, 1031L, 40L, TypeProtos.MinorType.BIGINT));
  dir0CSVStats.put(SchemaPath.getSimplePath("o_shippriority"),
      getColumnStatistics(0L, 0L, 40L, TypeProtos.MinorType.BIGINT));
  SegmentMetadata dir0 = SegmentMetadata.builder()
      .tableInfo(baseTableInfo)
      .metadataInfo(MetadataInfo.builder()
          .type(MetadataType.SEGMENT)
          .identifier("1994")
          .key("1994")
          .build())
      .path(new Path(tablePath, "1994"))
      .schema(schema)
      .lastModifiedTime(getMaxLastModified(new File(table, "1994")))
      .column(SchemaPath.getSimplePath("dir0"))
      .columnsStatistics(dir0CSVStats)
      .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(40L, TableStatisticsKind.ROW_COUNT)))
      .locations(ImmutableSet.of(
          new Path(tablePath, "1994/Q1/orders_94_q1.json"),
          new Path(tablePath, "1994/Q2/orders_94_q2.json"),
          new Path(tablePath, "1994/Q3/orders_94_q3.json"),
          new Path(tablePath, "1994/Q4/orders_94_q4.json")))
      .partitionValues(Collections.singletonList("1994"))
      .build();
  Set<Path> expectedTopLevelSegmentLocations = ImmutableSet.of(
      new Path(tablePath, "1994"),
      new Path(tablePath, "1995"),
      new Path(tablePath, "1996"));
  Set<Set<Path>> expectedSegmentFilesLocations = new HashSet<>();
  Set<Path> segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1994/Q2/orders_94_q2.json"),
      new Path(tablePath, "1994/Q4/orders_94_q4.json"),
      new Path(tablePath, "1994/Q1/orders_94_q1.json"),
      new Path(tablePath, "1994/Q3/orders_94_q3.json"));
  expectedSegmentFilesLocations.add(segmentFiles);
  segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1995/Q2/orders_95_q2.json"),
      new Path(tablePath, "1995/Q4/orders_95_q4.json"),
      new Path(tablePath, "1995/Q1/orders_95_q1.json"),
      new Path(tablePath, "1995/Q3/orders_95_q3.json"));
  expectedSegmentFilesLocations.add(segmentFiles);
  segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1996/Q3/orders_96_q3.json"),
      new Path(tablePath, "1996/Q2/orders_96_q2.json"),
      new Path(tablePath, "1996/Q4/orders_96_q4.json"),
      new Path(tablePath, "1996/Q1/orders_96_q1.json"));
  expectedSegmentFilesLocations.add(segmentFiles);
  Map<SchemaPath, ColumnStatistics<?>> dir0q1Stats = new HashMap<>(DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS);
  dir0q1Stats.put(SchemaPath.getSimplePath("o_custkey"),
      getColumnStatistics(392L, 1411L, 10L, TypeProtos.MinorType.BIGINT));
  dir0q1Stats.put(SchemaPath.getSimplePath("o_orderdate"),
      getColumnStatistics("1994-01-01T00:00:00.000-08:00", "1994-03-26T00:00:00.000-08:00", 10L, TypeProtos.MinorType.VARCHAR));
  dir0q1Stats.put(SchemaPath.getSimplePath("o_orderkey"),
      getColumnStatistics(66L, 833L, 10L, TypeProtos.MinorType.BIGINT));
  dir0q1Stats.put(SchemaPath.getSimplePath("o_shippriority"),
      getColumnStatistics(0L, 0L, 10L, TypeProtos.MinorType.BIGINT));
  long dir0q1lastModified = new File(new File(new File(table, "1994"), "Q1"), "orders_94_q1.json").lastModified();
  FileMetadata dir01994q1File = FileMetadata.builder()
      .tableInfo(baseTableInfo)
      .metadataInfo(MetadataInfo.builder()
          .type(MetadataType.FILE)
          .identifier("1994/Q1/orders_94_q1.json")
          .key("1994")
          .build())
      .schema(schema)
      .lastModifiedTime(dir0q1lastModified)
      .columnsStatistics(dir0q1Stats)
      .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
      .path(new Path(tablePath, "1994/Q1/orders_94_q1.json"))
      .build();
  try {
    testBuilder()
        .sqlQuery("analyze table table(dfs.`%s`(schema=>%s)) refresh metadata", tableName, SCHEMA_STRING)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.default.%s]", tableName))
        .go();
    BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry().get().tables().basicRequests()
        .tableMetadata(tableInfo);
    assertEquals(expectedTableMetadata, actualTableMetadata);
    List<SegmentMetadata> topSegmentMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry().get().tables().basicRequests()
        .segmentsMetadataByColumn(tableInfo, null, "`dir0`");
    SegmentMetadata actualDir0Metadata = topSegmentMetadata.stream()
        .filter(unit -> unit.getMetadataInfo().identifier().equals("1994"))
        .findAny()
        .orElseThrow(() -> new AssertionError("Segment is absent"));
    Set<Path> locations = actualDir0Metadata.getLocations();
    actualDir0Metadata.toBuilder().locations(locations);
    assertEquals(dir0, actualDir0Metadata);
    Set<Path> topLevelSegmentLocations = topSegmentMetadata.stream()
        .map(SegmentMetadata::getLocation)
        .collect(Collectors.toSet());
    // verify top segments locations
    assertEquals(expectedTopLevelSegmentLocations, topLevelSegmentLocations);
    Set<Set<Path>> segmentFilesLocations = topSegmentMetadata.stream()
        .map(SegmentMetadata::getLocations)
        .collect(Collectors.toSet());
    assertEquals(expectedSegmentFilesLocations, segmentFilesLocations);
    // verify nested segments
    List<SegmentMetadata> nestedSegmentMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry().get().tables().basicRequests()
        .segmentsMetadataByColumn(tableInfo, null, "`dir1`");
    assertEquals(12, nestedSegmentMetadata.size());
    SegmentMetadata dir01994q1Segment = SegmentMetadata.builder()
        .tableInfo(baseTableInfo)
        .metadataInfo(MetadataInfo.builder()
            .type(MetadataType.SEGMENT)
            .identifier("1994/Q1")
            .key("1994")
            .build())
        .path(new Path(new Path(tablePath, "1994"), "Q1"))
        .schema(schema)
        .lastModifiedTime(getMaxLastModified(new File(new File(table, "1994"), "Q1")))
        .column(SchemaPath.getSimplePath("dir1"))
        .columnsStatistics(dir0q1Stats)
        .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
        .locations(ImmutableSet.of(new Path(tablePath, "1994/Q1/orders_94_q1.json")))
        .partitionValues(Collections.singletonList("Q1"))
        .build();
    // verify segment for 1994
    assertEquals(dir01994q1Segment, nestedSegmentMetadata.stream()
        .filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1"))
        .findAny()
        .orElse(null));
    // verify files metadata
    List<FileMetadata> filesMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry().get().tables().basicRequests()
        .filesMetadata(tableInfo, null, null);
    assertEquals(12, filesMetadata.size());
    // verify first file metadata
    assertEquals(dir01994q1File, filesMetadata.stream()
        .filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1/orders_94_q1.json"))
        .findAny()
        .orElse(null));
  } finally {
    run("analyze table dfs.`%s` drop metadata if exists", tableName);
  }
}
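The assertions above read everything back through the Metastore's basicRequests facade; a condensed sketch of that lookup pattern, assuming the same cluster fixture as the test and that basicRequests() returns Drill's BasicTablesRequests (variable names here are illustrative):

// Sketch of the Metastore read path exercised by the assertions above.
BasicTablesRequests requests = cluster.drillbit().getContext()
    .getMetastoreRegistry().get().tables().basicRequests();
BaseTableMetadata tableMetadata = requests.tableMetadata(tableInfo);
List<SegmentMetadata> topSegments = requests.segmentsMetadataByColumn(tableInfo, null, "`dir0`");
List<FileMetadata> files = requests.filesMetadata(tableInfo, null, null);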
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class MetadataControllerBatch, method getTableMetadata.
private BaseTableMetadata getTableMetadata(TupleReader reader,
    List<StatisticsHolder<?>> metadataStatistics,
    Map<SchemaPath, ColumnStatistics<?>> columnStatistics) {
  List<StatisticsHolder<?>> updatedMetaStats = new ArrayList<>(metadataStatistics);
  updatedMetaStats.add(new StatisticsHolder<>(
      popConfig.getContext().analyzeMetadataLevel(), TableStatisticsKind.ANALYZE_METADATA_LEVEL));
  MetadataInfo metadataInfo = MetadataInfo.builder()
      .type(MetadataType.TABLE)
      .key(MetadataInfo.GENERAL_INFO_KEY)
      .build();
  BaseTableMetadata tableMetadata = BaseTableMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(metadataInfo)
      .columnsStatistics(columnStatistics)
      .metadataStatistics(updatedMetaStats)
      .partitionKeys(Collections.emptyMap())
      .interestingColumns(popConfig.getContext().interestingColumns())
      .location(popConfig.getContext().location())
      .lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString()))
      .schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString()))
      .build();
  if (context.getOptions().getOption(PlannerSettings.STATISTICS_USE)) {
    DrillStatsTable statistics = new DrillStatsTable(statisticsCollector.getStatistics());
    Map<SchemaPath, ColumnStatistics<?>> tableColumnStatistics =
        ParquetTableMetadataUtils.getColumnStatistics(tableMetadata.getSchema(), statistics);
    tableMetadata = tableMetadata.cloneWithStats(tableColumnStatistics, DrillStatsTable.getEstimatedTableStats(statistics));
  }
  return tableMetadata;
}
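Note the table-level convention used above: unlike the segment and file entries in the earlier examples, the MetadataInfo carries no identifier, only the reserved general-info key. Reduced to the relevant lines (the variable name is illustrative):

MetadataInfo tableMetadataInfo = MetadataInfo.builder()
    .type(MetadataType.TABLE)
    .key(MetadataInfo.GENERAL_INFO_KEY) // reserved key; table entries have no identifier
    .build();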
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class MetadataControllerBatch, method getMetadataUnits.
private List<TableMetadataUnit> getMetadataUnits(VectorContainer container) {
  List<TableMetadataUnit> metadataUnits = new ArrayList<>();
  RowSetReader reader = DirectRowSet.fromContainer(container).reader();
  while (reader.next()) {
    metadataUnits.addAll(getMetadataUnits(reader, 0));
  }
  if (metadataToHandle != null) {
    // leaves only table metadata and metadata which belongs to segments to be overridden
    metadataUnits = metadataUnits.stream()
        .filter(tableMetadataUnit ->
            metadataToHandle.values().stream()
                .map(MetadataInfo::key)
                .anyMatch(s -> s.equals(tableMetadataUnit.metadataKey()))
            || MetadataType.TABLE.name().equals(tableMetadataUnit.metadataType()))
        .collect(Collectors.toList());
    // leaves only metadata which should be fetched from the Metastore
    metadataUnits.stream()
        .map(TableMetadataUnit::metadataIdentifier)
        .forEach(metadataToHandle::remove);
    List<TableMetadataUnit> metadata = metadataToHandle.isEmpty()
        ? Collections.emptyList()
        : tables.basicRequests().metadata(popConfig.getContext().tableInfo(), metadataToHandle.values());
    metadataUnits.addAll(metadata);
  }
  // checks whether metadataUnits contains more than table metadata before adding the default segment,
  // to avoid the case when only table metadata should be updated and / or root segments removed
  boolean insertDefaultSegment = metadataUnits.size() > 1
      && metadataUnits.stream()
          .noneMatch(metadataUnit -> metadataUnit.metadataType().equals(MetadataType.SEGMENT.name()));
  if (insertDefaultSegment) {
    TableMetadataUnit defaultSegmentMetadata = getDefaultSegment(metadataUnits);
    metadataUnits.add(defaultSegmentMetadata);
  }
  return metadataUnits;
}
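The first filter above keeps a unit when either of two conditions holds; written as a standalone predicate it reads more plainly (the helper name is hypothetical, and we assume metadataToHandle maps metadata identifiers to MetadataInfo, as the remove call above implies):

// Hypothetical helper equivalent to the first filter above: keep units that
// belong to a segment being overridden, or that describe the table itself.
private static boolean shouldKeep(TableMetadataUnit unit, Map<String, MetadataInfo> metadataToHandle) {
  return metadataToHandle.values().stream()
          .map(MetadataInfo::key)
          .anyMatch(key -> key.equals(unit.metadataKey()))
      || MetadataType.TABLE.name().equals(unit.metadataType());
}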
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class FileMetadataInfoCollector, method init.
private void init(FormatSelection selection, PlannerSettings settings,
    Supplier<TableScan> tableScanSupplier, List<SchemaPath> interestingColumns,
    int segmentColumnsCount) throws IOException {
  List<SchemaPath> metastoreInterestingColumns =
      Optional.ofNullable(basicRequests.interestingColumnsAndPartitionKeys(tableInfo).interestingColumns())
          .map(metastoreInterestingColumnNames -> metastoreInterestingColumnNames.stream()
              .map(SchemaPath::parseFromString)
              .collect(Collectors.toList()))
          .orElse(null);
  Map<String, Long> filesNamesLastModifiedTime = basicRequests.filesLastModifiedTime(tableInfo, null, null);
  List<String> newFiles = new ArrayList<>();
  List<String> updatedFiles = new ArrayList<>();
  List<String> removedFiles = new ArrayList<>(filesNamesLastModifiedTime.keySet());
  List<String> allFiles = new ArrayList<>();
  for (FileStatus fileStatus : getFileStatuses(selection)) {
    String path = Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath()).toUri().getPath();
    Long lastModificationTime = filesNamesLastModifiedTime.get(path);
    if (lastModificationTime == null) {
      newFiles.add(path);
    } else if (lastModificationTime < fileStatus.getModificationTime()) {
      updatedFiles.add(path);
    }
    removedFiles.remove(path);
    allFiles.add(path);
  }
  String selectionRoot = selection.getSelection().getSelectionRoot().toUri().getPath();
  if ((!Objects.equals(metastoreInterestingColumns, interestingColumns)
      && metastoreInterestingColumns != null
      && (interestingColumns == null || !metastoreInterestingColumns.containsAll(interestingColumns)))
      || TableStatisticsKind.ANALYZE_METADATA_LEVEL.getValue(basicRequests.tableMetadata(tableInfo)).compareTo(metadataLevel) != 0) {
    // do not update table scan and lists of segments / files / row groups,
    // metadata should be recalculated
    tableScan = tableScanSupplier.get();
    metadataToRemove.addAll(getMetadataInfoList(selectionRoot, removedFiles, MetadataType.SEGMENT, 0));
    return;
  }
  // checks whether there are new, updated or removed files
  if (!newFiles.isEmpty() || !updatedFiles.isEmpty() || !removedFiles.isEmpty()) {
    List<String> scanFiles = new ArrayList<>(newFiles);
    scanFiles.addAll(updatedFiles);
    // updates scan to read updated / new files
    tableScan = getTableScan(settings, tableScanSupplier.get(), scanFiles);
    // iterates from the end;
    // takes deepest updated segments;
    // finds their parents:
    // - fetches all segments for the parent level;
    // - filters segments to leave parents only;
    // obtains all child segments;
    // filters child segments for filtered parent segments
    int lastSegmentIndex = segmentColumnsCount - 1;
    List<String> scanAndRemovedFiles = new ArrayList<>(scanFiles);
    scanAndRemovedFiles.addAll(removedFiles);
    // 1. Obtain files info for files from the same folder without removed files
    // 2. Get segments for obtained files + segments for removed files
    // 3. Get parent segments
    // 4. Get other segments for the same parent segment
    // 5. Remove segments which have only removed files (matched for removedFilesMetadata and don't match filesInfo)
    // 6. Do the same for parent segments
    List<MetadataInfo> allFilesInfo = getMetadataInfoList(selectionRoot, allFiles, MetadataType.FILE, 0);
    // first pass: collect updated segments even without files, they will be removed later
    List<MetadataInfo> leafSegments = getMetadataInfoList(selectionRoot, scanAndRemovedFiles, MetadataType.SEGMENT, lastSegmentIndex);
    List<MetadataInfo> removedFilesMetadata = getMetadataInfoList(selectionRoot, removedFiles, MetadataType.FILE, 0);
    List<MetadataInfo> scanFilesInfo = getMetadataInfoList(selectionRoot, scanAndRemovedFiles, MetadataType.FILE, 0);
    // files from scan + files from the same folder without removed files
    filesInfo = leafSegments.stream()
        .filter(parent -> scanFilesInfo.stream()
            .anyMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
        .flatMap(parent -> allFilesInfo.stream()
            .filter(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
        .collect(Collectors.toList());
    Multimap<Integer, MetadataInfo> allSegments =
        populateSegments(removedFiles, allFiles, selectionRoot, lastSegmentIndex, leafSegments, removedFilesMetadata);
    List<MetadataInfo> allRowGroupsInfo = getAllRowGroupsMetadataInfos(allFiles);
    rowGroupsInfo = allRowGroupsInfo.stream()
        .filter(child -> filesInfo.stream()
            .map(MetadataInfo::identifier)
            .anyMatch(parent -> MetadataIdentifierUtils.isMetadataKeyParent(parent, child.identifier())))
        .collect(Collectors.toList());
    List<MetadataInfo> segmentsToUpdate = getMetadataInfoList(selectionRoot, scanAndRemovedFiles, MetadataType.SEGMENT, 0);
    allMetaToHandle = Streams.concat(allSegments.values().stream(), allFilesInfo.stream(), allRowGroupsInfo.stream())
        .filter(child -> segmentsToUpdate.stream()
            .anyMatch(parent -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
        .filter(parent -> removedFilesMetadata.stream()
            .noneMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier()))
            || filesInfo.stream()
                .anyMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
        .collect(Collectors.toList());
    // removed top-level segments are handled separately since their metadata is not overridden when writing to the Metastore
    List<MetadataInfo> removedTopSegments = getMetadataInfoList(selectionRoot, removedFiles, MetadataType.SEGMENT, 0).stream()
        .filter(parent -> removedFilesMetadata.stream()
            .anyMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier()))
            && allFilesInfo.stream()
                .noneMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
        .collect(Collectors.toList());
    metadataToRemove.addAll(removedTopSegments);
    segmentsToUpdate.stream()
        .filter(segment -> !removedTopSegments.contains(segment))
        .forEach(allMetaToHandle::add);
  } else {
    // table metadata may still be up-to-date
    outdated = false;
  }
}
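The new / updated / removed classification at the top of init is a plain diff against the Metastore's last-modified snapshot; a self-contained sketch of the same pattern with ordinary collections (the class and all names are illustrative only):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

// Illustrative diff against a snapshot of path -> last-modified time:
// unknown paths are new, paths with a newer mtime are updated, and
// snapshot entries never visited during the walk were removed.
class FileDiff {
  final List<String> newFiles = new ArrayList<>();
  final List<String> updatedFiles = new ArrayList<>();
  final List<String> removedFiles = new ArrayList<>();

  FileDiff(Map<String, Long> snapshot, Map<String, Long> current) {
    removedFiles.addAll(snapshot.keySet());
    current.forEach((path, mtime) -> {
      Long known = snapshot.get(path);
      if (known == null) {
        newFiles.add(path);
      } else if (known < mtime) {
        updatedFiles.add(path);
      }
      removedFiles.remove(path); // still present, so not removed
    });
  }
}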