Use of org.apache.drill.metastore.statistics.BaseStatisticsKind in the Apache Drill project.
Class ParquetTableMetadataUtils, method getRowGroupMetadata.
/**
 * Converts the specified parquet {@code rowGroupMetadata} into a {@link RowGroupMetadata} instance.
 *
 * <p>The produced metadata holds the row count, start and length statistics of the row group,
 * its column statistics, a schema assembled from the row group fields, the host affinity,
 * the row group index and the file location. Table info is set to
 * {@link TableInfo#UNKNOWN_TABLE_INFO}.</p>
 *
 * @param tableMetadata    table metadata which contains the row group metadata to convert
 * @param rowGroupMetadata row group metadata to convert
 * @param rgIndexInFile    index of the current row group within the file
 * @param location         location of the file with the current row group
 * @return {@link RowGroupMetadata} instance converted from the specified parquet {@code rowGroupMetadata}
 */
public static RowGroupMetadata getRowGroupMetadata(MetadataBase.ParquetTableMetadataBase tableMetadata,
    MetadataBase.RowGroupMetadata rowGroupMetadata, int rgIndexInFile, Path location) {
  Map<SchemaPath, ColumnStatistics<?>> statisticsByColumn =
      getRowGroupColumnStatistics(tableMetadata, rowGroupMetadata);

  // statistics which describe the row group as a whole: row count, start and length
  List<StatisticsHolder<?>> rowGroupLevelStatistics = new ArrayList<>();
  rowGroupLevelStatistics.add(
      new StatisticsHolder<>(rowGroupMetadata.getRowCount(), TableStatisticsKind.ROW_COUNT));
  rowGroupLevelStatistics.add(
      new StatisticsHolder<>(rowGroupMetadata.getStart(),
          new BaseStatisticsKind<>(ExactStatisticsConstants.START, true)));
  rowGroupLevelStatistics.add(
      new StatisticsHolder<>(rowGroupMetadata.getLength(),
          new BaseStatisticsKind<>(ExactStatisticsConstants.LENGTH, true)));

  Map<SchemaPath, TypeProtos.MajorType> fields = getRowGroupFields(tableMetadata, rowGroupMetadata);
  Map<SchemaPath, TypeProtos.MajorType> intermediateFields = getIntermediateFields(tableMetadata, rowGroupMetadata);

  // assemble the row group schema field by field
  TupleMetadata rowGroupSchema = new TupleSchema();
  for (Map.Entry<SchemaPath, TypeProtos.MajorType> field : fields.entrySet()) {
    SchemaPathUtils.addColumnMetadata(rowGroupSchema, field.getKey(), field.getValue(), intermediateFields);
  }

  MetadataInfo rowGroupInfo = MetadataInfo.builder()
      .type(MetadataType.ROW_GROUP)
      .build();

  return RowGroupMetadata.builder()
      .tableInfo(TableInfo.UNKNOWN_TABLE_INFO)
      .metadataInfo(rowGroupInfo)
      .schema(rowGroupSchema)
      .columnsStatistics(statisticsByColumn)
      .metadataStatistics(rowGroupLevelStatistics)
      .hostAffinity(rowGroupMetadata.getHostAffinity())
      .rowGroupIndex(rgIndexInFile)
      .path(location)
      .build();
}
Use of org.apache.drill.metastore.statistics.BaseStatisticsKind in the Apache Drill project.
Class TestMetastoreCommands, method testSimpleAnalyze.
/**
 * Runs {@code ANALYZE TABLE ... REFRESH METADATA} for a copy of the {@code multilevel/parquet}
 * table and verifies the table, segment, file and row group metadata returned by the Metastore
 * against expected values. The collected metadata is dropped in the {@code finally} block.
 *
 * @throws Exception if the query or any Metastore request fails
 */
@Test
public void testSimpleAnalyze() throws Exception {
  String tableName = "multilevel/parquetSimpleAnalyze";
  TableInfo tableInfo = getTableInfo(tableName, "default");

  File table = dirTestWatcher.copyResourceToRoot(Paths.get("multilevel/parquet"), Paths.get(tableName));
  Path tablePath = new Path(table.toURI().getPath());

  BaseTableMetadata expectedTableMetadata = getBaseTableMetadata(tableInfo, table);

  TableInfo baseTableInfo = TableInfo.builder()
      .name(tableName)
      .storagePlugin("dfs")
      .workspace("default")
      .build();

  // expected top-level segment for the "1994" directory
  SegmentMetadata dir0 = SegmentMetadata.builder()
      .tableInfo(baseTableInfo)
      .metadataInfo(MetadataInfo.builder()
          .type(MetadataType.SEGMENT)
          .identifier("1994")
          .key("1994")
          .build())
      .path(new Path(tablePath, "1994"))
      .schema(SCHEMA)
      .lastModifiedTime(getMaxLastModified(new File(table, "1994")))
      .column(SchemaPath.getSimplePath("dir0"))
      .columnsStatistics(DIR0_1994_SEGMENT_COLUMN_STATISTICS)
      .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(40L, TableStatisticsKind.ROW_COUNT)))
      .locations(ImmutableSet.of(
          new Path(tablePath, "1994/Q1/orders_94_q1.parquet"),
          new Path(tablePath, "1994/Q2/orders_94_q2.parquet"),
          new Path(tablePath, "1994/Q3/orders_94_q3.parquet"),
          new Path(tablePath, "1994/Q4/orders_94_q4.parquet")))
      .partitionValues(Collections.singletonList("1994"))
      .build();

  Set<Path> expectedTopLevelSegmentLocations = ImmutableSet.of(
      new Path(tablePath, "1994"),
      new Path(tablePath, "1995"),
      new Path(tablePath, "1996"));

  // expected file locations of every top-level segment
  Set<Set<Path>> expectedSegmentFilesLocations = new HashSet<>();

  Set<Path> segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1994/Q2/orders_94_q2.parquet"),
      new Path(tablePath, "1994/Q4/orders_94_q4.parquet"),
      new Path(tablePath, "1994/Q1/orders_94_q1.parquet"),
      new Path(tablePath, "1994/Q3/orders_94_q3.parquet"));
  expectedSegmentFilesLocations.add(segmentFiles);

  segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1995/Q2/orders_95_q2.parquet"),
      new Path(tablePath, "1995/Q4/orders_95_q4.parquet"),
      new Path(tablePath, "1995/Q1/orders_95_q1.parquet"),
      new Path(tablePath, "1995/Q3/orders_95_q3.parquet"));
  expectedSegmentFilesLocations.add(segmentFiles);

  segmentFiles = ImmutableSet.of(
      new Path(tablePath, "1996/Q3/orders_96_q3.parquet"),
      new Path(tablePath, "1996/Q2/orders_96_q2.parquet"),
      new Path(tablePath, "1996/Q4/orders_96_q4.parquet"),
      new Path(tablePath, "1996/Q1/orders_96_q1.parquet"));
  expectedSegmentFilesLocations.add(segmentFiles);

  long dir0q1lastModified = new File(new File(new File(table, "1994"), "Q1"), "orders_94_q1.parquet").lastModified();

  // expected metadata of the 1994/Q1 file
  FileMetadata dir01994q1File = FileMetadata.builder()
      .tableInfo(baseTableInfo)
      .metadataInfo(MetadataInfo.builder()
          .type(MetadataType.FILE)
          .identifier("1994/Q1/orders_94_q1.parquet")
          .key("1994")
          .build())
      .schema(SCHEMA)
      .lastModifiedTime(dir0q1lastModified)
      .columnsStatistics(DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS)
      .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
      .path(new Path(tablePath, "1994/Q1/orders_94_q1.parquet"))
      .build();

  // expected metadata of the first row group of the 1994/Q1 file
  RowGroupMetadata dir01994q1rowGroup = RowGroupMetadata.builder()
      .tableInfo(baseTableInfo)
      .metadataInfo(MetadataInfo.builder()
          .type(MetadataType.ROW_GROUP)
          .identifier("1994/Q1/orders_94_q1.parquet/0")
          .key("1994")
          .build())
      .schema(SCHEMA)
      .rowGroupIndex(0)
      .hostAffinity(Collections.emptyMap())
      .lastModifiedTime(dir0q1lastModified)
      .columnsStatistics(DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS)
      .metadataStatistics(Arrays.asList(
          new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT),
          new StatisticsHolder<>(1196L, new BaseStatisticsKind<>(ExactStatisticsConstants.LENGTH, true)),
          new StatisticsHolder<>(4L, new BaseStatisticsKind<>(ExactStatisticsConstants.START, true))))
      .path(new Path(tablePath, "1994/Q1/orders_94_q1.parquet"))
      .build();

  try {
    testBuilder()
        .sqlQuery("ANALYZE TABLE dfs.`%s` REFRESH METADATA", tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.default.%s]", tableName))
        .go();

    // verify table metadata
    BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .tableMetadata(tableInfo);

    assertEquals(expectedTableMetadata, actualTableMetadata);

    // verify top-level segments
    List<SegmentMetadata> topSegmentMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .segmentsMetadataByColumn(tableInfo, null, "`dir0`");

    SegmentMetadata actualDir0Metadata = topSegmentMetadata.stream()
        .filter(unit -> unit.getMetadataInfo().identifier().equals("1994"))
        .findAny()
        .orElseThrow(() -> new AssertionError("Segment is absent"));

    // the segment returned by the Metastore is compared as is
    assertEquals(dir0, actualDir0Metadata);

    Set<Path> topLevelSegmentLocations = topSegmentMetadata.stream()
        .map(SegmentMetadata::getLocation)
        .collect(Collectors.toSet());

    // verify top segments locations
    assertEquals(expectedTopLevelSegmentLocations, topLevelSegmentLocations);

    Set<Set<Path>> segmentFilesLocations = topSegmentMetadata.stream()
        .map(SegmentMetadata::getLocations)
        .collect(Collectors.toSet());

    assertEquals(expectedSegmentFilesLocations, segmentFilesLocations);

    // verify nested segments
    List<SegmentMetadata> nestedSegmentMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .segmentsMetadataByColumn(tableInfo, null, "`dir1`");

    assertEquals(12, nestedSegmentMetadata.size());

    SegmentMetadata dir01994q1Segment = SegmentMetadata.builder()
        .tableInfo(baseTableInfo)
        .metadataInfo(MetadataInfo.builder()
            .type(MetadataType.SEGMENT)
            .identifier("1994/Q1")
            .key("1994")
            .build())
        .path(new Path(new Path(tablePath, "1994"), "Q1"))
        .schema(SCHEMA)
        .lastModifiedTime(getMaxLastModified(new File(new File(table, "1994"), "Q1")))
        .column(SchemaPath.getSimplePath("dir1"))
        .columnsStatistics(DIR0_1994_Q1_SEGMENT_COLUMN_STATISTICS)
        .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(10L, TableStatisticsKind.ROW_COUNT)))
        .locations(ImmutableSet.of(new Path(tablePath, "1994/Q1/orders_94_q1.parquet")))
        .partitionValues(Collections.singletonList("Q1"))
        .build();

    // verify segment for 1994
    assertEquals(dir01994q1Segment,
        nestedSegmentMetadata.stream()
            .filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1"))
            .findAny()
            .orElse(null));

    // verify files metadata
    List<FileMetadata> filesMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .filesMetadata(tableInfo, null, null);

    assertEquals(12, filesMetadata.size());

    // verify first file metadata
    assertEquals(dir01994q1File,
        filesMetadata.stream()
            .filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1/orders_94_q1.parquet"))
            .findAny()
            .orElse(null));

    // verify row groups metadata
    List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext()
        .getMetastoreRegistry()
        .get()
        .tables()
        .basicRequests()
        .rowGroupsMetadata(tableInfo, null, (String) null);

    assertEquals(12, rowGroupsMetadata.size());

    // verify first row group dir01994q1rowGroup
    assertEquals(dir01994q1rowGroup,
        rowGroupsMetadata.stream()
            .filter(unit -> unit.getMetadataInfo().identifier().equals("1994/Q1/orders_94_q1.parquet/0"))
            .findAny()
            .orElse(null));
  } finally {
    // drop the collected metadata regardless of the test outcome
    run("analyze table dfs.`%s` drop metadata if exists", tableName);
  }
}
Use of org.apache.drill.metastore.statistics.BaseStatisticsKind in the Apache Drill project.
Class MetadataControllerBatch, method getMetadataStatistics.
/**
 * Collects metadata-level statistics from the columns of the specified reader.
 *
 * <p>For every column listed in {@code columnMetadata}:</p>
 * <ul>
 *   <li>a column recognized by {@link AnalyzeColumnUtils#isMetadataStatisticsField} is added
 *   with its object value and the statistics kind derived from the column name;</li>
 *   <li>otherwise, a non-null column whose name matches the row group start or row group
 *   length column name from {@code columnNamesOptions} has its string value parsed as
 *   {@code long} and is added as {@link ExactStatisticsConstants#START} or
 *   {@link ExactStatisticsConstants#LENGTH} statistics respectively;</li>
 *   <li>any other column is skipped.</li>
 * </ul>
 *
 * @param reader         reader which provides the column values
 * @param columnMetadata metadata of the columns to inspect
 * @return list of collected statistics holders
 */
private List<StatisticsHolder<?>> getMetadataStatistics(TupleReader reader, TupleMetadata columnMetadata) {
  String rowGroupStartColumn = columnNamesOptions.rowGroupStart();
  String rowGroupLengthColumn = columnNamesOptions.rowGroupLength();

  List<StatisticsHolder<?>> collected = new ArrayList<>();
  for (ColumnMetadata columnInfo : columnMetadata) {
    String name = columnInfo.name();
    ObjectReader valueReader = reader.column(name);

    if (AnalyzeColumnUtils.isMetadataStatisticsField(name)) {
      collected.add(
          new StatisticsHolder<>(valueReader.getObject(), AnalyzeColumnUtils.getStatisticsKind(name)));
      continue;
    }

    if (valueReader.isNull()) {
      // nothing to record for absent values
      continue;
    }

    if (name.equals(rowGroupStartColumn)) {
      collected.add(
          new StatisticsHolder<>(Long.parseLong(valueReader.scalar().getString()),
              new BaseStatisticsKind<>(ExactStatisticsConstants.START, true)));
    } else if (name.equals(rowGroupLengthColumn)) {
      collected.add(
          new StatisticsHolder<>(Long.parseLong(valueReader.scalar().getString()),
              new BaseStatisticsKind<>(ExactStatisticsConstants.LENGTH, true)));
    }
  }
  return collected;
}
Aggregations