use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.
the class TestMetastoreCommands method testIncrementalAnalyzeWithDifferentMetadataLevel.
@Test
public void testIncrementalAnalyzeWithDifferentMetadataLevel() throws Exception {
String tableName = "multilevel/parquetDifferentMetadataLevel";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("multilevel/parquet"), Paths.get(tableName));
Path tablePath = new Path(table.toURI().getPath());
TableInfo tableInfo = getTableInfo(tableName, "tmp");
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(SCHEMA).location(tablePath).columnsStatistics(TABLE_COLUMN_STATISTICS).metadataStatistics(Arrays.asList(new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.FILE, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).build();
try {
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA 'file' LEVEL", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
BaseTableMetadata actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
List<RowGroupMetadata> rowGroupMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
assertEquals(expectedTableMetadata, actualTableMetadata);
assertTrue(rowGroupMetadata.isEmpty());
// checks that analyze was produced since metadata level more specific
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA 'row_group' LEVEL", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
actualTableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(SCHEMA).location(tablePath).columnsStatistics(TABLE_COLUMN_STATISTICS).metadataStatistics(Arrays.asList(new StatisticsHolder<>(120L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ROW_GROUP, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).build();
assertEquals(expectedTableMetadata, actualTableMetadata);
rowGroupMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
assertEquals(12, rowGroupMetadata.size());
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
}
}
use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.
the class TestMetastoreCommands method testAnalyzeNonEmptyTableWithEmptyFile.
@Test
public void testAnalyzeNonEmptyTableWithEmptyFile() throws Exception {
String tableName = "parquet_with_empty_file";
File table = dirTestWatcher.copyResourceToTestTmp(Paths.get("parquet", "empty", "simple"), Paths.get(tableName));
TableInfo tableInfo = getTableInfo(tableName, "tmp");
TupleMetadata schema = new SchemaBuilder().addNullable("id", TypeProtos.MinorType.BIGINT).addNullable("name", TypeProtos.MinorType.VARCHAR).build();
Map<SchemaPath, ColumnStatistics<?>> columnStatistics = ImmutableMap.<SchemaPath, ColumnStatistics<?>>builder().put(SchemaPath.getSimplePath("name"), getColumnStatistics("Tom", "Tom", 1L, TypeProtos.MinorType.VARCHAR)).put(SchemaPath.getSimplePath("id"), getColumnStatistics(2L, 2L, 1L, TypeProtos.MinorType.BIGINT)).build();
BaseTableMetadata expectedTableMetadata = BaseTableMetadata.builder().tableInfo(tableInfo).metadataInfo(TABLE_META_INFO).schema(schema).location(new Path(table.toURI().getPath())).columnsStatistics(columnStatistics).metadataStatistics(Arrays.asList(new StatisticsHolder<>(1L, TableStatisticsKind.ROW_COUNT), new StatisticsHolder<>(MetadataType.ALL, TableStatisticsKind.ANALYZE_METADATA_LEVEL))).partitionKeys(Collections.emptyMap()).lastModifiedTime(getMaxLastModified(table)).build();
try {
testBuilder().sqlQuery("ANALYZE TABLE dfs.tmp.`%s` REFRESH METADATA", tableName).unOrdered().baselineColumns("ok", "summary").baselineValues(true, String.format("Collected / refreshed metadata for table [dfs.tmp.%s]", tableName)).go();
MetastoreTableInfo metastoreTableInfo = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().metastoreTableInfo(tableInfo);
assertTrue("table metadata wasn't found", metastoreTableInfo.isExists());
BaseTableMetadata tableMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().tableMetadata(tableInfo);
assertEquals(expectedTableMetadata, tableMetadata);
List<FileMetadata> filesMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().filesMetadata(tableInfo, null, null);
assertEquals(2, filesMetadata.size());
List<RowGroupMetadata> rowGroupsMetadata = cluster.drillbit().getContext().getMetastoreRegistry().get().tables().basicRequests().rowGroupsMetadata(tableInfo, (String) null, null);
assertEquals(2, rowGroupsMetadata.size());
} finally {
run("analyze table dfs.tmp.`%s` drop metadata if exists", tableName);
}
}
use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.
the class MetadataHandlerBatch method writeMetadata.
private <T extends BaseMetadata & LocationProvider> VectorContainer writeMetadata(List<T> metadataList) {
BaseMetadata firstElement = metadataList.iterator().next();
ResultSetLoader resultSetLoader = getResultSetLoaderForMetadata(firstElement);
resultSetLoader.startBatch();
RowSetLoader rowWriter = resultSetLoader.writer();
Iterator<T> segmentsIterator = metadataList.iterator();
while (!rowWriter.isFull() && segmentsIterator.hasNext()) {
T metadata = segmentsIterator.next();
metadataToHandle.remove(metadata.getMetadataInfo().identifier());
List<Object> arguments = new ArrayList<>();
// adds required segment names to the arguments
arguments.add(metadata.getPath().toUri().getPath());
Collections.addAll(arguments, Arrays.copyOf(MetadataIdentifierUtils.getValuesFromMetadataIdentifier(metadata.getMetadataInfo().identifier()), popConfig.getContext().segmentColumns().size()));
// adds column statistics values assuming that they are sorted in alphabetic order
// (see getResultSetLoaderForMetadata() method)
metadata.getColumnsStatistics().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().toExpr())).map(Map.Entry::getValue).flatMap(columnStatistics -> AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet().stream().map(columnStatistics::get)).forEach(arguments::add);
AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet().stream().map(metadata::getStatistic).forEach(arguments::add);
// collectedMap field value
arguments.add(new Object[] {});
if (metadataType == MetadataType.SEGMENT) {
arguments.add(((SegmentMetadata) metadata).getLocations().stream().map(path -> path.toUri().getPath()).toArray(String[]::new));
}
if (metadataType == MetadataType.ROW_GROUP) {
arguments.add(String.valueOf(((RowGroupMetadata) metadata).getRowGroupIndex()));
arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.START)));
arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
}
arguments.add(metadata.getSchema().jsonString());
arguments.add(String.valueOf(metadata.getLastModifiedTime()));
arguments.add(metadataType.name());
rowWriter.addRow(arguments.toArray());
}
return resultSetLoader.harvest();
}
use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.
the class MetadataHandlerBatch method writeMetadataUsingBatchSchema.
private <T extends BaseMetadata & LocationProvider> VectorContainer writeMetadataUsingBatchSchema(List<T> metadataList) {
Preconditions.checkArgument(!metadataList.isEmpty(), "Metadata list shouldn't be empty.");
ResultSetLoader resultSetLoader = getResultSetLoaderWithBatchSchema();
resultSetLoader.startBatch();
RowSetLoader rowWriter = resultSetLoader.writer();
Iterator<T> segmentsIterator = metadataList.iterator();
while (!rowWriter.isFull() && segmentsIterator.hasNext()) {
T metadata = segmentsIterator.next();
metadataToHandle.remove(metadata.getMetadataInfo().identifier());
List<Object> arguments = new ArrayList<>();
for (VectorWrapper<?> vectorWrapper : container) {
String[] identifierValues = Arrays.copyOf(MetadataIdentifierUtils.getValuesFromMetadataIdentifier(metadata.getMetadataInfo().identifier()), popConfig.getContext().segmentColumns().size());
MaterializedField field = vectorWrapper.getField();
String fieldName = field.getName();
if (fieldName.equals(MetastoreAnalyzeConstants.LOCATION_FIELD)) {
arguments.add(metadata.getPath().toUri().getPath());
} else if (fieldName.equals(MetastoreAnalyzeConstants.LOCATIONS_FIELD)) {
if (metadataType == MetadataType.SEGMENT) {
arguments.add(((SegmentMetadata) metadata).getLocations().stream().map(path -> path.toUri().getPath()).toArray(String[]::new));
} else {
arguments.add(null);
}
} else if (popConfig.getContext().segmentColumns().contains(fieldName)) {
arguments.add(identifierValues[popConfig.getContext().segmentColumns().indexOf(fieldName)]);
} else if (AnalyzeColumnUtils.isColumnStatisticsField(fieldName)) {
arguments.add(metadata.getColumnStatistics(SchemaPath.parseFromString(AnalyzeColumnUtils.getColumnName(fieldName))).get(AnalyzeColumnUtils.getStatisticsKind(fieldName)));
} else if (AnalyzeColumnUtils.isMetadataStatisticsField(fieldName)) {
arguments.add(metadata.getStatistic(AnalyzeColumnUtils.getStatisticsKind(fieldName)));
} else if (fieldName.equals(MetastoreAnalyzeConstants.COLLECTED_MAP_FIELD)) {
// collectedMap field value
arguments.add(new Object[] {});
} else if (fieldName.equals(MetastoreAnalyzeConstants.SCHEMA_FIELD)) {
arguments.add(metadata.getSchema().jsonString());
} else if (fieldName.equals(columnNamesOptions.lastModifiedTime())) {
arguments.add(String.valueOf(metadata.getLastModifiedTime()));
} else if (fieldName.equals(columnNamesOptions.rowGroupIndex())) {
arguments.add(String.valueOf(((RowGroupMetadata) metadata).getRowGroupIndex()));
} else if (fieldName.equals(columnNamesOptions.rowGroupStart())) {
arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.START)));
} else if (fieldName.equals(columnNamesOptions.rowGroupLength())) {
arguments.add(Long.toString(metadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
} else if (fieldName.equals(MetastoreAnalyzeConstants.METADATA_TYPE)) {
arguments.add(metadataType.name());
} else {
throw new UnsupportedOperationException(String.format("Found unexpected field [%s] in incoming batch.", field));
}
}
rowWriter.addRow(arguments.toArray());
}
return resultSetLoader.harvest();
}
use of org.apache.drill.metastore.metadata.RowGroupMetadata in project drill by apache.
the class ConvertMetadataAggregateToDirectScanRule method populateRecords.
/**
* Populates records list with row group metadata.
*/
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns, Map<String, Class<?>> schema, DrillScanRel scan, ColumnNamesOptions columnNamesOptions) throws IOException {
ParquetGroupScan parquetGroupScan = (ParquetGroupScan) scan.getGroupScan();
DrillTable drillTable = Utilities.getDrillTable(scan.getTable());
Multimap<Path, RowGroupMetadata> rowGroupsMetadataMap = parquetGroupScan.getMetadataProvider().getRowGroupsMetadataMap();
Table<String, Integer, Object> recordsTable = HashBasedTable.create();
FormatSelection selection = (FormatSelection) drillTable.getSelection();
List<String> partitionColumnNames = ColumnExplorer.getPartitionColumnNames(selection.getSelection(), columnNamesOptions);
FileSystem rawFs = selection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
DrillFileSystem fileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), rawFs.getConf());
int rowIndex = 0;
for (Map.Entry<Path, RowGroupMetadata> rgEntry : rowGroupsMetadataMap.entries()) {
Path path = rgEntry.getKey();
RowGroupMetadata rowGroupMetadata = rgEntry.getValue();
List<String> partitionValues = ColumnExplorer.listPartitionValues(path, selection.getSelection().getSelectionRoot(), false);
for (int i = 0; i < partitionValues.size(); i++) {
String partitionColumnName = partitionColumnNames.get(i);
recordsTable.put(partitionColumnName, rowIndex, partitionValues.get(i));
}
recordsTable.put(MetastoreAnalyzeConstants.LOCATION_FIELD, rowIndex, ImplicitFileColumns.FQN.getValue(path));
recordsTable.put(columnNamesOptions.rowGroupIndex(), rowIndex, String.valueOf(rowGroupMetadata.getRowGroupIndex()));
if (interestingColumns == null) {
interestingColumns = rowGroupMetadata.getColumnsStatistics().keySet();
}
// populates record list with row group column metadata
for (SchemaPath schemaPath : interestingColumns) {
ColumnStatistics<?> columnStatistics = rowGroupMetadata.getColumnsStatistics().get(schemaPath);
// do not gather statistics for array columns as it is not supported by Metastore
if (containsArrayColumn(rowGroupMetadata.getSchema(), schemaPath)) {
continue;
}
if (IsPredicate.isNullOrEmpty(columnStatistics)) {
logger.debug("Statistics for {} column wasn't found within {} row group.", schemaPath, path);
return null;
}
for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
Object statsValue;
if (statisticsKind.getName().equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata);
} else if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata) - ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStatistics);
} else {
statsValue = columnStatistics.get(statisticsKind);
}
String columnStatisticsFieldName = AnalyzeColumnUtils.getColumnStatisticsFieldName(schemaPath.toExpr(), statisticsKind);
if (statsValue != null) {
schema.putIfAbsent(columnStatisticsFieldName, statsValue.getClass());
recordsTable.put(columnStatisticsFieldName, rowIndex, statsValue);
} else {
recordsTable.put(columnStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
}
}
}
// populates record list with row group metadata
for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
String metadataStatisticsFieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(statisticsKind);
Object statisticsValue = rowGroupMetadata.getStatistic(statisticsKind);
if (statisticsValue != null) {
schema.putIfAbsent(metadataStatisticsFieldName, statisticsValue.getClass());
recordsTable.put(metadataStatisticsFieldName, rowIndex, statisticsValue);
} else {
recordsTable.put(metadataStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
}
}
// populates record list internal columns
recordsTable.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, rowIndex, rowGroupMetadata.getSchema().jsonString());
recordsTable.put(columnNamesOptions.rowGroupStart(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START)));
recordsTable.put(columnNamesOptions.rowGroupLength(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
recordsTable.put(columnNamesOptions.lastModifiedTime(), rowIndex, String.valueOf(fileSystem.getFileStatus(path).getModificationTime()));
rowIndex++;
}
// DynamicPojoRecordReader requires LinkedHashMap with fields order
// which corresponds to the value position in record list.
LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
for (String s : recordsTable.rowKeySet()) {
Class<?> clazz = schema.get(s);
if (clazz != null) {
orderedSchema.put(s, clazz);
} else {
return null;
}
}
IntFunction<List<Object>> collectRecord = currentIndex -> orderedSchema.keySet().stream().map(column -> recordsTable.get(column, currentIndex)).map(value -> value != BaseParquetMetadataProvider.NULL_VALUE ? value : null).collect(Collectors.toList());
List<List<Object>> records = IntStream.range(0, rowIndex).mapToObj(collectRecord).collect(Collectors.toList());
DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
return new DirectGroupScan(reader, scanStats);
}
Aggregations