Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class ParquetTableMetadataUtils, method getRowGroupMetadata:
/**
* Returns {@link RowGroupMetadata} instance converted from specified parquet {@code rowGroupMetadata}.
*
* @param tableMetadata table metadata which contains row group metadata to convert
* @param rowGroupMetadata row group metadata to convert
* @param rgIndexInFile index of current row group within the file
* @param location location of file with current row group
* @return {@link RowGroupMetadata} instance converted from specified parquet {@code rowGroupMetadata}
*/
public static RowGroupMetadata getRowGroupMetadata(MetadataBase.ParquetTableMetadataBase tableMetadata,
    MetadataBase.RowGroupMetadata rowGroupMetadata, int rgIndexInFile, Path location) {
  Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = getRowGroupColumnStatistics(tableMetadata, rowGroupMetadata);
  List<StatisticsHolder<?>> rowGroupStatistics = new ArrayList<>();
  rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getRowCount(), TableStatisticsKind.ROW_COUNT));
  rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getStart(), new BaseStatisticsKind<>(ExactStatisticsConstants.START, true)));
  rowGroupStatistics.add(new StatisticsHolder<>(rowGroupMetadata.getLength(), new BaseStatisticsKind<>(ExactStatisticsConstants.LENGTH, true)));
  Map<SchemaPath, TypeProtos.MajorType> columns = getRowGroupFields(tableMetadata, rowGroupMetadata);
  Map<SchemaPath, TypeProtos.MajorType> intermediateColumns = getIntermediateFields(tableMetadata, rowGroupMetadata);
  TupleMetadata schema = new TupleSchema();
  columns.forEach((schemaPath, majorType) ->
      SchemaPathUtils.addColumnMetadata(schema, schemaPath, majorType, intermediateColumns));
  MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.ROW_GROUP).build();
  return RowGroupMetadata.builder()
      .tableInfo(TableInfo.UNKNOWN_TABLE_INFO)
      .metadataInfo(metadataInfo)
      .schema(schema)
      .columnsStatistics(columnsStatistics)
      .metadataStatistics(rowGroupStatistics)
      .hostAffinity(rowGroupMetadata.getHostAffinity())
      .rowGroupIndex(rgIndexInFile)
      .path(location)
      .build();
}
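For context, a minimal usage sketch: a hypothetical caller converts every parquet row group of one file in a loop. The parquetTableMetadata, rowGroups, and fileLocation variables are assumed to come from Drill's parquet metadata reader and are not part of the snippet above.

// Hypothetical usage sketch: convert all row groups of one file.
// parquetTableMetadata, rowGroups and fileLocation are assumed inputs.
List<RowGroupMetadata> rowGroupMetadataList = new ArrayList<>();
int rgIndex = 0;
for (MetadataBase.RowGroupMetadata rg : rowGroups) {
  rowGroupMetadataList.add(
      ParquetTableMetadataUtils.getRowGroupMetadata(parquetTableMetadata, rg, rgIndex++, fileLocation));
}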
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class ParquetTableMetadataUtils, method getPartitionMetadata:
/**
* Returns a {@link PartitionMetadata} instance produced by merging the specified {@link FileMetadata} list.
*
* @param partitionColumn partition column
* @param files list of files to be merged
* @return {@link PartitionMetadata} instance
*/
public static PartitionMetadata getPartitionMetadata(SchemaPath partitionColumn, List<FileMetadata> files) {
  Set<Path> locations = new HashSet<>();
  Set<SchemaPath> columns = new HashSet<>();
  for (FileMetadata file : files) {
    columns.addAll(file.getColumnsStatistics().keySet());
    locations.add(file.getPath());
  }
  FileMetadata fileMetadata = files.iterator().next();
  MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.PARTITION).build();
  return PartitionMetadata.builder()
      .tableInfo(fileMetadata.getTableInfo())
      .metadataInfo(metadataInfo)
      .column(partitionColumn)
      .schema(fileMetadata.getSchema())
      .columnsStatistics(TableMetadataUtils.mergeColumnsStatistics(files, columns, PARQUET_COLUMN_STATISTICS))
      .metadataStatistics(Collections.singletonList(new StatisticsHolder<>(
          TableStatisticsKind.ROW_COUNT.mergeStatistics(files), TableStatisticsKind.ROW_COUNT)))
      .partitionValues(Collections.emptyList())
      .locations(locations)
      .build();
}
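A minimal usage sketch, assuming the file metadata has already been grouped by the value of a partition column such as dir0; filesOfPartition is an assumed List<FileMetadata> built elsewhere.

// Hypothetical sketch: merge file metadata that shares one dir0 partition value.
SchemaPath partitionColumn = SchemaPath.getSimplePath("dir0");
PartitionMetadata partitionMetadata =
    ParquetTableMetadataUtils.getPartitionMetadata(partitionColumn, filesOfPartition);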
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class BaseParquetMetadataProvider, method combineToSegmentMetadata:
/**
* Returns a {@link SegmentMetadata} instance that combines the specified list of metadata.
*
* @param metadataList metadata to combine
* @param column segment column
* @param metadataLocations locations of metadata combined in resulting segment
* @param <T> type of metadata to combine
* @return {@link SegmentMetadata} from combined metadata
*/
private static <T extends BaseMetadata & LocationProvider> SegmentMetadata combineToSegmentMetadata(
    Collection<T> metadataList, SchemaPath column, Set<Path> metadataLocations) {
  List<StatisticsHolder<?>> segmentStatistics = Collections.singletonList(new StatisticsHolder<>(
      TableStatisticsKind.ROW_COUNT.mergeStatistics(metadataList), TableStatisticsKind.ROW_COUNT));
  // this code is used only to collect segment metadata to be used only during filtering,
  // so metadata identifier is not required here and in other places in this class
  MetadataInfo metadataInfo = MetadataInfo.builder().type(MetadataType.SEGMENT).build();
  T firstMetadata = metadataList.iterator().next();
  return SegmentMetadata.builder()
      .tableInfo(firstMetadata.getTableInfo())
      .metadataInfo(metadataInfo)
      .column(column)
      .schema(firstMetadata.getSchema())
      .path(firstMetadata.getPath().getParent())
      .columnsStatistics(TableMetadataUtils.mergeColumnsStatistics(
          metadataList, firstMetadata.getColumnsStatistics().keySet(), PARQUET_COLUMN_STATISTICS))
      .metadataStatistics(segmentStatistics)
      .partitionValues(Collections.emptyList())
      .locations(metadataLocations)
      .build();
}
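Because the method is private, it can only be invoked from inside BaseParquetMetadataProvider. A rough in-class sketch, assuming filesInSegment (a Collection<FileMetadata>) and fileLocations (a Set<Path>) are collected while the provider walks a directory:

// Hypothetical in-class call: combine the files under one directory into a
// segment keyed by the dir0 column; the inputs are assumed to be built elsewhere.
SegmentMetadata segmentMetadata =
    combineToSegmentMetadata(filesInSegment, SchemaPath.getSimplePath("dir0"), fileLocations);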
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class MetadataControllerBatch, method getRowGroupMetadata:
private RowGroupMetadata getRowGroupMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics,
    Map<SchemaPath, ColumnStatistics<?>> columnStatistics, int nestingLevel) {
  List<String> segmentColumns = popConfig.getContext().segmentColumns();
  String segmentKey = segmentColumns.size() > 0
      ? reader.column(segmentColumns.iterator().next()).scalar().getString()
      : MetadataInfo.DEFAULT_SEGMENT_KEY;
  List<String> partitionValues = segmentColumns.stream()
      .limit(nestingLevel - 2)
      .map(columnName -> reader.column(columnName).scalar().getString())
      .collect(Collectors.toList());
  Path path = new Path(reader.column(MetastoreAnalyzeConstants.LOCATION_FIELD).scalar().getString());
  int rowGroupIndex = Integer.parseInt(reader.column(columnNamesOptions.rowGroupIndex()).scalar().getString());
  String metadataIdentifier = MetadataIdentifierUtils.getRowGroupMetadataIdentifier(partitionValues, path, rowGroupIndex);
  MetadataInfo metadataInfo = MetadataInfo.builder()
      .type(MetadataType.ROW_GROUP)
      .key(segmentKey)
      .identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null))
      .build();
  return RowGroupMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(metadataInfo)
      .columnsStatistics(columnStatistics)
      .metadataStatistics(metadataStatistics)
      .hostAffinity(Collections.emptyMap())
      .rowGroupIndex(rowGroupIndex)
      .path(path)
      .lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString()))
      .schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString()))
      .build();
}
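Here the MetadataInfo key and identifier play different roles: the key is the value of the top-level segment column, while the identifier, built by MetadataIdentifierUtils from the remaining partition values, the file path, and the row group index, pins down the exact row group. A purely illustrative construction, with a hypothetical key value and the identifier taken from the method above:

// Illustrative only: the shape of the MetadataInfo built above.
// "2020" is a hypothetical value of the first segment column (e.g. dir0);
// metadataIdentifier is the value computed by MetadataIdentifierUtils above.
MetadataInfo rowGroupInfo = MetadataInfo.builder()
    .type(MetadataType.ROW_GROUP)
    .key("2020")
    .identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null))
    .build();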
Use of org.apache.drill.metastore.metadata.MetadataInfo in project drill by apache.
The class MetadataControllerBatch, method getSegmentMetadata:
private SegmentMetadata getSegmentMetadata(TupleReader reader, List<StatisticsHolder<?>> metadataStatistics,
    Map<SchemaPath, ColumnStatistics<?>> columnStatistics, int nestingLevel) {
  List<String> segmentColumns = popConfig.getContext().segmentColumns();
  String segmentKey = segmentColumns.size() > 0
      ? reader.column(segmentColumns.iterator().next()).scalar().getString()
      : MetadataInfo.DEFAULT_SEGMENT_KEY;
  // for the case of multi-value segments, there is no nesting,
  // and therefore all values should be used when forming metadata identifier
  if (popConfig.getContext().multiValueSegments()) {
    nestingLevel = segmentColumns.size();
  }
  List<String> allPartitionValues = segmentColumns.stream()
      .limit(nestingLevel)
      .map(columnName -> reader.column(columnName).scalar().getString())
      .collect(Collectors.toList());
  String metadataIdentifier = MetadataIdentifierUtils.getMetadataIdentifierKey(allPartitionValues);
  MetadataInfo metadataInfo = MetadataInfo.builder()
      .type(MetadataType.SEGMENT)
      .key(segmentKey)
      .identifier(StringUtils.defaultIfEmpty(metadataIdentifier, null))
      .build();
  int segmentLevel = nestingLevel - 1;
  // for the case of multi-value segments, there is no nesting,
  // so all partition column values should be used
  List<String> partitionValues = popConfig.getContext().multiValueSegments()
      ? allPartitionValues
      : Collections.singletonList(allPartitionValues.get(segmentLevel));
  return SegmentMetadata.builder()
      .tableInfo(tableInfo)
      .metadataInfo(metadataInfo)
      .columnsStatistics(columnStatistics)
      .metadataStatistics(metadataStatistics)
      .path(new Path(reader.column(MetastoreAnalyzeConstants.LOCATION_FIELD).scalar().getString()))
      .locations(getIncomingLocations(reader))
      .column(segmentColumns.size() > 0 ? SchemaPath.getSimplePath(segmentColumns.get(segmentLevel)) : null)
      .partitionValues(partitionValues)
      .lastModifiedTime(Long.parseLong(reader.column(columnNamesOptions.lastModifiedTime()).scalar().getString()))
      .schema(TupleMetadata.of(reader.column(MetastoreAnalyzeConstants.SCHEMA_FIELD).scalar().getString()))
      .build();
}
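A worked example, with hypothetical column values, of how the two partitionValues branches above differ:

// Hypothetical walk-through of the partitionValues logic above.
// Nested single-value segments, segmentColumns = [dir0, dir1], nestingLevel = 2:
//   allPartitionValues = ["2020", "Q1"], segmentLevel = 1,
//   partitionValues    = ["Q1"]            (only this segment's own column value)
// Multi-value segments (multiValueSegments() == true):
//   nestingLevel is reset to segmentColumns.size(), so
//   partitionValues    = allPartitionValues = ["2020", "Q1"]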