Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.
The class HiveFunctionRegistry, method register.
private <I> void register(Class<? extends I> clazz, Multimap<String, Class<? extends I>> methods) {
  Description desc = clazz.getAnnotation(Description.class);
  Stream<String> namesStream;
  if (desc != null) {
    namesStream = Stream.of(desc.name().split(","))
        .map(String::trim);
  } else {
    namesStream = Stream.of(clazz)
        .map(Class::getName)
        .map(name -> name.replace('.', '_'));
  }

  // Checks whether any of the specified function names should be replaced
  // using the FUNCTION_REPLACE_MAP map.
  namesStream.map(String::toLowerCase)
      .map(functionName -> FUNCTION_REPLACE_MAP.getOrDefault(functionName, functionName))
      .forEach(name -> methods.put(name, clazz));

  UDFType type = clazz.getAnnotation(UDFType.class);
  if (type != null && !type.deterministic()) {
    nonDeterministicUDFs.add(clazz);
  }
}
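
A minimal standalone sketch (my illustration, not Drill code) of why a Multimap suits this registry: register() above may put the same UDF class under several comma-separated names, and overloaded UDFs may share one name. The UDF classes below are hypothetical placeholders; the imports assume Drill's shaded Guava jar (plain com.google.common.collect works for a standalone run).

import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;

public class RegistrySketch {
  // hypothetical stand-ins for Hive UDF implementation classes
  static class ConcatWsUdf {}
  static class AbsIntUdf {}
  static class AbsDoubleUdf {}

  public static void main(String[] args) {
    Multimap<String, Class<?>> methods = ArrayListMultimap.create();
    // one class registered under two comma-separated names, as register() does
    methods.put("concat_ws", ConcatWsUdf.class);
    methods.put("concatws", ConcatWsUdf.class);
    // two classes sharing one name: the Multimap keeps both, no manual List bookkeeping
    methods.put("abs", AbsIntUdf.class);
    methods.put("abs", AbsDoubleUdf.class);
    System.out.println(methods.get("abs")); // both AbsIntUdf and AbsDoubleUdf
  }
}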
Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.
The class AbstractParquetGroupScan, method modifyFileSelection.
// limit push-down methods end

// helper method used for partition pruning and filter push-down
@Override
public void modifyFileSelection(FileSelection selection) {
  super.modifyFileSelection(selection);

  List<Path> files = selection.getFiles();
  fileSet = new HashSet<>(files);
  entries = new ArrayList<>(files.size());
  entries.addAll(files.stream()
      .map(ReadEntryWithPath::new)
      .collect(Collectors.toList()));

  Multimap<Path, RowGroupMetadata> newRowGroups = LinkedListMultimap.create();
  if (!getRowGroupsMetadata().isEmpty()) {
    getRowGroupsMetadata().entries().stream()
        .filter(entry -> fileSet.contains(entry.getKey()))
        .forEachOrdered(entry -> newRowGroups.put(entry.getKey(), entry.getValue()));
  }
  this.rowGroups = newRowGroups;

  tableMetadata = TableMetadataUtils.updateRowCount(getTableMetadata(), getRowGroupsMetadata().values());

  if (!getFilesMetadata().isEmpty()) {
    this.files = getFilesMetadata().entrySet().stream()
        .filter(entry -> fileSet.contains(entry.getKey()))
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
  } else {
    this.files = Collections.emptyMap();
  }

  List<PartitionMetadata> newPartitions = new ArrayList<>();
  if (!getPartitionsMetadata().isEmpty()) {
    for (PartitionMetadata entry : getPartitionsMetadata()) {
      for (Path partLocation : entry.getLocations()) {
        if (fileSet.contains(partLocation)) {
          newPartitions.add(entry);
          break;
        }
      }
    }
  }
  partitions = newPartitions;

  if (!getSegmentsMetadata().isEmpty()) {
    this.segments = getSegmentsMetadata().entrySet().stream()
        .filter(entry -> fileSet.contains(entry.getKey()))
        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
  }

  rowGroupInfos = null;
}
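
As a side note, the rebuild loop above (copy matching entries into a fresh LinkedListMultimap) could also be expressed with Guava's Multimaps.filterKeys view, materialized via LinkedListMultimap.create. A sketch under that assumption, with toy string keys standing in for Hadoop Path and RowGroupMetadata:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap;
import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;
import org.apache.drill.shaded.guava.com.google.common.collect.Multimaps;

public class FilterKeysSketch {
  static <K, V> Multimap<K, V> keepKeys(Multimap<K, V> source, Set<K> keep) {
    // filterKeys returns a live view; copying it materializes the result
    // and preserves the source's iteration order, like the forEachOrdered loop above
    return LinkedListMultimap.create(Multimaps.filterKeys(source, keep::contains));
  }

  public static void main(String[] args) {
    Multimap<String, Integer> rowGroups = LinkedListMultimap.create();
    rowGroups.put("/data/a.parquet", 0);
    rowGroups.put("/data/a.parquet", 1);
    rowGroups.put("/data/b.parquet", 0);
    Set<String> fileSet = new HashSet<>(Arrays.asList("/data/a.parquet"));
    System.out.println(keepKeys(rowGroups, fileSet)); // {/data/a.parquet=[0, 1]}
  }
}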
Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.
The class MetastoreAnalyzeTableHandler, method convertToDrel.
/**
* Converts to Drill logical plan
*/
private DrillRel convertToDrel(RelNode relNode, SqlMetastoreAnalyzeTable sqlAnalyzeTable,
    DrillTableInfo drillTableInfo) throws ForemanSetupException, IOException {
  RelBuilder relBuilder = LOGICAL_BUILDER.create(relNode.getCluster(), null);
  DrillTable table = drillTableInfo.drillTable();
  AnalyzeInfoProvider analyzeInfoProvider = table.getGroupScan().getAnalyzeInfoProvider();

  List<String> schemaPath = drillTableInfo.schemaPath();
  String pluginName = schemaPath.get(0);
  String workspaceName = Strings.join(schemaPath.subList(1, schemaPath.size()), AbstractSchema.SCHEMA_SEPARATOR);
  String tableName = drillTableInfo.tableName();

  TableInfo tableInfo = TableInfo.builder()
      .name(tableName)
      .owner(table.getUserName())
      .type(analyzeInfoProvider.getTableTypeName())
      .storagePlugin(pluginName)
      .workspace(workspaceName)
      .build();

  ColumnNamesOptions columnNamesOptions = new ColumnNamesOptions(context.getOptions());

  List<String> segmentColumns = analyzeInfoProvider.getSegmentColumns(table, columnNamesOptions).stream()
      .map(SchemaPath::getRootSegmentPath)
      .collect(Collectors.toList());
  List<NamedExpression> segmentExpressions = segmentColumns.stream()
      .map(partitionName -> new NamedExpression(SchemaPath.getSimplePath(partitionName),
          FieldReference.getWithQuotedRef(partitionName)))
      .collect(Collectors.toList());

  List<MetadataInfo> rowGroupsInfo = Collections.emptyList();
  List<MetadataInfo> filesInfo = Collections.emptyList();
  Multimap<Integer, MetadataInfo> segments = ArrayListMultimap.create();

  BasicTablesRequests basicRequests;
  try {
    basicRequests = context.getMetastoreRegistry().get()
        .tables()
        .basicRequests();
  } catch (MetastoreException e) {
    logger.error("Error when obtaining Metastore instance for table {}", tableName, e);
    DrillRel convertedRelNode = convertToRawDrel(
        relBuilder.values(
            new String[] { MetastoreAnalyzeConstants.OK_FIELD_NAME, MetastoreAnalyzeConstants.SUMMARY_FIELD_NAME },
            false, e.getMessage())
        .build());
    return new DrillScreenRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode);
  }

  MetadataType metadataLevel = getMetadataType(sqlAnalyzeTable);
  List<SchemaPath> interestingColumns = sqlAnalyzeTable.getFieldNames();
  MetastoreTableInfo metastoreTableInfo = basicRequests.metastoreTableInfo(tableInfo);

  List<MetadataInfo> allMetaToHandle = null;
  List<MetadataInfo> metadataToRemove = new ArrayList<>();

  // checks whether an incremental analyze may be produced
  if (metastoreTableInfo.isExists()) {
    RelNode finalRelNode = relNode;
    CheckedSupplier<TableScan, SqlUnsupportedException> tableScanSupplier =
        () -> DrillRelOptUtil.findScan(convertToDrel(finalRelNode.getInput(0)));

    MetadataInfoCollector metadataInfoCollector = analyzeInfoProvider.getMetadataInfoCollector(basicRequests,
        tableInfo, (FormatSelection) table.getSelection(), context.getPlannerSettings(), tableScanSupplier,
        interestingColumns, metadataLevel, segmentColumns.size());

    if (!metadataInfoCollector.isOutdated()) {
      DrillRel convertedRelNode = convertToRawDrel(
          relBuilder.values(
              new String[] { MetastoreAnalyzeConstants.OK_FIELD_NAME, MetastoreAnalyzeConstants.SUMMARY_FIELD_NAME },
              false, "Table metadata is up to date, analyze wasn't performed.")
          .build());
      return new DrillScreenRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode);
    }

    // updates the scan to read updated / new files; removed files are passed into the metadata handler
    relNode = relNode.copy(relNode.getTraitSet(), Collections.singletonList(metadataInfoCollector.getPrunedScan()));

    filesInfo = metadataInfoCollector.getFilesInfo();
    segments = metadataInfoCollector.getSegmentsInfo();
    rowGroupsInfo = metadataInfoCollector.getRowGroupsInfo();
    allMetaToHandle = metadataInfoCollector.getAllMetaToHandle();
    metadataToRemove = metadataInfoCollector.getMetadataToRemove();
  }

  // Step 2: constructs the plan for producing the analyze
  DrillRel convertedRelNode = convertToRawDrel(relNode);
  boolean createNewAggregations = true;

  // list of columns for which statistics should be collected: interesting columns + segment columns
  List<SchemaPath> statisticsColumns = interestingColumns == null ? null : new ArrayList<>(interestingColumns);
  if (statisticsColumns != null) {
    segmentColumns.stream()
        .map(SchemaPath::getSimplePath)
        .forEach(statisticsColumns::add);
  }

  SchemaPath locationField = analyzeInfoProvider.getLocationField(columnNamesOptions);

  if (analyzeInfoProvider.supportsMetadataType(MetadataType.ROW_GROUP) && metadataLevel.includes(MetadataType.ROW_GROUP)) {
    MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
        .tableInfo(tableInfo)
        .metadataToHandle(rowGroupsInfo)
        .metadataType(MetadataType.ROW_GROUP)
        .depthLevel(segmentExpressions.size())
        .segmentColumns(segmentColumns)
        .build();
    convertedRelNode = getRowGroupAggRelNode(segmentExpressions, convertedRelNode, createNewAggregations,
        statisticsColumns, handlerContext);
    createNewAggregations = false;
    locationField = SchemaPath.getSimplePath(MetastoreAnalyzeConstants.LOCATION_FIELD);
  }

  if (analyzeInfoProvider.supportsMetadataType(MetadataType.FILE) && metadataLevel.includes(MetadataType.FILE)) {
    MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
        .tableInfo(tableInfo)
        .metadataToHandle(filesInfo)
        .metadataType(MetadataType.FILE)
        .depthLevel(segmentExpressions.size())
        .segmentColumns(segmentColumns)
        .build();
    convertedRelNode = getFileAggRelNode(segmentExpressions, convertedRelNode, createNewAggregations,
        statisticsColumns, locationField, handlerContext);
    locationField = SchemaPath.getSimplePath(MetastoreAnalyzeConstants.LOCATION_FIELD);
    createNewAggregations = false;
  }

  if (analyzeInfoProvider.supportsMetadataType(MetadataType.SEGMENT) && metadataLevel.includes(MetadataType.SEGMENT)) {
    for (int i = segmentExpressions.size(); i > 0; i--) {
      MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
          .tableInfo(tableInfo)
          .metadataToHandle(new ArrayList<>(segments.get(i - 1)))
          .metadataType(MetadataType.SEGMENT)
          .depthLevel(i)
          .segmentColumns(segmentColumns.subList(0, i))
          .build();
      convertedRelNode = getSegmentAggRelNode(segmentExpressions, convertedRelNode, createNewAggregations,
          statisticsColumns, locationField, i, handlerContext);
      locationField = SchemaPath.getSimplePath(MetastoreAnalyzeConstants.LOCATION_FIELD);
      createNewAggregations = false;
    }
  }

  if (analyzeInfoProvider.supportsMetadataType(MetadataType.TABLE) && metadataLevel.includes(MetadataType.TABLE)) {
    MetadataHandlerContext handlerContext = MetadataHandlerContext.builder()
        .tableInfo(tableInfo)
        .metadataToHandle(Collections.emptyList())
        .metadataType(MetadataType.TABLE)
        .depthLevel(segmentExpressions.size())
        .segmentColumns(segmentColumns)
        .build();
    convertedRelNode = getTableAggRelNode(convertedRelNode, createNewAggregations, statisticsColumns,
        locationField, handlerContext);
  } else {
    throw new IllegalStateException("Analyze table with NONE level");
  }

  boolean useStatistics = context.getOptions().getOption(PlannerSettings.STATISTICS_USE);
  SqlNumericLiteral samplePercentLiteral = sqlAnalyzeTable.getSamplePercent();
  double samplePercent = samplePercentLiteral == null ? 100.0 : samplePercentLiteral.intValue(true);

  // Step 3: adds rel nodes for producing the statistics analyze if required
  RelNode analyzeRel = useStatistics
      ? new DrillAnalyzeRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertToRawDrel(relNode), samplePercent)
      : convertToRawDrel(relBuilder.values(new String[] { "" }, "").build());

  MetadataControllerContext metadataControllerContext = MetadataControllerContext.builder()
      .tableInfo(tableInfo)
      .metastoreTableInfo(metastoreTableInfo)
      .location(((FormatSelection) table.getSelection()).getSelection().getSelectionRoot())
      .interestingColumns(interestingColumns)
      .segmentColumns(segmentColumns)
      .metadataToHandle(allMetaToHandle)
      .metadataToRemove(metadataToRemove)
      .analyzeMetadataLevel(metadataLevel)
      .build();

  convertedRelNode = new MetadataControllerRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(),
      convertedRelNode, analyzeRel, metadataControllerContext);

  return new DrillScreenRel(convertedRelNode.getCluster(), convertedRelNode.getTraitSet(), convertedRelNode);
}
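
One detail worth calling out: segments is a Multimap<Integer, MetadataInfo> keyed by segment depth level, and Guava's Multimap.get() returns an empty collection rather than null for an absent key, which is why new ArrayList<>(segments.get(i - 1)) in the SEGMENT branch needs no null check. A minimal sketch (toy values, not Drill code):

import java.util.ArrayList;
import java.util.List;
import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;

public class SegmentsSketch {
  public static void main(String[] args) {
    // toy stand-in for Multimap<Integer, MetadataInfo>, keyed by depth level
    Multimap<Integer, String> segments = ArrayListMultimap.create();
    segments.put(0, "dir0=2023");
    segments.put(1, "dir0=2023/dir1=01");
    segments.put(1, "dir0=2023/dir1=02");
    // mirrors the SEGMENT loop: deepest level first
    for (int i = 2; i > 0; i--) {
      List<String> metadataToHandle = new ArrayList<>(segments.get(i - 1));
      System.out.println("depthLevel " + i + " handles " + metadataToHandle);
    }
    System.out.println(segments.get(7)); // [] empty collection, never null
  }
}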
Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.
The class MetadataControllerBatch, method getColumnStatistics.
private Map<SchemaPath, ColumnStatistics<?>> getColumnStatistics(TupleReader reader,
    TupleMetadata columnMetadata, Long rowCount) {
  Multimap<String, StatisticsHolder<?>> columnStatistics = ArrayListMultimap.create();
  Map<String, TypeProtos.MinorType> columnTypes = new HashMap<>();
  for (ColumnMetadata column : columnMetadata) {
    if (AnalyzeColumnUtils.isColumnStatisticsField(column.name())) {
      String fieldName = AnalyzeColumnUtils.getColumnName(column.name());
      StatisticsKind<?> statisticsKind = AnalyzeColumnUtils.getStatisticsKind(column.name());
      columnStatistics.put(fieldName,
          new StatisticsHolder<>(getConvertedColumnValue(reader.column(column.name())), statisticsKind));
      if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.MIN_VALUE.getName())
          || statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.MAX_VALUE.getName())) {
        columnTypes.putIfAbsent(fieldName, column.type());
      }
    }
  }

  // derives NULLS_COUNT from the row count and NON_NULL_VALUES_COUNT,
  // so it can be used during filter push-down
  if (rowCount != null) {
    Map<String, StatisticsHolder<?>> nullsCountColumnStatistics = new HashMap<>();
    columnStatistics.asMap().forEach((key, value) ->
        value.stream()
            .filter(statisticsHolder -> statisticsHolder.getStatisticsKind() == ColumnStatisticsKind.NON_NULL_VALUES_COUNT)
            .findAny()
            .map(statisticsHolder -> (Long) statisticsHolder.getStatisticsValue())
            .ifPresent(nonNullCount ->
                nullsCountColumnStatistics.put(key,
                    new StatisticsHolder<>(rowCount - nonNullCount, ColumnStatisticsKind.NULLS_COUNT))));
    nullsCountColumnStatistics.forEach(columnStatistics::put);
  }

  Map<SchemaPath, ColumnStatistics<?>> resultingStats = new HashMap<>();
  columnStatistics.asMap().forEach((fieldName, statisticsHolders) ->
      resultingStats.put(SchemaPath.parseFromString(fieldName),
          new ColumnStatistics<>(statisticsHolders, columnTypes.get(fieldName))));
  return resultingStats;
}
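
A sketch of the asMap() pattern above, with plain strings standing in for StatisticsHolder values and hypothetical column names: several statistics kinds accumulate under one column key, and asMap() exposes them as Map<String, Collection<String>> for per-column processing, much as getColumnStatistics does when it builds the resulting map.

import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;

public class ColumnStatsSketch {
  public static void main(String[] args) {
    Multimap<String, String> columnStatistics = ArrayListMultimap.create();
    // several statistics kinds accumulate under a single column key
    columnStatistics.put("o_orderkey", "minValue=1");
    columnStatistics.put("o_orderkey", "maxValue=60000");
    columnStatistics.put("o_orderkey", "nonNullValuesCount=1500");
    columnStatistics.put("o_comment", "nonNullValuesCount=1499");
    // asMap() groups the values per key: Map<String, Collection<String>>
    columnStatistics.asMap().forEach((column, stats) ->
        System.out.println(column + " -> " + stats));
  }
}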
Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.
The class ConvertMetadataAggregateToDirectScanRule, method populateRecords.
/**
* Populates records list with row group metadata.
*/
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns, Map<String, Class<?>> schema,
    DrillScanRel scan, ColumnNamesOptions columnNamesOptions) throws IOException {
  ParquetGroupScan parquetGroupScan = (ParquetGroupScan) scan.getGroupScan();
  DrillTable drillTable = Utilities.getDrillTable(scan.getTable());

  Multimap<Path, RowGroupMetadata> rowGroupsMetadataMap = parquetGroupScan.getMetadataProvider().getRowGroupsMetadataMap();
  Table<String, Integer, Object> recordsTable = HashBasedTable.create();

  FormatSelection selection = (FormatSelection) drillTable.getSelection();
  List<String> partitionColumnNames = ColumnExplorer.getPartitionColumnNames(selection.getSelection(), columnNamesOptions);

  FileSystem rawFs = selection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
  DrillFileSystem fileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), rawFs.getConf());

  int rowIndex = 0;
  for (Map.Entry<Path, RowGroupMetadata> rgEntry : rowGroupsMetadataMap.entries()) {
    Path path = rgEntry.getKey();
    RowGroupMetadata rowGroupMetadata = rgEntry.getValue();
    List<String> partitionValues = ColumnExplorer.listPartitionValues(path, selection.getSelection().getSelectionRoot(), false);
    for (int i = 0; i < partitionValues.size(); i++) {
      String partitionColumnName = partitionColumnNames.get(i);
      recordsTable.put(partitionColumnName, rowIndex, partitionValues.get(i));
    }
    recordsTable.put(MetastoreAnalyzeConstants.LOCATION_FIELD, rowIndex, ImplicitFileColumns.FQN.getValue(path));
    recordsTable.put(columnNamesOptions.rowGroupIndex(), rowIndex, String.valueOf(rowGroupMetadata.getRowGroupIndex()));

    if (interestingColumns == null) {
      interestingColumns = rowGroupMetadata.getColumnsStatistics().keySet();
    }

    // populates the record list with row group column metadata
    for (SchemaPath schemaPath : interestingColumns) {
      ColumnStatistics<?> columnStatistics = rowGroupMetadata.getColumnsStatistics().get(schemaPath);

      // do not gather statistics for array columns, as they are not supported by the Metastore
      if (containsArrayColumn(rowGroupMetadata.getSchema(), schemaPath)) {
        continue;
      }

      if (IsPredicate.isNullOrEmpty(columnStatistics)) {
        logger.debug("Statistics for {} column wasn't found within {} row group.", schemaPath, path);
        return null;
      }

      for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
        Object statsValue;
        if (statisticsKind.getName().equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
          statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata);
        } else if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
          statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata)
              - ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStatistics);
        } else {
          statsValue = columnStatistics.get(statisticsKind);
        }
        String columnStatisticsFieldName = AnalyzeColumnUtils.getColumnStatisticsFieldName(schemaPath.toExpr(), statisticsKind);
        if (statsValue != null) {
          schema.putIfAbsent(columnStatisticsFieldName, statsValue.getClass());
          recordsTable.put(columnStatisticsFieldName, rowIndex, statsValue);
        } else {
          recordsTable.put(columnStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
        }
      }
    }

    // populates the record list with row group metadata
    for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
      String metadataStatisticsFieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(statisticsKind);
      Object statisticsValue = rowGroupMetadata.getStatistic(statisticsKind);
      if (statisticsValue != null) {
        schema.putIfAbsent(metadataStatisticsFieldName, statisticsValue.getClass());
        recordsTable.put(metadataStatisticsFieldName, rowIndex, statisticsValue);
      } else {
        recordsTable.put(metadataStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
      }
    }

    // populates the record list with internal columns
    recordsTable.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, rowIndex, rowGroupMetadata.getSchema().jsonString());
    recordsTable.put(columnNamesOptions.rowGroupStart(), rowIndex,
        Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START)));
    recordsTable.put(columnNamesOptions.rowGroupLength(), rowIndex,
        Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
    recordsTable.put(columnNamesOptions.lastModifiedTime(), rowIndex,
        String.valueOf(fileSystem.getFileStatus(path).getModificationTime()));

    rowIndex++;
  }

  // DynamicPojoRecordReader requires a LinkedHashMap whose field order
  // corresponds to the value positions in the record list.
  LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
  for (String s : recordsTable.rowKeySet()) {
    Class<?> clazz = schema.get(s);
    if (clazz != null) {
      orderedSchema.put(s, clazz);
    } else {
      return null;
    }
  }

  IntFunction<List<Object>> collectRecord = currentIndex -> orderedSchema.keySet().stream()
      .map(column -> recordsTable.get(column, currentIndex))
      .map(value -> value != BaseParquetMetadataProvider.NULL_VALUE ? value : null)
      .collect(Collectors.toList());

  List<List<Object>> records = IntStream.range(0, rowIndex)
      .mapToObj(collectRecord)
      .collect(Collectors.toList());

  DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
  ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
  return new DirectGroupScan(reader, scanStats);
}
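
Alongside the Multimap, this method leans on Guava's Table: recordsTable is keyed by field name (row key) and record index (column key), so a record is reassembled by reading one cell per field at a fixed index. A minimal sketch with toy data (not Drill code):

import org.apache.drill.shaded.guava.com.google.common.collect.HashBasedTable;
import org.apache.drill.shaded.guava.com.google.common.collect.Table;

public class RecordsTableSketch {
  public static void main(String[] args) {
    Table<String, Integer, Object> recordsTable = HashBasedTable.create();
    // row key = field name, column key = record index, value = cell
    recordsTable.put("location", 0, "/data/a.parquet");
    recordsTable.put("rgi", 0, "0");
    recordsTable.put("location", 1, "/data/b.parquet");
    recordsTable.put("rgi", 1, "2");
    // reassemble records the way collectRecord does: one cell per field per index
    for (int rowIndex = 0; rowIndex < 2; rowIndex++) {
      for (String field : recordsTable.rowKeySet()) {
        System.out.println(field + "[" + rowIndex + "] = " + recordsTable.get(field, rowIndex));
      }
    }
  }
}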