Use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.
The class ConvertMetadataAggregateToDirectScanRule, method populateRecords:
/**
 * Populates the records list with row group metadata.
 */
private DirectGroupScan populateRecords(Collection<SchemaPath> interestingColumns, Map<String, Class<?>> schema,
    DrillScanRel scan, ColumnNamesOptions columnNamesOptions) throws IOException {
  ParquetGroupScan parquetGroupScan = (ParquetGroupScan) scan.getGroupScan();
  DrillTable drillTable = Utilities.getDrillTable(scan.getTable());
  Multimap<Path, RowGroupMetadata> rowGroupsMetadataMap = parquetGroupScan.getMetadataProvider().getRowGroupsMetadataMap();
  Table<String, Integer, Object> recordsTable = HashBasedTable.create();
  FormatSelection selection = (FormatSelection) drillTable.getSelection();
  List<String> partitionColumnNames = ColumnExplorer.getPartitionColumnNames(selection.getSelection(), columnNamesOptions);
  FileSystem rawFs = selection.getSelection().getSelectionRoot().getFileSystem(new Configuration());
  DrillFileSystem fileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), rawFs.getConf());
  int rowIndex = 0;
  for (Map.Entry<Path, RowGroupMetadata> rgEntry : rowGroupsMetadataMap.entries()) {
    Path path = rgEntry.getKey();
    RowGroupMetadata rowGroupMetadata = rgEntry.getValue();
    List<String> partitionValues = ColumnExplorer.listPartitionValues(path, selection.getSelection().getSelectionRoot(), false);
    for (int i = 0; i < partitionValues.size(); i++) {
      String partitionColumnName = partitionColumnNames.get(i);
      recordsTable.put(partitionColumnName, rowIndex, partitionValues.get(i));
    }
    recordsTable.put(MetastoreAnalyzeConstants.LOCATION_FIELD, rowIndex, ImplicitFileColumns.FQN.getValue(path));
    recordsTable.put(columnNamesOptions.rowGroupIndex(), rowIndex, String.valueOf(rowGroupMetadata.getRowGroupIndex()));
    if (interestingColumns == null) {
      interestingColumns = rowGroupMetadata.getColumnsStatistics().keySet();
    }
    // populates the records list with row group column metadata
    for (SchemaPath schemaPath : interestingColumns) {
      ColumnStatistics<?> columnStatistics = rowGroupMetadata.getColumnsStatistics().get(schemaPath);
      // do not gather statistics for array columns, as this is not supported by the Metastore
      if (containsArrayColumn(rowGroupMetadata.getSchema(), schemaPath)) {
        continue;
      }
      if (IsPredicate.isNullOrEmpty(columnStatistics)) {
        logger.debug("Statistics for {} column wasn't found within {} row group.", schemaPath, path);
        return null;
      }
      for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.COLUMN_STATISTICS_FUNCTIONS.keySet()) {
        Object statsValue;
        if (statisticsKind.getName().equalsIgnoreCase(TableStatisticsKind.ROW_COUNT.getName())) {
          statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata);
        } else if (statisticsKind.getName().equalsIgnoreCase(ColumnStatisticsKind.NON_NULL_VALUES_COUNT.getName())) {
          statsValue = TableStatisticsKind.ROW_COUNT.getValue(rowGroupMetadata)
              - ColumnStatisticsKind.NULLS_COUNT.getFrom(columnStatistics);
        } else {
          statsValue = columnStatistics.get(statisticsKind);
        }
        String columnStatisticsFieldName = AnalyzeColumnUtils.getColumnStatisticsFieldName(schemaPath.toExpr(), statisticsKind);
        if (statsValue != null) {
          schema.putIfAbsent(columnStatisticsFieldName, statsValue.getClass());
          recordsTable.put(columnStatisticsFieldName, rowIndex, statsValue);
        } else {
          recordsTable.put(columnStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
        }
      }
    }
    // populates the records list with row group metadata
    for (StatisticsKind<?> statisticsKind : AnalyzeColumnUtils.META_STATISTICS_FUNCTIONS.keySet()) {
      String metadataStatisticsFieldName = AnalyzeColumnUtils.getMetadataStatisticsFieldName(statisticsKind);
      Object statisticsValue = rowGroupMetadata.getStatistic(statisticsKind);
      if (statisticsValue != null) {
        schema.putIfAbsent(metadataStatisticsFieldName, statisticsValue.getClass());
        recordsTable.put(metadataStatisticsFieldName, rowIndex, statisticsValue);
      } else {
        recordsTable.put(metadataStatisticsFieldName, rowIndex, BaseParquetMetadataProvider.NULL_VALUE);
      }
    }
    // populates the records list with internal columns
    recordsTable.put(MetastoreAnalyzeConstants.SCHEMA_FIELD, rowIndex, rowGroupMetadata.getSchema().jsonString());
    recordsTable.put(columnNamesOptions.rowGroupStart(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.START)));
    recordsTable.put(columnNamesOptions.rowGroupLength(), rowIndex, Long.toString(rowGroupMetadata.getStatistic(() -> ExactStatisticsConstants.LENGTH)));
    recordsTable.put(columnNamesOptions.lastModifiedTime(), rowIndex, String.valueOf(fileSystem.getFileStatus(path).getModificationTime()));
    rowIndex++;
  }
  // DynamicPojoRecordReader requires a LinkedHashMap whose field order
  // corresponds to the value positions in the records list.
  LinkedHashMap<String, Class<?>> orderedSchema = new LinkedHashMap<>();
  for (String s : recordsTable.rowKeySet()) {
    Class<?> clazz = schema.get(s);
    if (clazz != null) {
      orderedSchema.put(s, clazz);
    } else {
      return null;
    }
  }
  IntFunction<List<Object>> collectRecord = currentIndex -> orderedSchema.keySet().stream()
      .map(column -> recordsTable.get(column, currentIndex))
      .map(value -> value != BaseParquetMetadataProvider.NULL_VALUE ? value : null)
      .collect(Collectors.toList());
  List<List<Object>> records = IntStream.range(0, rowIndex).mapToObj(collectRecord).collect(Collectors.toList());
  DynamicPojoRecordReader<?> reader = new DynamicPojoRecordReader<>(orderedSchema, records);
  ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, records.size(), 1, schema.size());
  return new DirectGroupScan(reader, scanStats);
}
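Every usage in this listing starts from the same unwrapping step: DrillTable.getSelection() is cast to FormatSelection, and its nested FileSelection supplies the selection root. A minimal sketch of that step, assuming a hypothetical helper name (resolveSelectionRoot) and adding null checks that populateRecords itself can skip because the rule only fires for Parquet group scans:

// Hedged sketch; resolveSelectionRoot is a hypothetical helper, not Drill API.
private static Path resolveSelectionRoot(DrillTable table) {
  Object selection = table.getSelection();
  if (!(selection instanceof FormatSelection)) {
    return null; // not a file-format-backed table
  }
  // FormatSelection pairs a FormatPluginConfig with the underlying FileSelection
  return ((FormatSelection) selection).getSelection().getSelectionRoot();
}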
Use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.
The class ConvertCountToDirectScanRule, method onMatch:
@Override
public void onMatch(RelOptRuleCall call) {
  final Aggregate agg = call.rel(0);
  final TableScan scan = call.rel(call.rels.length - 1);
  final Project project = call.rels.length == 3 ? (Project) call.rel(1) : null;
  // additional qualifying checks are done further below
  if (agg.getGroupCount() > 0 || agg.containsDistinctCall()) {
    return;
  }
  DrillTable drillTable = DrillRelOptUtil.getDrillTable(scan);
  if (drillTable == null) {
    logger.debug("Rule does not apply since an eligible drill table instance was not found.");
    return;
  }
  Object selection = drillTable.getSelection();
  if (!(selection instanceof FormatSelection)) {
    logger.debug("Rule does not apply since only Parquet file format is eligible.");
    return;
  }
  PlannerSettings settings = call.getPlanner().getContext().unwrap(PlannerSettings.class);
  // The rule is applicable only if the statistics for row count and null count are available from the metadata.
  FormatSelection formatSelection = (FormatSelection) selection;
  // The rule cannot be applied if the selection had a wildcard, since the total row count cannot be read from the parent directory.
  if (formatSelection.getSelection().hadWildcard()) {
    logger.debug("Rule does not apply when there is a wild card since the COUNT could not be determined from metadata.");
    return;
  }
  Pair<Boolean, Metadata_V4.MetadataSummary> status = checkMetadataForScanStats(settings, drillTable, formatSelection);
  if (!status.getLeft()) {
    logger.debug("Rule does not apply since MetadataSummary metadata was not found.");
    return;
  }
  Metadata_V4.MetadataSummary metadataSummary = status.getRight();
  Map<String, Long> result = collectCounts(settings, metadataSummary, agg, scan, project);
  logger.trace("Calculated the following aggregate counts: {}", result);
  // if the counts could not be determined, the rule won't be applied
  if (result.isEmpty()) {
    logger.debug("Rule does not apply since one or more COUNTs could not be determined from metadata.");
    return;
  }
  Path summaryFileName = Metadata.getSummaryFileName(formatSelection.getSelection().getSelectionRoot());
  final RelDataType scanRowType = CountToDirectScanUtils.constructDataType(agg, result.keySet());
  final DynamicPojoRecordReader<Long> reader = new DynamicPojoRecordReader<>(
      CountToDirectScanUtils.buildSchema(scanRowType.getFieldNames()),
      Collections.singletonList(new ArrayList<>(result.values())));
  final ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, 1, 1, scanRowType.getFieldCount());
  final MetadataDirectGroupScan directScan = new MetadataDirectGroupScan(reader, summaryFileName, 1, scanStats, true, false);
  final DrillDirectScanRel newScan = new DrillDirectScanRel(scan.getCluster(), scan.getTraitSet().plus(DrillRel.DRILL_LOGICAL), directScan, scanRowType);
  final DrillProjectRel newProject = new DrillProjectRel(agg.getCluster(), agg.getTraitSet().plus(DrillRel.DRILL_LOGICAL), newScan, CountToDirectScanUtils.prepareFieldExpressions(scanRowType), agg.getRowType());
  call.transformTo(newProject);
}
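onMatch is structured as a chain of early-return guards before any plan rewriting happens. Condensed into a single predicate, the applicability checks look roughly like this (isCountConvertible is a hypothetical name; the real rule interleaves these checks with logging):

// Hedged sketch of the guard chain above, collapsed into one predicate.
private static boolean isCountConvertible(Aggregate agg, DrillTable drillTable) {
  if (agg.getGroupCount() > 0 || agg.containsDistinctCall()) {
    return false; // only plain COUNT aggregates qualify
  }
  if (drillTable == null || !(drillTable.getSelection() instanceof FormatSelection)) {
    return false; // only file-format (Parquet) backed tables qualify
  }
  FormatSelection formatSelection = (FormatSelection) drillTable.getSelection();
  // a wildcard selection hides the total row count from the parent directory metadata
  return !formatSelection.getSelection().hadWildcard();
}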
Use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.
The class RefreshMetadataHandler, method getPlan:
@Override
public PhysicalPlan getPlan(SqlNode sqlNode) throws ForemanSetupException {
  final SqlRefreshMetadata refreshTable = unwrap(sqlNode, SqlRefreshMetadata.class);
  try {
    final SchemaPlus schema = findSchema(config.getConverter().getDefaultSchema(), refreshTable.getSchemaPath());
    if (schema == null) {
      return direct(false, "Storage plugin or workspace does not exist [%s]",
          SchemaUtilites.SCHEMA_PATH_JOINER.join(refreshTable.getSchemaPath()));
    }
    final String tableName = refreshTable.getName();
    final SqlNodeList columnList = getColumnList(refreshTable);
    final Set<SchemaPath> columnSet = getColumnRootSegments(columnList);
    final SqlLiteral allColumns = refreshTable.getAllColumns();
    if (tableName.contains("*") || tableName.contains("?")) {
      return direct(false, "Glob path %s not supported for metadata refresh", tableName);
    }
    final Table table = schema.getTable(tableName);
    if (table == null) {
      return direct(false, "Table %s does not exist.", tableName);
    }
    if (!(table instanceof DrillTable)) {
      return notSupported(tableName);
    }
    final DrillTable drillTable = (DrillTable) table;
    final Object selection = drillTable.getSelection();
    if (selection instanceof FileSelection && ((FileSelection) selection).isEmptyDirectory()) {
      return direct(false, "Table %s is empty and doesn't contain any parquet files.", tableName);
    }
    if (!(selection instanceof FormatSelection)) {
      return notSupported(tableName);
    }
    final FormatSelection formatSelection = (FormatSelection) selection;
    FormatPluginConfig formatConfig = formatSelection.getFormat();
    if (!((formatConfig instanceof ParquetFormatConfig)
        || ((formatConfig instanceof NamedFormatPluginConfig) && ((NamedFormatPluginConfig) formatConfig).getName().equals("parquet")))) {
      return notSupported(tableName);
    }
    // Always create the filesystem object using the process user, since it owns the metadata file
    final DrillFileSystem fs = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(),
        drillTable.getPlugin().getFormatPlugin(formatConfig).getFsConf());
    final Path selectionRoot = formatSelection.getSelection().getSelectionRoot();
    if (!fs.getFileStatus(selectionRoot).isDirectory()) {
      return notSupported(tableName);
    }
    if (!(formatConfig instanceof ParquetFormatConfig)) {
      formatConfig = new ParquetFormatConfig();
    }
    final ParquetReaderConfig readerConfig = ParquetReaderConfig.builder()
        .withFormatConfig((ParquetFormatConfig) formatConfig)
        .withOptions(context.getOptions())
        .build();
    Metadata.createMeta(fs, selectionRoot, readerConfig, allColumns.booleanValue(), columnSet);
    return direct(true, "Successfully updated metadata for table %s.", tableName);
  } catch (Exception e) {
    logger.error("Failed to update metadata for table '{}'", refreshTable.getName(), e);
    return DirectPlan.createDirectPlan(context, false, String.format("Error: %s", e.getMessage()));
  }
}
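The Parquet eligibility test in the middle of getPlan accepts either a plain ParquetFormatConfig or a named format plugin config registered as "parquet". Extracted as a helper (the name isParquetFormat is an assumption, not Drill API), the check reads:

// Hedged sketch: the format check from getPlan, lifted into a helper.
private static boolean isParquetFormat(FormatPluginConfig formatConfig) {
  return formatConfig instanceof ParquetFormatConfig
      || (formatConfig instanceof NamedFormatPluginConfig
          && ((NamedFormatPluginConfig) formatConfig).getName().equals("parquet"));
}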
Use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.
The class IcebergFormatMatcher, method isReadable:
@Override
public DrillTable isReadable(DrillFileSystem fs, FileSelection selection, FileSystemPlugin fsPlugin,
    String storageEngineName, SchemaConfig schemaConfig) throws IOException {
  Path selectionRoot = selection.getSelectionRoot();
  Path metaDir = new Path(selectionRoot, METADATA_DIR_NAME);
  if (fs.isDirectory(selectionRoot) && fs.exists(metaDir) && fs.isDirectory(metaDir)) {
    FormatSelection formatSelection = new FormatSelection(formatPlugin.getConfig(), selection);
    return new PluginDrillTable(fsPlugin, storageEngineName, schemaConfig.getUserName(), formatSelection, formatPlugin.getConvention());
  }
  return null;
}
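Here FormatSelection is constructed rather than consumed: the matcher decides Iceberg readability purely from the directory layout, then packages the format config with the FileSelection. The layout probe alone, as a standalone predicate (looksLikeIcebergTable is a hypothetical name; METADATA_DIR_NAME is the constant referenced above):

// Hedged sketch of the directory-layout probe used by isReadable.
private static boolean looksLikeIcebergTable(DrillFileSystem fs, Path selectionRoot) throws IOException {
  Path metaDir = new Path(selectionRoot, METADATA_DIR_NAME);
  // an Iceberg table root is a directory containing a metadata subdirectory
  return fs.isDirectory(selectionRoot) && fs.exists(metaDir) && fs.isDirectory(metaDir);
}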
Use of org.apache.drill.exec.store.dfs.FormatSelection in project drill by apache.
The class AnalyzeFileInfoProvider, method getSegmentColumns:
@Override
public List<SchemaPath> getSegmentColumns(DrillTable table, ColumnNamesOptions columnNamesOptions) throws IOException {
  FormatSelection selection = (FormatSelection) table.getSelection();
  FileSelection fileSelection = selection.getSelection();
  if (!fileSelection.isExpandedFully()) {
    fileSelection = FileMetadataInfoCollector.getExpandedFileSelection(fileSelection);
  }
  return ColumnExplorer.getPartitionColumnNames(fileSelection, columnNamesOptions).stream()
      .map(SchemaPath::getSimplePath)
      .collect(Collectors.toList());
}
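The expand step guards against a FileSelection that has not yet enumerated all of its files, presumably because the partition column names are derived from the concrete file paths. Isolated under the same assumption of a hypothetical helper name (ensureExpanded):

// Hedged sketch: expand the selection only when it is not already fully expanded.
private static FileSelection ensureExpanded(FileSelection fileSelection) throws IOException {
  return fileSelection.isExpandedFully()
      ? fileSelection
      : FileMetadataInfoCollector.getExpandedFileSelection(fileSelection);
}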