use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class Metadata method getSummary.
/**
* Reads the summary from the metadata cache file; if the cache file is stale, recreates the metadata.
* @param fs file system
* @param metadataParentDir parent directory that holds metadata files
* @param autoRefreshTriggered true if the auto-refresh is already triggered
* @param readerConfig Parquet reader config
* @return metadata summary, or null if it cannot be obtained
*/
public static Metadata_V4.MetadataSummary getSummary(FileSystem fs, Path metadataParentDir,
    boolean autoRefreshTriggered, ParquetReaderConfig readerConfig) {
  Path summaryFile = getSummaryFileName(metadataParentDir);
  Path metadataDirFile = getDirFileName(metadataParentDir);
  MetadataContext metaContext = new MetadataContext();
  try {
    // If autoRefresh is not triggered and none of the metadata files exist
    if (!autoRefreshTriggered && !metadataExists(fs, metadataParentDir)) {
      logger.debug("Metadata doesn't exist in {}", metadataParentDir);
      return null;
    } else if (autoRefreshTriggered && !fs.exists(summaryFile)) {
      logger.debug("Metadata Summary file {} does not exist", summaryFile);
      return null;
    } else {
      // If auto-refresh is not triggered, check whether the cache file is stale and trigger auto-refresh
      if (!autoRefreshTriggered) {
        Metadata metadata = new Metadata(readerConfig);
        if (!fs.exists(metadataDirFile)) {
          return null;
        }
        ParquetTableMetadataDirs metadataDirs = readMetadataDirs(fs, metadataDirFile, metaContext, readerConfig);
        if (metadata.tableModified(metadataDirs.getDirectories(), summaryFile, metadataParentDir, metaContext, fs)) {
          ParquetTableMetadata_v4 parquetTableMetadata = (metadata.createMetaFilesRecursivelyAsProcessUser(
              Path.getPathWithoutSchemeAndAuthority(summaryFile.getParent()), fs, true, null, true)).getLeft();
          return parquetTableMetadata.getSummary();
        }
      }
      // Read the existing metadataSummary cache file to get the metadataSummary
      ObjectMapper mapper = new ObjectMapper();
      final SimpleModule serialModule = new SimpleModule();
      serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
      serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
      AfterburnerModule module = new AfterburnerModule();
      module.setUseOptimizedBeanDeserializer(true);
      mapper.registerModule(serialModule);
      mapper.registerModule(module);
      mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
      InputStream is = fs.open(summaryFile);
      return mapper.readValue(is, Metadata_V4.MetadataSummary.class);
    }
  } catch (IOException e) {
    logger.debug("Failed to read '{}' summary metadata file", summaryFile, e);
    return null;
  }
}
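A minimal sketch of how a caller might use this helper, assuming a Hadoop FileSystem fs and a ParquetReaderConfig readerConfig are already in scope; the table path below is made up for illustration and is not taken from the Drill source.

// Hypothetical caller: look up the cached V4 summary for a table root.
Path tableRoot = new Path("/data/parquet/orders");   // hypothetical table location
Metadata_V4.MetadataSummary summary =
    Metadata.getSummary(fs, tableRoot, false /* autoRefreshTriggered */, readerConfig);
if (summary == null) {
  // No usable metadata cache (missing, stale and not refreshable, or unreadable):
  // the caller has to fall back to reading the Parquet footers directly.
}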
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class Metadata method getParquetTableMetadata.
/**
* Get the parquet metadata for a list of parquet files
*
* @param fileStatusMap file statuses and corresponding file systems
* @return parquet table metadata object
* @throws IOException if parquet file metadata can't be obtained
*/
private ParquetTableMetadata_v4 getParquetTableMetadata(Map<FileStatus, FileSystem> fileStatusMap) throws IOException {
  Metadata_V4.MetadataSummary tableMetadataSummary = new Metadata_V4.MetadataSummary(SUPPORTED_VERSIONS.last().toString(),
      DrillVersionInfo.getVersion(), new ArrayList<>(), true);
  ParquetTableMetadata_v4 tableMetadata = new ParquetTableMetadata_v4(tableMetadataSummary);
  List<ParquetFileAndRowCountMetadata> parquetFileAndRowCountMetadata = getParquetFileMetadata_v4(tableMetadata, fileStatusMap, true, null);
  List<ParquetFileMetadata_v4> parquetFileMetadata = new ArrayList<>();
  for (ParquetFileAndRowCountMetadata fileAndGlobalMetadata : parquetFileAndRowCountMetadata) {
    parquetFileMetadata.add(fileAndGlobalMetadata.getFileMetadata());
  }
  tableMetadata.assignFiles(parquetFileMetadata);
  return tableMetadata;
}
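The fileStatusMap parameter pairs every file status with the file system it was obtained from. A minimal sketch of assembling such a map for two files on a single file system; the paths are hypothetical and fs is assumed to be in scope.

// Hypothetical setup: two Parquet files served by the same Hadoop FileSystem `fs`.
Map<FileStatus, FileSystem> fileStatusMap = new LinkedHashMap<>();
fileStatusMap.put(fs.getFileStatus(new Path("/data/parquet/orders/part-0.parquet")), fs);
fileStatusMap.put(fs.getFileStatus(new Path("/data/parquet/orders/part-1.parquet")), fs);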
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class Metadata method getParquetTableMetadata.
/**
* Get the parquet metadata for the parquet files in a directory.
*
* @param path the path of the directory
* @return metadata object for an entire parquet directory structure
* @throws IOException in case of problems during accessing files
*/
private ParquetTableMetadata_v4 getParquetTableMetadata(Path path, FileSystem fs) throws IOException {
  FileStatus fileStatus = fs.getFileStatus(path);
  Stopwatch watch = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  List<FileStatus> fileStatuses = new ArrayList<>();
  if (fileStatus.isFile()) {
    fileStatuses.add(fileStatus);
  } else {
    // recursively collect the statuses of all files under the directory
    fileStatuses.addAll(DrillFileSystemUtil.listFiles(fs, path, true));
  }
  if (watch != null) {
    logger.debug("Took {} ms to get file statuses", watch.elapsed(TimeUnit.MILLISECONDS));
    watch.reset();
    watch.start();
  }
  Map<FileStatus, FileSystem> fileStatusMap = fileStatuses.stream()
      .collect(java.util.stream.Collectors.toMap(Function.identity(), s -> fs, (oldFs, newFs) -> newFs, LinkedHashMap::new));
  ParquetTableMetadata_v4 metadata_v4 = getParquetTableMetadata(fileStatusMap);
  if (watch != null) {
    logger.debug("Took {} ms to read file metadata", watch.elapsed(TimeUnit.MILLISECONDS));
    watch.stop();
  }
  return metadata_v4;
}
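The stream collector above maps every status to the same file system while preserving insertion order. The following standalone example demonstrates the same four-argument Collectors.toMap pattern on plain strings so it runs without a Drill or Hadoop classpath; it is an illustration only, not Drill code.

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public class ToMapDemo {
  public static void main(String[] args) {
    List<String> files = List.of("part-0.parquet", "part-1.parquet", "part-2.parquet");
    String fs = "hdfs://namenode";   // stands in for the single FileSystem instance
    // identity() keys, a constant value, a merge function for duplicate keys,
    // and LinkedHashMap::new to preserve the original file order
    Map<String, String> map = files.stream()
        .collect(Collectors.toMap(Function.identity(), f -> fs, (oldV, newV) -> newV, LinkedHashMap::new));
    map.forEach((file, system) -> System.out.println(file + " -> " + system));
  }
}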
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class MetadataPathUtils method convertToFilesWithAbsolutePaths.
/**
* Convert a list of files with relative paths to files with absolute ones
*
* @param files list of files with relative paths
* @param baseDir base parent directory
* @return list of files with absolute paths
*/
public static List<? extends ParquetFileMetadata> convertToFilesWithAbsolutePaths(List<? extends ParquetFileMetadata> files, String baseDir) {
  if (!files.isEmpty()) {
    List<ParquetFileMetadata> filesWithAbsolutePaths = new ArrayList<>();
    for (ParquetFileMetadata file : files) {
      Path relativePath = file.getPath();
      ParquetFileMetadata fileWithAbsolutePath = null;
      // create a new file if the old one contains a relative path, otherwise use the old file
      if (file instanceof ParquetFileMetadata_v4) {
        fileWithAbsolutePath = relativePath.isAbsolute() ? file
            : new ParquetFileMetadata_v4(new Path(baseDir, relativePath), file.getLength(), (List<Metadata_V4.RowGroupMetadata_v4>) file.getRowGroups());
      } else if (file instanceof ParquetFileMetadata_v3) {
        fileWithAbsolutePath = relativePath.isAbsolute() ? file
            : new ParquetFileMetadata_v3(new Path(baseDir, relativePath), file.getLength(), (List<Metadata_V3.RowGroupMetadata_v3>) file.getRowGroups());
      }
      filesWithAbsolutePaths.add(fileWithAbsolutePath);
    }
    return filesWithAbsolutePaths;
  }
  return files;
}
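The conversion hinges on the two-argument org.apache.hadoop.fs.Path constructor, which resolves a child path against a parent. A standalone illustration with made-up paths:

import org.apache.hadoop.fs.Path;

public class PathResolveDemo {
  public static void main(String[] args) {
    String baseDir = "/user/drill/orders";            // hypothetical metadata base directory
    Path relative = new Path("2024/part-0.parquet");  // path as it might be stored in the cache file
    // keep absolute paths as-is, resolve relative ones against the base directory
    Path absolute = relative.isAbsolute() ? relative : new Path(baseDir, relative);
    System.out.println(absolute);                     // /user/drill/orders/2024/part-0.parquet
  }
}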
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class ConvertCountToDirectScanRule method onMatch.
@Override
public void onMatch(RelOptRuleCall call) {
  final Aggregate agg = call.rel(0);
  final TableScan scan = call.rel(call.rels.length - 1);
  final Project project = call.rels.length == 3 ? (Project) call.rel(1) : null;
  // The rule qualifies only when:
  //    1) there is no GROUP BY key,
  //    2) the aggregate contains no DISTINCT call,
  //    3) additional checks are done further below.
  if (agg.getGroupCount() > 0 || agg.containsDistinctCall()) {
    return;
  }
  DrillTable drillTable = DrillRelOptUtil.getDrillTable(scan);
  if (drillTable == null) {
    logger.debug("Rule does not apply since an eligible drill table instance was not found.");
    return;
  }
  Object selection = drillTable.getSelection();
  if (!(selection instanceof FormatSelection)) {
    logger.debug("Rule does not apply since only Parquet file format is eligible.");
    return;
  }
  PlannerSettings settings = call.getPlanner().getContext().unwrap(PlannerSettings.class);
  // The rule is applicable only if the statistics for row count and null count are available from the metadata.
  FormatSelection formatSelection = (FormatSelection) selection;
  // The rule cannot be applied if the selection had a wildcard, since the total row count cannot be read from the parent directory.
  if (formatSelection.getSelection().hadWildcard()) {
    logger.debug("Rule does not apply when there is a wild card since the COUNT could not be determined from metadata.");
    return;
  }
  Pair<Boolean, Metadata_V4.MetadataSummary> status = checkMetadataForScanStats(settings, drillTable, formatSelection);
  if (!status.getLeft()) {
    logger.debug("Rule does not apply since MetadataSummary metadata was not found.");
    return;
  }
  Metadata_V4.MetadataSummary metadataSummary = status.getRight();
  Map<String, Long> result = collectCounts(settings, metadataSummary, agg, scan, project);
  logger.trace("Calculated the following aggregate counts: {}", result);
  // if counts could not be determined, the rule won't be applied
  if (result.isEmpty()) {
    logger.debug("Rule does not apply since one or more COUNTs could not be determined from metadata.");
    return;
  }
  Path summaryFileName = Metadata.getSummaryFileName(formatSelection.getSelection().getSelectionRoot());
  final RelDataType scanRowType = CountToDirectScanUtils.constructDataType(agg, result.keySet());
  final DynamicPojoRecordReader<Long> reader = new DynamicPojoRecordReader<>(
      CountToDirectScanUtils.buildSchema(scanRowType.getFieldNames()),
      Collections.singletonList(new ArrayList<>(result.values())));
  final ScanStats scanStats = new ScanStats(ScanStats.GroupScanProperty.EXACT_ROW_COUNT, 1, 1, scanRowType.getFieldCount());
  final MetadataDirectGroupScan directScan = new MetadataDirectGroupScan(reader, summaryFileName, 1, scanStats, true, false);
  final DrillDirectScanRel newScan = new DrillDirectScanRel(scan.getCluster(), scan.getTraitSet().plus(DrillRel.DRILL_LOGICAL), directScan, scanRowType);
  final DrillProjectRel newProject = new DrillProjectRel(agg.getCluster(), agg.getTraitSet().plus(DrillRel.DRILL_LOGICAL), newScan,
      CountToDirectScanUtils.prepareFieldExpressions(scanRowType), agg.getRowType());
  call.transformTo(newProject);
}
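For intuition: the counts can be resolved purely from the summary statistics, since COUNT(*) equals the total row count and COUNT(col) equals the total row count minus the column's null count. Below is a simplified, hypothetical sketch of that lookup; it is not Drill's collectCounts implementation, only an illustration of the arithmetic it relies on.

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical sketch: resolve COUNT aggregates from summary statistics.
public class CountFromMetadataSketch {
  static Map<String, Long> collectCounts(long totalRowCount, Map<String, Long> nullCountByColumn,
                                         List<String> countedColumns /* "*" stands for COUNT(*) */) {
    Map<String, Long> result = new LinkedHashMap<>();
    for (String column : countedColumns) {
      if ("*".equals(column)) {
        result.put("count_star", totalRowCount);                 // COUNT(*) = total row count
      } else if (nullCountByColumn.containsKey(column)) {
        // COUNT(col) counts non-null values only
        result.put("count_" + column, totalRowCount - nullCountByColumn.get(column));
      } else {
        return Collections.emptyMap();                           // missing statistic: the rule must not fire
      }
    }
    return result;
  }

  public static void main(String[] args) {
    Map<String, Long> nulls = Map.of("o_comment", 25L);
    System.out.println(collectCounts(1000L, nulls, List.of("*", "o_comment")));
    // prints {count_star=1000, count_o_comment=975}
  }
}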