
Example 1 with Metadata

Use of org.apache.drill.exec.store.parquet.metadata.Metadata in project drill by apache.

From class ParquetTableMetadataProviderImpl, method initInternal:

@Override
protected void initInternal() throws IOException {
    try (FileSystem processUserFileSystem = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), fs.getConf())) {
        // Depending on the metadata version, this may represent more than one metadata file path.
        List<Path> metaPaths = new ArrayList<>();
        if (entries.size() == 1 && parquetTableMetadata == null) {
            Path p = Path.getPathWithoutSchemeAndAuthority(entries.get(0).getPath());
            if (fs.isDirectory(p)) {
                // Using the metadata file makes sense when querying a directory; otherwise
                // if querying a single file we can look up the metadata directly from the file
                metaPaths = populateMetaPaths(p, fs);
            }
            if (!metaContext.isMetadataCacheCorrupted() && !metaPaths.isEmpty()) {
                parquetTableMetadata = Metadata.readBlockMeta(processUserFileSystem, metaPaths, metaContext, readerConfig);
                if (parquetTableMetadata != null) {
                    usedMetadataCache = true;
                }
            }
            if (!usedMetadataCache) {
                parquetTableMetadata = Metadata.getParquetTableMetadata(processUserFileSystem, p, readerConfig);
            }
        } else {
            Path p = Path.getPathWithoutSchemeAndAuthority(selectionRoot);
            metaPaths = populateMetaPaths(p, fs);
            if (!metaContext.isMetadataCacheCorrupted() && fs.isDirectory(selectionRoot) && !metaPaths.isEmpty()) {
                if (parquetTableMetadata == null) {
                    parquetTableMetadata = Metadata.readBlockMeta(processUserFileSystem, metaPaths, metaContext, readerConfig);
                }
                if (parquetTableMetadata != null) {
                    usedMetadataCache = true;
                    if (fileSet != null) {
                        parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
                    }
                }
            }
            if (!usedMetadataCache) {
                final List<FileStatus> fileStatuses = new ArrayList<>();
                for (ReadEntryWithPath entry : entries) {
                    fileStatuses.addAll(DrillFileSystemUtil.listFiles(fs, Path.getPathWithoutSchemeAndAuthority(entry.getPath()), true));
                }
                Map<FileStatus, FileSystem> statusMap = fileStatuses.stream().collect(Collectors.toMap(Function.identity(), s -> processUserFileSystem, (oldFs, newFs) -> newFs, LinkedHashMap::new));
                parquetTableMetadata = Metadata.getParquetTableMetadata(statusMap, readerConfig);
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) LoggerFactory(org.slf4j.LoggerFactory) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Logger(org.slf4j.Logger) BaseParquetMetadataProvider(org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider) IOException(java.io.IOException) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) Collectors(java.util.stream.Collectors) MetadataContext(org.apache.drill.exec.store.dfs.MetadataContext) DrillFileSystemUtil(org.apache.drill.exec.util.DrillFileSystemUtil) List(java.util.List) Collections(java.util.Collections) Metadata(org.apache.drill.exec.store.parquet.metadata.Metadata)
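
The single-entry branch above reduces to a simple policy: when the query targets a directory, prefer the metadata cache files, and fall back to reading the Parquet footers directly when no usable cache is found. The sketch below isolates that decision, reusing only the Metadata calls shown in the example. The helper name resolveTableMetadata is invented for illustration, and the parameter types ParquetReaderConfig and MetadataContext as well as the return type MetadataBase.ParquetTableMetadataBase are assumptions inferred from the provider's fields and imports rather than stated in the example.

import java.io.IOException;
import java.util.List;

import org.apache.drill.exec.store.dfs.MetadataContext;
import org.apache.drill.exec.store.parquet.ParquetReaderConfig;
import org.apache.drill.exec.store.parquet.metadata.Metadata;
import org.apache.drill.exec.store.parquet.metadata.MetadataBase;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MetadataResolutionSketch {

    /**
     * Hypothetical helper mirroring the single-entry branch of initInternal:
     * use the cache files when they exist and the cache is not marked corrupted,
     * otherwise read the Parquet footers under tablePath directly.
     */
    static MetadataBase.ParquetTableMetadataBase resolveTableMetadata(
            FileSystem processUserFs,
            Path tablePath,
            List<Path> metaPaths,          // e.g. the result of populateMetaPaths(tablePath, fs)
            MetadataContext metaContext,
            ParquetReaderConfig readerConfig) throws IOException {

        if (!metaContext.isMetadataCacheCorrupted() && !metaPaths.isEmpty()) {
            MetadataBase.ParquetTableMetadataBase cached =
                Metadata.readBlockMeta(processUserFs, metaPaths, metaContext, readerConfig);
            if (cached != null) {
                return cached;             // cache hit: reuse the persisted block metadata
            }
        }
        // No usable cache: scan the Parquet footers under tablePath.
        return Metadata.getParquetTableMetadata(processUserFs, tablePath, readerConfig);
    }
}

In the example above, a caller would obtain metaPaths from populateMetaPaths (see Example 2) and processUserFs from ImpersonationUtil.createFileSystem.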

Example 2 with Metadata

Use of org.apache.drill.exec.store.parquet.metadata.Metadata in project drill by apache.

From class ParquetTableMetadataProviderImpl, method populateMetaPaths:

/**
 * Returns the list of metadata cache files found under the given directory.
 *
 * @param p  directory path that may contain the metadata cache files
 * @param fs filesystem used to look for the cache files
 * @return list of cache files found in the given directory path, or an empty list if none exist
 */
public List<Path> populateMetaPaths(Path p, DrillFileSystem fs) throws IOException {
    if (fs.isDirectory(p)) {
        List<Path> metaFilepaths = Arrays.stream(Metadata.CURRENT_METADATA_FILENAMES).map(filename -> new Path(p, filename)).collect(Collectors.toList());
        for (String filename : Metadata.OLD_METADATA_FILENAMES) {
            // Read the older version of metadata file if the current version of metadata cache files does not exist.
            if (fileExists(fs, metaFilepaths)) {
                return metaFilepaths;
            }
            metaFilepaths.clear();
            metaFilepaths.add(new Path(p, filename));
        }
        if (fileExists(fs, metaFilepaths)) {
            return metaFilepaths;
        }
    }
    return Collections.emptyList();
}
Also used : Path(org.apache.hadoop.fs.Path) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) LoggerFactory(org.slf4j.LoggerFactory) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Logger(org.slf4j.Logger) BaseParquetMetadataProvider(org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider) IOException(java.io.IOException) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) Collectors(java.util.stream.Collectors) MetadataContext(org.apache.drill.exec.store.dfs.MetadataContext) DrillFileSystemUtil(org.apache.drill.exec.util.DrillFileSystemUtil) List(java.util.List) Collections(java.util.Collections) Metadata(org.apache.drill.exec.store.parquet.metadata.Metadata)
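
The control flow above amounts to a fallback search: accept the current cache filenames only when they are all present, then try each legacy filename in turn, and return an empty list if nothing matches. Here is a standalone sketch of that same try-current-then-legacy pattern using java.nio.file instead of the Drill and Hadoop types. The filename constants are illustrative stand-ins for Metadata.CURRENT_METADATA_FILENAMES and Metadata.OLD_METADATA_FILENAMES, and the all-files-must-exist rule for the current set is an assumption about the fileExists helper, which is not shown in the example.

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

public class MetaPathFallbackSketch {

    // Illustrative stand-ins for Metadata.CURRENT_METADATA_FILENAMES and
    // Metadata.OLD_METADATA_FILENAMES; the real constants live in the Metadata class.
    static final String[] CURRENT_FILENAMES = {"meta_summary.v4", "meta_files.v4"};
    static final String[] OLD_FILENAMES = {"meta.v3", "meta.v2"};

    /**
     * Returns the first candidate set of cache files that exists under dir:
     * the current filenames as a group, then each legacy filename on its own,
     * or an empty list when nothing is found.
     */
    static List<Path> resolveCacheFiles(Path dir) {
        if (!Files.isDirectory(dir)) {
            return Collections.emptyList();
        }
        List<Path> current = Arrays.stream(CURRENT_FILENAMES)
            .map(dir::resolve)
            .collect(Collectors.toList());
        if (current.stream().allMatch(Files::exists)) {
            return current;                                   // current cache format found
        }
        for (String legacy : OLD_FILENAMES) {
            Path candidate = dir.resolve(legacy);
            if (Files.exists(candidate)) {
                return Collections.singletonList(candidate);  // fall back to an older format
            }
        }
        return Collections.emptyList();
    }
}

For example, resolveCacheFiles(Paths.get("/data/table")) returns both current-format paths when both files exist, otherwise the first legacy file it finds, otherwise an empty list, which mirrors how populateMetaPaths is consumed in Example 1.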

Aggregations

IOException (java.io.IOException): 2 usages
ArrayList (java.util.ArrayList): 2 usages
Arrays (java.util.Arrays): 2 usages
Collections (java.util.Collections): 2 usages
HashSet (java.util.HashSet): 2 usages
LinkedHashMap (java.util.LinkedHashMap): 2 usages
List (java.util.List): 2 usages
Map (java.util.Map): 2 usages
Function (java.util.function.Function): 2 usages
Collectors (java.util.stream.Collectors): 2 usages
MetadataProviderManager (org.apache.drill.exec.metastore.MetadataProviderManager): 2 usages
DrillFileSystem (org.apache.drill.exec.store.dfs.DrillFileSystem): 2 usages
FileSelection (org.apache.drill.exec.store.dfs.FileSelection): 2 usages
MetadataContext (org.apache.drill.exec.store.dfs.MetadataContext): 2 usages
ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath): 2 usages
BaseParquetMetadataProvider (org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider): 2 usages
ParquetReaderUtility (org.apache.drill.exec.store.parquet.ParquetReaderUtility): 2 usages
Metadata (org.apache.drill.exec.store.parquet.metadata.Metadata): 2 usages
MetadataBase (org.apache.drill.exec.store.parquet.metadata.MetadataBase): 2 usages
DrillFileSystemUtil (org.apache.drill.exec.util.DrillFileSystemUtil): 2 usages