Search in sources:

Example 1 with ReadEntryWithPath

use of org.apache.drill.exec.store.dfs.ReadEntryWithPath in project drill by apache.

the class ParquetGroupScan method modifyFileSelection.

/**
 * Replaces the current read entries and file set with the files named in
 * {@code selection}, then keeps only the row groups whose path belongs to
 * one of those files.
 *
 * @param selection the new file selection; its file names become the only
 *                  entries tracked by this scan
 */
@Override
public void modifyFileSelection(FileSelection selection) {
    // Start from a clean slate: previous entries and the previous file set are discarded.
    entries.clear();
    fileSet = Sets.newHashSet();
    for (String selectedFile : selection.getFiles()) {
        fileSet.add(selectedFile);
        entries.add(new ReadEntryWithPath(selectedFile));
    }

    // Retain only the row groups that live in one of the selected files.
    List<RowGroupInfo> retainedRowGroups = Lists.newArrayList();
    for (RowGroupInfo candidate : rowGroupInfos) {
        if (!fileSet.contains(candidate.getPath())) {
            continue;
        }
        retainedRowGroups.add(candidate);
    }
    this.rowGroupInfos = retainedRowGroups;
}
Also used : ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath)

Example 2 with ReadEntryWithPath

use of org.apache.drill.exec.store.dfs.ReadEntryWithPath in project drill by axbaretto.

the class ParquetGroupScan method modifyFileSelection.

/**
 * Resets this scan to the files of the given selection.
 *
 * <p>The entry list and the file set are rebuilt from {@code selection.getFiles()},
 * and {@code rowGroupInfos} is narrowed to the row groups whose path is contained
 * in the rebuilt file set.
 *
 * @param selection the file selection to apply
 */
@Override
public void modifyFileSelection(FileSelection selection) {
    entries.clear();
    fileSet = Sets.newHashSet();
    for (String path : selection.getFiles()) {
        entries.add(new ReadEntryWithPath(path));
        fileSet.add(path);
    }

    List<RowGroupInfo> keptRowGroups = Lists.newArrayList();
    for (RowGroupInfo info : rowGroupInfos) {
        boolean belongsToSelection = fileSet.contains(info.getPath());
        if (belongsToSelection) {
            keptRowGroups.add(info);
        }
    }
    rowGroupInfos = keptRowGroups;
}
Also used : ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath)

Example 3 with ReadEntryWithPath

use of org.apache.drill.exec.store.dfs.ReadEntryWithPath in project drill by apache.

the class HiveParquetTableMetadataProvider method initInternal.

/**
 * Builds a map from the file status of every read entry to the file system
 * that status was obtained from, and loads {@code parquetTableMetadata} from
 * that map.
 *
 * @throws IOException if resolving a file system or a file status fails
 */
@Override
protected void initInternal() throws IOException {
    // LinkedHashMap keeps the statuses in the order the entries are visited.
    Map<FileStatus, FileSystem> statusToFileSystem = new LinkedHashMap<>();
    for (ReadEntryWithPath readEntry : entries) {
        Path entryPath = readEntry.getPath();

        // A fresh pusher and job configuration are created for every entry;
        // the configuration is derived from the entry's parent directory.
        ProjectionPusher pusher = new ProjectionPusher();
        JobConf jobConf = new JobConf(hiveStoragePlugin.getHiveConf());
        Configuration entryConf = pusher.pushProjectionsAndFilters(jobConf, entryPath.getParent());

        FileSystem entryFileSystem = entryPath.getFileSystem(entryConf);
        FileStatus entryStatus = entryFileSystem.getFileStatus(Path.getPathWithoutSchemeAndAuthority(entryPath));
        statusToFileSystem.put(entryStatus, entryFileSystem);
    }
    parquetTableMetadata = Metadata.getParquetTableMetadata(statusToFileSystem, readerConfig);
}
Also used : Path(org.apache.hadoop.fs.Path) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ProjectionPusher(org.apache.hadoop.hive.ql.io.parquet.ProjectionPusher) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) LinkedHashMap(java.util.LinkedHashMap)

Example 4 with ReadEntryWithPath

use of org.apache.drill.exec.store.dfs.ReadEntryWithPath in project drill by apache.

the class ParquetTableMetadataProviderImpl method initInternal.

/**
 * Populates {@code parquetTableMetadata}, reading it from metadata cache files
 * when they are present and not flagged as corrupted, and otherwise collecting
 * it from the Parquet files themselves.
 *
 * <p>Two cases are handled:
 * <ul>
 *   <li>exactly one entry and no metadata loaded yet: the entry's path is used;</li>
 *   <li>anything else: {@code selectionRoot} is used for the cache lookup and,
 *       failing that, every entry is listed recursively.</li>
 * </ul>
 * {@code usedMetadataCache} is set to {@code true} when the metadata ends up
 * coming from the cache.
 *
 * @throws IOException if a file system operation fails
 */
@Override
protected void initInternal() throws IOException {
    try (FileSystem processUserFs = ImpersonationUtil.createFileSystem(ImpersonationUtil.getProcessUserName(), fs.getConf())) {
        boolean singleEntryWithoutMetadata = entries.size() == 1 && parquetTableMetadata == null;
        if (singleEntryWithoutMetadata) {
            // Depending on the version of metadata this may represent more than 1 metadata file paths.
            List<Path> cacheFilePaths = new ArrayList<>();
            Path entryPath = Path.getPathWithoutSchemeAndAuthority(entries.get(0).getPath());
            if (fs.isDirectory(entryPath)) {
                // Using the metadata file makes sense when querying a directory; otherwise
                // if querying a single file we can look up the metadata directly from the file
                cacheFilePaths = populateMetaPaths(entryPath, fs);
            }
            if (!(metaContext.isMetadataCacheCorrupted() || cacheFilePaths.isEmpty())) {
                parquetTableMetadata = Metadata.readBlockMeta(processUserFs, cacheFilePaths, metaContext, readerConfig);
                if (parquetTableMetadata != null) {
                    usedMetadataCache = true;
                }
            }
            if (!usedMetadataCache) {
                // No usable cache: read the metadata straight from the entry's path.
                parquetTableMetadata = Metadata.getParquetTableMetadata(processUserFs, entryPath, readerConfig);
            }
            return;
        }

        Path rootPath = Path.getPathWithoutSchemeAndAuthority(selectionRoot);
        List<Path> rootCacheFilePaths = populateMetaPaths(rootPath, fs);
        if (!metaContext.isMetadataCacheCorrupted() && fs.isDirectory(selectionRoot) && !rootCacheFilePaths.isEmpty()) {
            if (parquetTableMetadata == null) {
                parquetTableMetadata = Metadata.readBlockMeta(processUserFs, rootCacheFilePaths, metaContext, readerConfig);
            }
            if (parquetTableMetadata != null) {
                usedMetadataCache = true;
                if (fileSet != null) {
                    // A file set is present, so row groups outside of it are stripped from the cached metadata.
                    parquetTableMetadata = removeUnneededRowGroups(parquetTableMetadata);
                }
            }
        }
        if (!usedMetadataCache) {
            // No usable cache: list every entry recursively and collect metadata from the listed files.
            final List<FileStatus> listedStatuses = new ArrayList<>();
            for (ReadEntryWithPath readEntry : entries) {
                Path entryPath = Path.getPathWithoutSchemeAndAuthority(readEntry.getPath());
                listedStatuses.addAll(DrillFileSystemUtil.listFiles(fs, entryPath, true));
            }
            // Every status is paired with the process-user file system; insertion order is kept.
            Map<FileStatus, FileSystem> statusToFileSystem = listedStatuses.stream()
                .collect(Collectors.toMap(
                    Function.identity(),
                    status -> processUserFs,
                    (previous, replacement) -> replacement,
                    LinkedHashMap::new));
            parquetTableMetadata = Metadata.getParquetTableMetadata(statusToFileSystem, readerConfig);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) LoggerFactory(org.slf4j.LoggerFactory) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) Path(org.apache.hadoop.fs.Path) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Logger(org.slf4j.Logger) BaseParquetMetadataProvider(org.apache.drill.exec.store.parquet.BaseParquetMetadataProvider) IOException(java.io.IOException) MetadataBase(org.apache.drill.exec.store.parquet.metadata.MetadataBase) Collectors(java.util.stream.Collectors) MetadataContext(org.apache.drill.exec.store.dfs.MetadataContext) DrillFileSystemUtil(org.apache.drill.exec.util.DrillFileSystemUtil) List(java.util.List) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) Collections(java.util.Collections) Metadata(org.apache.drill.exec.store.parquet.metadata.Metadata) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) ArrayList(java.util.ArrayList)

Example 5 with ReadEntryWithPath

use of org.apache.drill.exec.store.dfs.ReadEntryWithPath in project drill by apache.

the class AbstractParquetGroupScan method modifyFileSelection.

// limit push down methods end
// helper method used for partition pruning and filter push down
/**
 * Applies a new file selection to this group scan.
 *
 * <p>After delegating to the superclass, the file set and read entries are
 * rebuilt from the selected files, and the row-group, file, partition and
 * segment metadata are narrowed to what matches the new file set. The table
 * metadata is refreshed via {@code TableMetadataUtils.updateRowCount}, and
 * {@code rowGroupInfos} is reset to {@code null}.
 *
 * @param selection the file selection to apply
 */
@Override
public void modifyFileSelection(FileSelection selection) {
    super.modifyFileSelection(selection);

    List<Path> files = selection.getFiles();
    fileSet = new HashSet<>(files);
    entries = new ArrayList<>(files.size());
    for (Path selectedFile : files) {
        entries.add(new ReadEntryWithPath(selectedFile));
    }

    // Row groups: keep those keyed by a selected file, preserving encounter order.
    Multimap<Path, RowGroupMetadata> retainedRowGroups = LinkedListMultimap.create();
    if (!getRowGroupsMetadata().isEmpty()) {
        getRowGroupsMetadata().entries().stream()
            .filter(rowGroupEntry -> fileSet.contains(rowGroupEntry.getKey()))
            .forEachOrdered(rowGroupEntry -> retainedRowGroups.put(rowGroupEntry.getKey(), rowGroupEntry.getValue()));
    }
    this.rowGroups = retainedRowGroups;

    // Refresh the table metadata now that the row groups have been replaced.
    tableMetadata = TableMetadataUtils.updateRowCount(getTableMetadata(), getRowGroupsMetadata().values());

    // Files: keep the metadata of selected files, or fall back to an empty map.
    if (getFilesMetadata().isEmpty()) {
        this.files = Collections.emptyMap();
    } else {
        this.files = getFilesMetadata().entrySet().stream()
            .filter(fileEntry -> fileSet.contains(fileEntry.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }

    // Partitions: keep a partition as soon as one of its locations is a selected file.
    List<PartitionMetadata> retainedPartitions = new ArrayList<>();
    if (!getPartitionsMetadata().isEmpty()) {
        for (PartitionMetadata partition : getPartitionsMetadata()) {
            boolean touchesSelection = false;
            for (Path location : partition.getLocations()) {
                if (fileSet.contains(location)) {
                    touchesSelection = true;
                    break;
                }
            }
            if (touchesSelection) {
                retainedPartitions.add(partition);
            }
        }
    }
    partitions = retainedPartitions;

    // Segments: only re-filtered when there are any; otherwise the field is left untouched.
    if (!getSegmentsMetadata().isEmpty()) {
        this.segments = getSegmentsMetadata().entrySet().stream()
            .filter(segmentEntry -> fileSet.contains(segmentEntry.getKey()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }

    rowGroupInfos = null;
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataType(org.apache.drill.metastore.metadata.MetadataType) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) LoggerFactory(org.slf4j.LoggerFactory) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) FunctionImplementationRegistry(org.apache.drill.exec.expr.fn.FunctionImplementationRegistry) ExpressionStringBuilder(org.apache.drill.common.expression.ExpressionStringBuilder) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) ListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ListMultimap) UdfUtilities(org.apache.drill.exec.ops.UdfUtilities) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) Collection(java.util.Collection) SegmentMetadata(org.apache.drill.metastore.metadata.SegmentMetadata) SchemaPath(org.apache.drill.common.expression.SchemaPath) Set(java.util.Set) LogicalExpression(org.apache.drill.common.expression.LogicalExpression) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) AbstractGroupScanWithMetadata(org.apache.drill.exec.physical.base.AbstractGroupScanWithMetadata) List(java.util.List) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) MetadataProviderManager(org.apache.drill.exec.metastore.MetadataProviderManager) TableMetadataUtils(org.apache.drill.metastore.util.TableMetadataUtils) FilterPredicate(org.apache.drill.exec.expr.FilterPredicate) OptionManager(org.apache.drill.exec.server.options.OptionManager) HashMap(java.util.HashMap) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) 
LinkedHashMap(java.util.LinkedHashMap) CoordinationProtos(org.apache.drill.exec.proto.CoordinationProtos) AffinityCreator(org.apache.drill.exec.store.schedule.AffinityCreator) EndpointByteMapImpl(org.apache.drill.exec.store.schedule.EndpointByteMapImpl) CollectionUtils(org.apache.commons.collections.CollectionUtils) JsonIgnore(com.fasterxml.jackson.annotation.JsonIgnore) EndpointByteMap(org.apache.drill.exec.store.schedule.EndpointByteMap) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) FileMetadata(org.apache.drill.metastore.metadata.FileMetadata) BaseMetadata(org.apache.drill.metastore.metadata.BaseMetadata) Logger(org.slf4j.Logger) MapUtils(org.apache.commons.collections.MapUtils) ExactStatisticsConstants(org.apache.drill.metastore.statistics.ExactStatisticsConstants) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata) IOException(java.io.IOException) ParquetMetadataProvider(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProvider) LinkedListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.LinkedListMultimap) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) Statistic(org.apache.drill.metastore.statistics.Statistic) LocationProvider(org.apache.drill.metastore.metadata.LocationProvider) EndpointAffinity(org.apache.drill.exec.physical.EndpointAffinity) GroupScan(org.apache.drill.exec.physical.base.GroupScan) JsonInclude(com.fasterxml.jackson.annotation.JsonInclude) Collections(java.util.Collections) ParquetMetadataProviderBuilder(org.apache.drill.exec.metastore.store.parquet.ParquetMetadataProviderBuilder) AssignmentCreator(org.apache.drill.exec.store.schedule.AssignmentCreator) ReadEntryWithPath(org.apache.drill.exec.store.dfs.ReadEntryWithPath) ArrayList(java.util.ArrayList) PartitionMetadata(org.apache.drill.metastore.metadata.PartitionMetadata) RowGroupMetadata(org.apache.drill.metastore.metadata.RowGroupMetadata)

Aggregations

ReadEntryWithPath (org.apache.drill.exec.store.dfs.ReadEntryWithPath): 7; Path (org.apache.hadoop.fs.Path): 5; FileStatus (org.apache.hadoop.fs.FileStatus): 4; LinkedHashMap (java.util.LinkedHashMap): 3; SchemaPath (org.apache.drill.common.expression.SchemaPath): 3; IOException (java.io.IOException): 2; ArrayList (java.util.ArrayList): 2; Collections (java.util.Collections): 2; HashSet (java.util.HashSet): 2; List (java.util.List): 2; Map (java.util.Map): 2; Function (java.util.function.Function): 2; Collectors (java.util.stream.Collectors): 2; MetadataProviderManager (org.apache.drill.exec.metastore.MetadataProviderManager): 2; DrillbitEndpoint (org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint): 2; FileSelection (org.apache.drill.exec.store.dfs.FileSelection): 2; ParquetFileMetadata (org.apache.drill.exec.store.parquet.Metadata.ParquetFileMetadata): 2; RowGroupMetadata (org.apache.drill.exec.store.parquet.Metadata.RowGroupMetadata): 2; EndpointByteMap (org.apache.drill.exec.store.schedule.EndpointByteMap): 2; EndpointByteMapImpl (org.apache.drill.exec.store.schedule.EndpointByteMapImpl): 2