
Example 6 with Multimap

Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.

The class HiveMetadataProvider, method splitInputWithUGI.

/**
 * Gets list of input splits based on table location.
 * These input splits are grouped logically by file name
 * if skip header / footer logic should be applied later on.
 *
 * @param properties table or partition properties
 * @param sd storage descriptor
 * @param partition hive partition
 * @return list of logically grouped input splits
 */
private List<LogicalInputSplit> splitInputWithUGI(final Properties properties, final StorageDescriptor sd, final Partition partition) {
    watch.start();
    try {
        return ugi.doAs((PrivilegedExceptionAction<List<LogicalInputSplit>>) () -> {
            final List<LogicalInputSplit> splits = new ArrayList<>();
            final JobConf job = new JobConf(hiveConf);
            HiveUtilities.addConfToJob(job, properties);
            HiveUtilities.verifyAndAddTransactionalProperties(job, sd);
            job.setInputFormat(HiveUtilities.getInputFormatClass(job, sd, hiveReadEntry.getTable()));
            final Path path = new Path(sd.getLocation());
            final FileSystem fs = path.getFileSystem(job);
            if (fs.exists(path)) {
                FileInputFormat.addInputPath(job, path);
                final InputFormat<?, ?> format = job.getInputFormat();
                InputSplit[] inputSplits = format.getSplits(job, 1);
                // we need to make sure that splits of the same file are grouped together
                if (TextInputFormat.class.getCanonicalName().equals(sd.getInputFormat()) && HiveUtilities.hasHeaderOrFooter(hiveReadEntry.getTable())) {
                    Multimap<Path, FileSplit> inputSplitMultimap = transformFileSplits(inputSplits);
                    for (Collection<FileSplit> logicalInputSplit : inputSplitMultimap.asMap().values()) {
                        splits.add(new LogicalInputSplit(logicalInputSplit, partition));
                    }
                } else {
                    for (final InputSplit split : inputSplits) {
                        splits.add(new LogicalInputSplit(split, partition));
                    }
                }
            }
            return splits;
        });
    } catch (final InterruptedException | IOException e) {
        final String errMsg = String.format("Failed to create input splits: %s", e.getMessage());
        logger.error(errMsg, e);
        throw new DrillRuntimeException(errMsg, e);
    } finally {
        logger.trace("Took {} µs to get splits from {}", watch.elapsed(TimeUnit.NANOSECONDS) / 1000, sd.getLocation());
        watch.stop();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) TreeMultimap(org.apache.drill.shaded.guava.com.google.common.collect.TreeMultimap) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) Collection(java.util.Collection) ArrayList(java.util.ArrayList) List(java.util.List) DrillRuntimeException(org.apache.drill.common.exceptions.DrillRuntimeException) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
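
The transformFileSplits helper called above is not shown in this example. As a minimal sketch of the grouping it has to perform, namely collecting all splits of one file under that file's path, ordered by start offset so the split containing the header comes first, something like the following would do. The method shape, the static modifier, and the comparators are illustrative assumptions, not necessarily Drill's actual implementation.

import java.util.Comparator;

import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;
import org.apache.drill.shaded.guava.com.google.common.collect.TreeMultimap;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

// Illustrative sketch (not the actual Drill helper): group splits by file path.
private static Multimap<Path, FileSplit> transformFileSplits(InputSplit[] inputSplits) {
    // TreeMultimap keeps the paths sorted and orders each file's splits by
    // start offset, so the split holding the header is always seen first.
    Multimap<Path, FileSplit> splitsByPath = TreeMultimap.create(
        Comparator.comparing(Path::toString),
        Comparator.comparingLong(FileSplit::getStart));
    for (InputSplit split : inputSplits) {
        FileSplit fileSplit = (FileSplit) split;
        splitsByPath.put(fileSplit.getPath(), fileSplit);
    }
    return splitsByPath;
}

With such a grouping in place, the asMap().values() view used in splitInputWithUGI yields one Collection<FileSplit> per file, which is exactly what each LogicalInputSplit wraps.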

Example 7 with Multimap

Use of org.apache.drill.shaded.guava.com.google.common.collect.Multimap in project drill by apache.

The class FileMetadataInfoCollector, method init.

private void init(FormatSelection selection, PlannerSettings settings, Supplier<TableScan> tableScanSupplier, List<SchemaPath> interestingColumns, int segmentColumnsCount) throws IOException {
    List<SchemaPath> metastoreInterestingColumns = Optional.ofNullable(
            basicRequests.interestingColumnsAndPartitionKeys(tableInfo).interestingColumns())
        .map(metastoreInterestingColumnNames -> metastoreInterestingColumnNames.stream()
            .map(SchemaPath::parseFromString)
            .collect(Collectors.toList()))
        .orElse(null);
    Map<String, Long> filesNamesLastModifiedTime = basicRequests.filesLastModifiedTime(tableInfo, null, null);
    List<String> newFiles = new ArrayList<>();
    List<String> updatedFiles = new ArrayList<>();
    List<String> removedFiles = new ArrayList<>(filesNamesLastModifiedTime.keySet());
    List<String> allFiles = new ArrayList<>();
    for (FileStatus fileStatus : getFileStatuses(selection)) {
        String path = Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath()).toUri().getPath();
        Long lastModificationTime = filesNamesLastModifiedTime.get(path);
        if (lastModificationTime == null) {
            newFiles.add(path);
        } else if (lastModificationTime < fileStatus.getModificationTime()) {
            updatedFiles.add(path);
        }
        removedFiles.remove(path);
        allFiles.add(path);
    }
    String selectionRoot = selection.getSelection().getSelectionRoot().toUri().getPath();
    if (!Objects.equals(metastoreInterestingColumns, interestingColumns)
            && metastoreInterestingColumns != null
            && (interestingColumns == null || !metastoreInterestingColumns.containsAll(interestingColumns))
            || TableStatisticsKind.ANALYZE_METADATA_LEVEL.getValue(basicRequests.tableMetadata(tableInfo)).compareTo(metadataLevel) != 0) {
        // do not update table scan and lists of segments / files / row groups,
        // metadata should be recalculated
        tableScan = tableScanSupplier.get();
        metadataToRemove.addAll(getMetadataInfoList(selectionRoot, removedFiles, MetadataType.SEGMENT, 0));
        return;
    }
    // checks whether there are any new, updated or removed files
    if (!newFiles.isEmpty() || !updatedFiles.isEmpty() || !removedFiles.isEmpty()) {
        List<String> scanFiles = new ArrayList<>(newFiles);
        scanFiles.addAll(updatedFiles);
        // updates scan to read updated / new files
        tableScan = getTableScan(settings, tableScanSupplier.get(), scanFiles);
        // iterates from the end;
        // takes deepest updated segments;
        // finds their parents:
        // - fetches all segments for parent level;
        // - filters segments to leave parents only;
        // obtains all child segments;
        // filters child segments down to those under the filtered parent segments
        int lastSegmentIndex = segmentColumnsCount - 1;
        List<String> scanAndRemovedFiles = new ArrayList<>(scanFiles);
        scanAndRemovedFiles.addAll(removedFiles);
        // 1. Obtain files info for files from the same folder without removed files
        // 2. Get segments for obtained files + segments for removed files
        // 3. Get parent segments
        // 4. Get other segments for the same parent segment
        // 5. Remove segments which have only removed files (matched for removedFileInfo and don't match to filesInfo)
        // 6. Do the same for parent segments
        List<MetadataInfo> allFilesInfo = getMetadataInfoList(selectionRoot, allFiles, MetadataType.FILE, 0);
        // first pass: collect updated segments, even those without files; such segments will be removed later
        List<MetadataInfo> leafSegments = getMetadataInfoList(selectionRoot, scanAndRemovedFiles, MetadataType.SEGMENT, lastSegmentIndex);
        List<MetadataInfo> removedFilesMetadata = getMetadataInfoList(selectionRoot, removedFiles, MetadataType.FILE, 0);
        List<MetadataInfo> scanFilesInfo = getMetadataInfoList(selectionRoot, scanAndRemovedFiles, MetadataType.FILE, 0);
        // files from scan + files from the same folder without removed files
        filesInfo = leafSegments.stream()
            .filter(parent -> scanFilesInfo.stream()
                .anyMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
            .flatMap(parent -> allFilesInfo.stream()
                .filter(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
            .collect(Collectors.toList());
        Multimap<Integer, MetadataInfo> allSegments = populateSegments(removedFiles, allFiles, selectionRoot, lastSegmentIndex, leafSegments, removedFilesMetadata);
        List<MetadataInfo> allRowGroupsInfo = getAllRowGroupsMetadataInfos(allFiles);
        rowGroupsInfo = allRowGroupsInfo.stream()
            .filter(child -> filesInfo.stream()
                .map(MetadataInfo::identifier)
                .anyMatch(parent -> MetadataIdentifierUtils.isMetadataKeyParent(parent, child.identifier())))
            .collect(Collectors.toList());
        List<MetadataInfo> segmentsToUpdate = getMetadataInfoList(selectionRoot, scanAndRemovedFiles, MetadataType.SEGMENT, 0);
        allMetaToHandle = Streams.concat(allSegments.values().stream(), allFilesInfo.stream(), allRowGroupsInfo.stream())
            .filter(child -> segmentsToUpdate.stream()
                .anyMatch(parent -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
            .filter(parent -> removedFilesMetadata.stream()
                .noneMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier()))
                || filesInfo.stream()
                    .anyMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
            .collect(Collectors.toList());
        // removed top-level segments are handled separately since their metadata is not overwritten when writing to the Metastore
        List<MetadataInfo> removedTopSegments = getMetadataInfoList(selectionRoot, removedFiles, MetadataType.SEGMENT, 0).stream()
            .filter(parent -> removedFilesMetadata.stream()
                .anyMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier()))
                && allFilesInfo.stream()
                    .noneMatch(child -> MetadataIdentifierUtils.isMetadataKeyParent(parent.identifier(), child.identifier())))
            .collect(Collectors.toList());
        metadataToRemove.addAll(removedTopSegments);
        segmentsToUpdate.stream().filter(segment -> !removedTopSegments.contains(segment)).forEach(allMetaToHandle::add);
    } else {
        // table metadata may still be actual
        outdated = false;
    }
}
Also used : SchemalessScan(org.apache.drill.exec.physical.base.SchemalessScan) MetadataType(org.apache.drill.metastore.metadata.MetadataType) TableScan(org.apache.calcite.rel.core.TableScan) Arrays(java.util.Arrays) TableInfo(org.apache.drill.metastore.metadata.TableInfo) MetastoreColumn(org.apache.drill.metastore.MetastoreColumn) FileSystem(org.apache.hadoop.fs.FileSystem) DrillRel(org.apache.drill.exec.planner.logical.DrillRel) Streams(org.apache.drill.shaded.guava.com.google.common.collect.Streams) DrillScanRel(org.apache.drill.exec.planner.logical.DrillScanRel) MetadataInfo(org.apache.drill.metastore.metadata.MetadataInfo) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) ArrayListMultimap(org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) FileStatus(org.apache.hadoop.fs.FileStatus) DrillTable(org.apache.drill.exec.planner.logical.DrillTable) ColumnExplorer(org.apache.drill.exec.store.ColumnExplorer) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) FormatSelection(org.apache.drill.exec.store.dfs.FormatSelection) ImpersonationUtil(org.apache.drill.exec.util.ImpersonationUtil) Path(org.apache.hadoop.fs.Path) FileSelection(org.apache.drill.exec.store.dfs.FileSelection) Multimap(org.apache.drill.shaded.guava.com.google.common.collect.Multimap) TableStatisticsKind(org.apache.drill.metastore.statistics.TableStatisticsKind) PartitionLocation(org.apache.drill.exec.planner.PartitionLocation) BasicTablesRequests(org.apache.drill.metastore.components.tables.BasicTablesRequests) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) DrillFileSystemUtil(org.apache.drill.exec.util.DrillFileSystemUtil) List(java.util.List) Lists(org.apache.drill.shaded.guava.com.google.common.collect.Lists) FileSystemPartitionDescriptor(org.apache.drill.exec.planner.FileSystemPartitionDescriptor) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) Optional(java.util.Optional) Collections(java.util.Collections)
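
The populateSegments helper above returns a Multimap<Integer, MetadataInfo>, i.e. segment metadata keyed by nesting level, but its body is not shown on this page. As a rough sketch of why that shape is natural, one could imagine a hypothetical helper that reuses getMetadataInfoList from init() and walks the levels from the deepest segment up to the top. The method name collectSegmentsPerLevel and the level-by-level loop below are invented for illustration, not taken from Drill's source.

import java.util.List;

import org.apache.drill.metastore.metadata.MetadataInfo;
import org.apache.drill.metastore.metadata.MetadataType;
import org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap;
import org.apache.drill.shaded.guava.com.google.common.collect.Multimap;

// Hypothetical sketch; collectSegmentsPerLevel is an invented name and the
// level-by-level loop is an assumption about what populateSegments computes.
private Multimap<Integer, MetadataInfo> collectSegmentsPerLevel(String selectionRoot, List<String> files, int lastSegmentIndex) {
    Multimap<Integer, MetadataInfo> segmentsPerLevel = ArrayListMultimap.create();
    // Walk from the deepest (leaf) segment level up to the top-level segments,
    // keying each batch of segment metadata by its nesting depth.
    for (int level = lastSegmentIndex; level >= 0; level--) {
        segmentsPerLevel.putAll(level, getMetadataInfoList(selectionRoot, files, MetadataType.SEGMENT, level));
    }
    return segmentsPerLevel;
}

Keying by level lets a caller pull all segments of one depth at once, while allSegments.values() in init() flattens every level again when building allMetaToHandle.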

Aggregations

Multimap (org.apache.drill.shaded.guava.com.google.common.collect.Multimap) 7
IOException (java.io.IOException) 6
List (java.util.List) 6
ArrayList (java.util.ArrayList) 5
Map (java.util.Map) 5
Collectors (java.util.stream.Collectors) 5
SchemaPath (org.apache.drill.common.expression.SchemaPath) 5
PlannerSettings (org.apache.drill.exec.planner.physical.PlannerSettings) 5
MetadataType (org.apache.drill.metastore.metadata.MetadataType) 5
ArrayListMultimap (org.apache.drill.shaded.guava.com.google.common.collect.ArrayListMultimap) 5
Logger (org.slf4j.Logger) 5
LoggerFactory (org.slf4j.LoggerFactory) 5
Collection (java.util.Collection) 4
Collections (java.util.Collections) 4
Path (org.apache.hadoop.fs.Path) 4
HashSet (java.util.HashSet) 3
Function (java.util.function.Function) 3
TableStatisticsKind (org.apache.drill.metastore.statistics.TableStatisticsKind) 3
FileSystem (org.apache.hadoop.fs.FileSystem) 3
Arrays (java.util.Arrays) 2