Search in sources :

Example 1 with RemoteIteratorWithFilter

Use of org.apache.hadoop.hive.metastore.utils.FileUtils.RemoteIteratorWithFilter in the project hive by apache.

From the class UpgradeTool, method handleRenameFiles.

/**
 * Moves pre-upgrade data files under {@code p} into the ACID delta-directory layout so the
 * table can be converted to transactional.
 * Assumes https://issues.apache.org/jira/browse/HIVE-19750 is in.
 * How does this work with Storage Based Auth?
 * @param t the table being upgraded; used for qualified-name error/log messages
 * @param p partition root or table root if not partitioned
 * @param execute if {@code true}, actually perform the mkdir/rename operations; in either case
 *                the equivalent FS commands are recorded via {@code pw}
 * @param conf used to obtain the {@link FileSystem}
 * @param isBucketed whether the table is bucketed; bucketed tables only allow moving
 *                   copy_N files into matching delta dirs (files can't cross buckets)
 * @param pw sink for the script of FS commands that were (or would be) performed
 * @throws IOException on FileSystem failures
 */
static void handleRenameFiles(Table t, Path p, boolean execute, Configuration conf, boolean isBucketed, PrintWriter pw) throws IOException {
    if (isBucketed) {
        /* For bucketed tables we assume that Hive wrote them and 0000M_0 and 0000M_0_copy_8
      are the only possibilities.  Since we can't move files across buckets the only thing we
      can do is put 0000M_0_copy_N into delta_N_N as 0000M_0.

      If M > 4096 - should error out - better yet, make this table external one - can those
      be bucketed?  don't think so
      */
        // copyNumber -> files carrying that copy number; each group lands in delta_N_N
        Map<Integer, List<Path>> deltaToFileMap = new HashMap<>();
        FileSystem fs = FileSystem.get(conf);
        RemoteIteratorWithFilter iter = new RemoteIteratorWithFilter(fs.listFiles(p, true), RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER);
        while (iter.hasNext()) {
            LocatedFileStatus lfs = iter.next();
            if (lfs.isDirectory()) {
                // bucketed tables must be flat; a subdirectory means we can't safely convert
                String msg = Warehouse.getQualifiedName(t) + " is bucketed and has a subdirectory: " + lfs.getPath();
                LOG.error(msg);
                throw new IllegalStateException(msg);
            }
            AcidUtils.BucketMetaData bmd = AcidUtils.BucketMetaData.parse(lfs.getPath());
            if (bmd.bucketId < 0) {
                // non-standard file name - don't know what bucket the rows belong to and we can't
                // rename the file so that it may end up treated like a different bucket id
                String msg = "Bucketed table " + Warehouse.getQualifiedName(t) + " contains file " + lfs.getPath() + " with non-standard name";
                LOG.error(msg);
                throw new IllegalArgumentException(msg);
            } else {
                if (bmd.bucketId > BucketCodec.MAX_BUCKET_ID) {
                    String msg = "Bucketed table " + Warehouse.getQualifiedName(t) + " contains file " + lfs.getPath() + " with bucketId=" + bmd.bucketId + " that is out of range";
                    LOG.error(msg);
                    throw new IllegalArgumentException(msg);
                }
                if (bmd.copyNumber > 0) {
                    // lambda replaces the original anonymous Function class (which carried
                    // a "// lambda?" TODO); behavior identical
                    deltaToFileMap.computeIfAbsent(bmd.copyNumber, k -> new ArrayList<>()).add(lfs.getPath());
                }
            }
        }
        if (!deltaToFileMap.isEmpty()) {
            println(pw, "#Begin file renames for bucketed table " + Warehouse.getQualifiedName(t));
        }
        for (Map.Entry<Integer, List<Path>> ent : deltaToFileMap.entrySet()) {
            /* create delta and move each files to it.  HIVE-19750 ensures we have reserved
         * enough write IDs to do this.*/
            Path deltaDir = new Path(p, AcidUtils.deltaSubdir(ent.getKey(), ent.getKey()));
            if (execute) {
                if (!fs.mkdirs(deltaDir)) {
                    String msg = "Failed to create directory " + deltaDir;
                    LOG.error(msg);
                    throw new IllegalStateException(msg);
                }
            }
            // Add to list of FS commands
            makeDirectoryCommand(deltaDir, pw);
            for (Path file : ent.getValue()) {
                // drop the _copy_N suffix so the file looks like a normal bucket file
                Path newFile = new Path(deltaDir, stripCopySuffix(file.getName()));
                LOG.debug("need to rename: " + file + " to " + newFile);
                if (fs.exists(newFile)) {
                    String msg = Warehouse.getQualifiedName(t) + ": " + newFile + " already exists?!";
                    LOG.error(msg);
                    throw new IllegalStateException(msg);
                }
                if (execute) {
                    if (!fs.rename(file, newFile)) {
                        String msg = Warehouse.getQualifiedName(t) + ": " + newFile + ": failed to rename";
                        LOG.error(msg);
                        throw new IllegalStateException(msg);
                    }
                }
                // do this with and w/o execute to know what was done
                makeRenameCommand(file, newFile, pw);
            }
        }
        if (!deltaToFileMap.isEmpty()) {
            println(pw, "#End file renames for bucketed table " + Warehouse.getQualifiedName(t));
        }
        return;
    }
    // ---- unbucketed table path: spread files across delta dirs / synthetic bucket names ----
    List<RenamePair> renames = new ArrayList<>();
    FileSystem fs = FileSystem.get(conf);
    RemoteIteratorWithFilter iter = new RemoteIteratorWithFilter(fs.listFiles(p, true), RemoteIteratorWithFilter.HIDDEN_FILES_FULL_PATH_FILTER);
    /**
     * count some heuristics - bad file is something not in {@link AcidUtils#ORIGINAL_PATTERN} or
     * {@link AcidUtils#ORIGINAL_PATTERN_COPY} format.  This has to be renamed for acid to work.
     */
    int numBadFileNames = 0;
    /**
     * count some heuristics - num files in {@link AcidUtils#ORIGINAL_PATTERN_COPY} format.  These
     * are supported but if there are a lot of them there will be a perf hit on read until
     * major compaction
     */
    int numCopyNFiles = 0;
    // ordinal of the file in the iterator
    int fileId = 0;
    long numBytesInPartition = getDataSize(p, conf);
    int numBuckets = guessNumBuckets(numBytesInPartition);
    while (iter.hasNext()) {
        LocatedFileStatus lfs = iter.next();
        if (lfs.isDirectory()) {
            continue;
        }
        AcidUtils.BucketMetaData bmd = AcidUtils.BucketMetaData.parse(lfs.getPath());
        if (bmd.bucketId < 0) {
            numBadFileNames++;
        }
        if (bmd.copyNumber > 0) {
            // todo: what about same file name in subdir like Union All?  ROW_ID generation will handle it
            // but will have to look at ORC footers - treat these as copyN files?
            numCopyNFiles++;
        }
        // start with delta_1 (not delta_0)
        int writeId = fileId / numBuckets + 1;
        Path deltaDir = new Path(p, AcidUtils.deltaSubdir(writeId, writeId));
        // NOTE(review): delta dirs are created here even if the heuristic below later decides no
        // renames are needed - confirm that leaving empty delta dirs behind is intended
        if (execute) {
            if (!fs.mkdirs(deltaDir)) {
                String msg = "Failed to create directory " + deltaDir;
                LOG.error(msg);
                throw new IllegalStateException(msg);
            }
        }
        // Add to list of FS commands
        makeDirectoryCommand(deltaDir, pw);
        Path newPath = new Path(deltaDir, String.format(AcidUtils.BUCKET_DIGITS, fileId % numBuckets) + "_0");
        /*we could track reason for rename in RenamePair so that the decision can be made later to
       rename or not.  For example, if we need to minimize renames (say we are on S3), then we'd
        only rename if it's absolutely required, i.e. if it's a 'bad file name'*/
        renames.add(new RenamePair(lfs.getPath(), newPath));
        fileId++;
    }
    if (numBadFileNames <= 0 && numCopyNFiles <= 0) {
        // help 3.0 Compactor generate more balanced splits
        return;
    }
    if (!renames.isEmpty()) {
        println(pw, "#Begin file renames for unbucketed table " + Warehouse.getQualifiedName(t));
    }
    for (RenamePair renamePair : renames) {
        LOG.debug("need to rename: " + renamePair.getOldPath() + " to " + renamePair.getNewPath());
        if (fs.exists(renamePair.getNewPath())) {
            String msg = Warehouse.getQualifiedName(t) + ": " + renamePair.getNewPath() + " already exists?!";
            LOG.error(msg);
            throw new IllegalStateException(msg);
        }
        if (execute) {
            if (!fs.rename(renamePair.getOldPath(), renamePair.getNewPath())) {
                String msg = Warehouse.getQualifiedName(t) + ": " + renamePair.getNewPath() + ": failed to rename";
                LOG.error(msg);
                throw new IllegalStateException(msg);
            }
        }
        // do this with and w/o execute to know what was done
        makeRenameCommand(renamePair.getOldPath(), renamePair.getNewPath(), pw);
    }
    if (!renames.isEmpty()) {
        println(pw, "#End file renames for unbucketed table " + Warehouse.getQualifiedName(t));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Function(java.util.function.Function) RemoteIteratorWithFilter(org.apache.hadoop.hive.metastore.utils.FileUtils.RemoteIteratorWithFilter) FileSystem(org.apache.hadoop.fs.FileSystem) List(java.util.List) ArrayList(java.util.ArrayList) Map(java.util.Map) HashMap(java.util.HashMap) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)

Aggregations

ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Function (java.util.function.Function)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus)1 Path (org.apache.hadoop.fs.Path)1 RemoteIteratorWithFilter (org.apache.hadoop.hive.metastore.utils.FileUtils.RemoteIteratorWithFilter)1 AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils)1