
Example 1 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class AcidUtils method getAcidState.

public static Directory getAcidState(Path directory, Configuration conf, ValidTxnList txnList, Ref<Boolean> useFileIds, boolean ignoreEmptyFiles) throws IOException {
    FileSystem fs = directory.getFileSystem(conf);
    // The following 'deltas' includes all kinds of delta files including insert & delete deltas.
    final List<ParsedDelta> deltas = new ArrayList<ParsedDelta>();
    List<ParsedDelta> working = new ArrayList<ParsedDelta>();
    List<FileStatus> originalDirectories = new ArrayList<FileStatus>();
    final List<FileStatus> obsolete = new ArrayList<FileStatus>();
    List<HdfsFileStatusWithId> childrenWithId = null;
    Boolean val = useFileIds.value;
    if (val == null || val) {
        try {
            childrenWithId = SHIMS.listLocatedHdfsStatus(fs, directory, hiddenFileFilter);
            if (val == null) {
                useFileIds.value = true;
            }
        } catch (Throwable t) {
            LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
            if (val == null && t instanceof UnsupportedOperationException) {
                useFileIds.value = false;
            }
        }
    }
    TxnBase bestBase = new TxnBase();
    final List<HdfsFileStatusWithId> original = new ArrayList<>();
    if (childrenWithId != null) {
        for (HdfsFileStatusWithId child : childrenWithId) {
            getChildState(child.getFileStatus(), child, txnList, working, originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles);
        }
    } else {
        List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, directory, hiddenFileFilter);
        for (FileStatus child : children) {
            getChildState(child, null, txnList, working, originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles);
        }
    }
    // If we have a base, the original files are obsolete.
    if (bestBase.status != null) {
        // Add original files to obsolete list if any
        for (HdfsFileStatusWithId fswid : original) {
            obsolete.add(fswid.getFileStatus());
        }
        // Add original directories to obsolete list if any
        obsolete.addAll(originalDirectories);
        // remove the entries so we don't get confused later and think we should
        // use them.
        original.clear();
        originalDirectories.clear();
    } else {
        // No suitable base was found, so these originals are still needed: recurse through
        // the original directories to collect the original files we really need.
        for (FileStatus origDir : originalDirectories) {
            findOriginals(fs, origDir, original, useFileIds);
        }
    }
    Collections.sort(working);
    //so now, 'working' should be sorted like delta_5_20 delta_5_10 delta_11_20 delta_51_60 for example
    //and we want to end up with the best set containing all relevant data: delta_5_20 delta_51_60,
    //subject to list of 'exceptions' in 'txnList' (not shown in the above example).
    long current = bestBase.txn;
    int lastStmtId = -1;
    ParsedDelta prev = null;
    for (ParsedDelta next : working) {
        if (next.maxTransaction > current) {
            // are any of the new transactions ones that we care about?
            if (txnList.isTxnRangeValid(current + 1, next.maxTransaction) != ValidTxnList.RangeResponse.NONE) {
                deltas.add(next);
                current = next.maxTransaction;
                lastStmtId = next.statementId;
                prev = next;
            }
        } else if (next.maxTransaction == current && lastStmtId >= 0) {
            //make sure to get all deltas within a single transaction; a multi-statement txn
            //generates multiple delta files with the same txnId range
            //of course, if maxTransaction has already been minor compacted, all per statement deltas are obsolete
            deltas.add(next);
            prev = next;
        } else if (prev != null && next.maxTransaction == prev.maxTransaction && next.minTransaction == prev.minTransaction && next.statementId == prev.statementId) {
            // The 'next' parsedDelta may have everything equal to the 'prev' parsedDelta, except
            // the path. This may happen when we have split update and we have two types of delta
            // directories- 'delta_x_y' and 'delete_delta_x_y' for the SAME txn range.
            // Also note that any delete_deltas in between a given delta_x_y range would be made
            // obsolete. For example, a delta_30_50 would make delete_delta_40_40 obsolete.
            // This is valid because minor compaction always compacts the normal deltas and the delete
            // deltas for the same range. That is, if we had 3 directories, delta_30_30,
            // delete_delta_40_40 and delta_50_50, then running minor compaction would produce
            // delta_30_50 and delete_delta_30_50.
            deltas.add(next);
            prev = next;
        } else {
            obsolete.add(next.path);
        }
    }
    if (bestBase.oldestBase != null && bestBase.status == null) {
        /**
         * If here, it means there was a base_x (> 1 perhaps) but none were suitable for given
         * {@link txnList}.  Note that 'original' files are logically a base_Long.MIN_VALUE and thus
         * cannot have any data for an open txn.  We could check {@link deltas} has files to cover
         * [1,n] w/o gaps but this would almost never happen...
         */
        long[] exceptions = txnList.getInvalidTransactions();
        String minOpenTxn = exceptions != null && exceptions.length > 0 ? Long.toString(exceptions[0]) : "x";
        throw new IOException(ErrorMsg.ACID_NOT_ENOUGH_HISTORY.format(Long.toString(txnList.getHighWatermark()), minOpenTxn, bestBase.oldestBase.toString()));
    }
    final Path base = bestBase.status == null ? null : bestBase.status.getPath();
    LOG.debug("in directory " + directory.toUri().toString() + " base = " + base + " deltas = " + deltas.size());
    return new Directory() {

        @Override
        public Path getBaseDirectory() {
            return base;
        }

        @Override
        public List<HdfsFileStatusWithId> getOriginalFiles() {
            return original;
        }

        @Override
        public List<ParsedDelta> getCurrentDirectories() {
            return deltas;
        }

        @Override
        public List<FileStatus> getObsolete() {
            return obsolete;
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileSystem(org.apache.hadoop.fs.FileSystem)
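
For orientation, a minimal caller sketch for the variant above follows. It only relies on members visible in this example (getAcidState, Directory.getBaseDirectory/getOriginalFiles/getCurrentDirectories/getObsolete, HdfsFileStatusWithId.getFileStatus); the helper name summarizeAcidState is hypothetical, and the Ref import is assumed to be org.apache.hive.common.util.Ref, the same type used in the getAcidState signature above.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId;
// Package of Ref is an assumption; it is the Ref<Boolean> passed into getAcidState above.
import org.apache.hive.common.util.Ref;

class AcidStateCallerSketch {

    // Hypothetical helper: take an ACID snapshot of a table/partition directory and print
    // what a reader would consume from it.
    static void summarizeAcidState(Path tableDir, Configuration conf, ValidTxnList txnList,
                                   Ref<Boolean> useFileIds) throws IOException {
        AcidUtils.Directory state = AcidUtils.getAcidState(tableDir, conf, txnList, useFileIds, true);
        // Base chosen for this snapshot, or null when only original/delta files exist.
        Path base = state.getBaseDirectory();
        // Pre-ACID ("original") files; each entry wraps a FileStatus (plus a file id when available).
        for (HdfsFileStatusWithId orig : state.getOriginalFiles()) {
            FileStatus fileStatus = orig.getFileStatus();
            System.out.println("original: " + fileStatus.getPath() + " (" + fileStatus.getLen() + " bytes)");
        }
        // Deltas still relevant for txnList, plus everything a cleaner could drop.
        System.out.println("base=" + base + ", deltas=" + state.getCurrentDirectories().size()
            + ", obsolete=" + state.getObsolete().size());
    }
}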

Example 2 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class OrcInputFormat method determineSplitStrategies.

@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context, FileSystem fs, Path dir, AcidUtils.Directory dirInfo, List<AcidBaseFileInfo> baseFiles, List<ParsedDelta> parsedDeltas, List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) {
    List<SplitStrategy<?>> splitStrategies = new ArrayList<SplitStrategy<?>>();
    SplitStrategy<?> splitStrategy;
    // When there are no base files, just generate a single split strategy and return.
    List<HdfsFileStatusWithId> acidSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
    if (baseFiles.isEmpty()) {
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo, acidSchemaFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
        // return here
        return splitStrategies;
    }
    List<HdfsFileStatusWithId> originalSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
    // Separate the base files into acid schema and non-acid(original) schema files.
    for (AcidBaseFileInfo acidBaseFileInfo : baseFiles) {
        if (acidBaseFileInfo.isOriginal()) {
            originalSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
        } else {
            acidSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
        }
    }
    // Generate split strategy for non-acid schema original files, if any.
    if (!originalSchemaFiles.isEmpty()) {
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo, originalSchemaFiles, true, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
    }
    // Generate split strategy for acid schema files, if any.
    if (!acidSchemaFiles.isEmpty()) {
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo, acidSchemaFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
    }
    return splitStrategies;
}
Also used : AcidBaseFileInfo(org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
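
The baseFiles fed into this method are built from HdfsFileStatusWithId instances, which makes the method easy to drive from tests. Below is a minimal test double, assuming the shim interface declares only getFileStatus() and a nullable getFileId(); the class name is illustrative. Wrapped instances can then be placed into AcidBaseFileInfo-style lists, flagged original or not, to exercise both branches of the loop above.

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId;

// Hypothetical test double: wraps a plain FileStatus when no HDFS inode id is available.
class PlainFileStatusWithId implements HdfsFileStatusWithId {

    private final FileStatus fileStatus;

    PlainFileStatusWithId(FileStatus fileStatus) {
        this.fileStatus = fileStatus;
    }

    @Override
    public FileStatus getFileStatus() {
        return fileStatus;
    }

    @Override
    public Long getFileId() {
        // No real file id known; callers either fall back to path identity or, as in the
        // split generation above, may use synthetic ids (allowSyntheticFileIds).
        return null;
    }
}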

Example 3 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class AcidUtils method getAcidState.

public static Directory getAcidState(Path directory, Configuration conf, ValidWriteIdList writeIdList, Ref<Boolean> useFileIds, boolean ignoreEmptyFiles, Map<String, String> tblproperties) throws IOException {
    FileSystem fs = directory.getFileSystem(conf);
    // The following 'deltas' includes all kinds of delta files including insert & delete deltas.
    final List<ParsedDelta> deltas = new ArrayList<ParsedDelta>();
    List<ParsedDelta> working = new ArrayList<ParsedDelta>();
    List<FileStatus> originalDirectories = new ArrayList<FileStatus>();
    final List<FileStatus> obsolete = new ArrayList<FileStatus>();
    final List<FileStatus> abortedDirectories = new ArrayList<>();
    List<HdfsFileStatusWithId> childrenWithId = null;
    Boolean val = useFileIds.value;
    if (val == null || val) {
        try {
            childrenWithId = SHIMS.listLocatedHdfsStatus(fs, directory, hiddenFileFilter);
            if (val == null) {
                useFileIds.value = true;
            }
        } catch (Throwable t) {
            LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
            if (val == null && t instanceof UnsupportedOperationException) {
                useFileIds.value = false;
            }
        }
    }
    TxnBase bestBase = new TxnBase();
    final List<HdfsFileStatusWithId> original = new ArrayList<>();
    if (childrenWithId != null) {
        for (HdfsFileStatusWithId child : childrenWithId) {
            getChildState(child.getFileStatus(), child, writeIdList, working, originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles, abortedDirectories, tblproperties, fs);
        }
    } else {
        List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, directory, hiddenFileFilter);
        for (FileStatus child : children) {
            getChildState(child, null, writeIdList, working, originalDirectories, original, obsolete, bestBase, ignoreEmptyFiles, abortedDirectories, tblproperties, fs);
        }
    }
    // If we have a base, the original files are obsolete.
    if (bestBase.status != null) {
        // Add original files to obsolete list if any
        for (HdfsFileStatusWithId fswid : original) {
            obsolete.add(fswid.getFileStatus());
        }
        // Add original directories to obsolete list if any
        obsolete.addAll(originalDirectories);
        // remove the entries so we don't get confused later and think we should
        // use them.
        original.clear();
        originalDirectories.clear();
    } else {
        // No suitable base was found, so these originals are still needed: recurse through
        // the original directories to collect the original files we really need.
        for (FileStatus origDir : originalDirectories) {
            findOriginals(fs, origDir, original, useFileIds, ignoreEmptyFiles);
        }
    }
    Collections.sort(working);
    // so now, 'working' should be sorted like delta_5_20 delta_5_10 delta_11_20 delta_51_60 for example
    // and we want to end up with the best set containing all relevant data: delta_5_20 delta_51_60,
    // subject to list of 'exceptions' in 'writeIdList' (not shown in the above example).
    long current = bestBase.writeId;
    int lastStmtId = -1;
    ParsedDelta prev = null;
    for (ParsedDelta next : working) {
        if (next.maxWriteId > current) {
            // are any of the new transactions ones that we care about?
            if (writeIdList.isWriteIdRangeValid(current + 1, next.maxWriteId) != ValidWriteIdList.RangeResponse.NONE) {
                deltas.add(next);
                current = next.maxWriteId;
                lastStmtId = next.statementId;
                prev = next;
            }
        } else if (next.maxWriteId == current && lastStmtId >= 0) {
            // make sure to get all deltas within a single transaction; a multi-statement txn
            // generates multiple delta files with the same txnId range
            // of course, if maxWriteId has already been minor compacted, all per statement deltas are obsolete
            deltas.add(next);
            prev = next;
        } else if (prev != null && next.maxWriteId == prev.maxWriteId && next.minWriteId == prev.minWriteId && next.statementId == prev.statementId) {
            // The 'next' parsedDelta may have everything equal to the 'prev' parsedDelta, except
            // the path. This may happen when we have split update and we have two types of delta
            // directories- 'delta_x_y' and 'delete_delta_x_y' for the SAME txn range.
            // Also note that any delete_deltas in between a given delta_x_y range would be made
            // obsolete. For example, a delta_30_50 would make delete_delta_40_40 obsolete.
            // This is valid because minor compaction always compacts the normal deltas and the delete
            // deltas for the same range. That is, if we had 3 directories, delta_30_30,
            // delete_delta_40_40 and delta_50_50, then running minor compaction would produce
            // delta_30_50 and delete_delta_30_50.
            deltas.add(next);
            prev = next;
        } else {
            obsolete.add(next.path);
        }
    }
    if (bestBase.oldestBase != null && bestBase.status == null) {
        /**
         * If here, it means there was a base_x (> 1 perhaps) but none were suitable for given
         * {@link writeIdList}.  Note that 'original' files are logically a base_Long.MIN_VALUE and thus
         * cannot have any data for an open txn.  We could check {@link deltas} has files to cover
         * [1,n] w/o gaps but this would almost never happen...
         */
        long[] exceptions = writeIdList.getInvalidWriteIds();
        String minOpenWriteId = exceptions != null && exceptions.length > 0 ? Long.toString(exceptions[0]) : "x";
        throw new IOException(ErrorMsg.ACID_NOT_ENOUGH_HISTORY.format(Long.toString(writeIdList.getHighWatermark()), minOpenWriteId, bestBase.oldestBase.toString()));
    }
    final Path base = bestBase.status == null ? null : bestBase.status.getPath();
    LOG.debug("in directory " + directory.toUri().toString() + " base = " + base + " deltas = " + deltas.size());
    /**
     * If this sort order is changed and there are tables that have been converted to transactional
     * and have had any update/delete/merge operations performed but not yet MAJOR compacted, it
     * may result in data loss since it may change how
     * {@link org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.OriginalReaderPair} assigns
     * {@link RecordIdentifier#rowId} for read (that have happened) and compaction (yet to happen).
     */
    Collections.sort(original, (HdfsFileStatusWithId o1, HdfsFileStatusWithId o2) -> {
        // this does "Path.uri.compareTo(that.uri)"
        return o1.getFileStatus().compareTo(o2.getFileStatus());
    });
    // Note: isRawFormat is invalid for non-ORC tables. It will always return true, so we're good.
    final boolean isBaseInRawFormat = base != null && MetaDataFile.isRawFormat(base, fs);
    return new Directory() {

        @Override
        public Path getBaseDirectory() {
            return base;
        }

        @Override
        public boolean isBaseInRawFormat() {
            return isBaseInRawFormat;
        }

        @Override
        public List<HdfsFileStatusWithId> getOriginalFiles() {
            return original;
        }

        @Override
        public List<ParsedDelta> getCurrentDirectories() {
            return deltas;
        }

        @Override
        public List<FileStatus> getObsolete() {
            return obsolete;
        }

        @Override
        public List<FileStatus> getAbortedDirectories() {
            return abortedDirectories;
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileSystem(org.apache.hadoop.fs.FileSystem)
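
This variant also exposes getAbortedDirectories() and isBaseInRawFormat(). A minimal consumer sketch for the aborted-directory list follows; it only illustrates the accessor, not Hive's actual Cleaner logic, and the class/method names are hypothetical.

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.ql.io.AcidUtils;

class AbortedDirSketch {

    // Hypothetical cleanup pass: remove directories that belong to aborted write ids.
    static void dropAborted(FileSystem fs, AcidUtils.Directory state) throws IOException {
        for (FileStatus aborted : state.getAbortedDirectories()) {
            // Recursive delete of the whole directory written by the aborted transaction.
            fs.delete(aborted.getPath(), true);
        }
    }
}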

Example 4 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class CompactorMR method run.

/**
 * Run Compaction which may consist of several jobs on the cluster.
 * @param conf Hive configuration file
 * @param jobName name to run this job with
 * @param t metastore table
 * @param sd metastore storage descriptor
 * @param writeIds list of valid write ids
 * @param ci CompactionInfo
 * @throws java.io.IOException if the job fails
 */
void run(HiveConf conf, String jobName, Table t, StorageDescriptor sd, ValidWriteIdList writeIds, CompactionInfo ci, Worker.StatsUpdater su, TxnStore txnHandler) throws IOException {
    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_IN_TEST) && conf.getBoolVar(HiveConf.ConfVars.HIVETESTMODEFAILCOMPACTION)) {
        throw new RuntimeException(HiveConf.ConfVars.HIVETESTMODEFAILCOMPACTION.name() + "=true");
    }
    // We just need to delete the directories for aborted transactions.
    if (AcidUtils.isInsertOnlyTable(t.getParameters())) {
        LOG.debug("Going to delete directories for aborted transactions for MM table " + t.getDbName() + "." + t.getTableName());
        removeFiles(conf, sd.getLocation(), writeIds, t);
        return;
    }
    JobConf job = createBaseJobConf(conf, jobName, t, sd, writeIds, ci);
    // Figure out and encode what files we need to read.  We do this here (rather than in
    // getSplits below) because as part of this we discover our minimum and maximum transactions,
    // and discovering that in getSplits is too late as we then have no way to pass it to our
    // mapper.
    AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, writeIds, false, true);
    List<AcidUtils.ParsedDelta> parsedDeltas = dir.getCurrentDirectories();
    int maxDeltastoHandle = conf.getIntVar(HiveConf.ConfVars.COMPACTOR_MAX_NUM_DELTA);
    if (parsedDeltas.size() > maxDeltastoHandle) {
        /**
         * if here, that means we have a very high number of delta files.  This may be a sign of a temporary
         * glitch or a real issue.  For example, if transaction batch size or transaction size is set too
         * low for the event flow rate in Streaming API, it may generate lots of delta files very
         * quickly.  Another possibility is that Compaction is repeatedly failing and not actually compacting.
         * Thus, force N minor compactions first to reduce number of deltas and then follow up with
         * the compaction actually requested in {@link ci} which now needs to compact a lot fewer deltas
         */
        LOG.warn(parsedDeltas.size() + " delta files found for " + ci.getFullPartitionName() + " located at " + sd.getLocation() + "! This is likely a sign of misconfiguration, " + "especially if this message repeats.  Check that compaction is running properly.  Check for any " + "runaway/mis-configured process writing to ACID tables, especially using Streaming Ingest API.");
        int numMinorCompactions = parsedDeltas.size() / maxDeltastoHandle;
        for (int jobSubId = 0; jobSubId < numMinorCompactions; jobSubId++) {
            JobConf jobMinorCompact = createBaseJobConf(conf, jobName + "_" + jobSubId, t, sd, writeIds, ci);
            launchCompactionJob(jobMinorCompact, null, CompactionType.MINOR, null, parsedDeltas.subList(jobSubId * maxDeltastoHandle, (jobSubId + 1) * maxDeltastoHandle), maxDeltastoHandle, -1, conf, txnHandler, ci.id, jobName);
        }
        // now recompute state since we've done minor compactions and have different 'best' set of deltas
        dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, writeIds);
    }
    StringableList dirsToSearch = new StringableList();
    Path baseDir = null;
    if (ci.isMajorCompaction()) {
        // There may not be a base dir if the partition was empty before inserts or if this
        // partition is just now being converted to ACID.
        baseDir = dir.getBaseDirectory();
        if (baseDir == null) {
            List<HdfsFileStatusWithId> originalFiles = dir.getOriginalFiles();
            if (originalFiles != null && !originalFiles.isEmpty()) {
                // There are original format files
                for (HdfsFileStatusWithId stat : originalFiles) {
                    Path path = stat.getFileStatus().getPath();
                    // note that originalFiles are all original files recursively not dirs
                    dirsToSearch.add(path);
                    LOG.debug("Adding original file " + path + " to dirs to search");
                }
                // Set base to the location so that the input format reads the original files.
                baseDir = new Path(sd.getLocation());
            }
        } else {
            // add our base to the list of directories to search for files in.
            LOG.debug("Adding base directory " + baseDir + " to dirs to search");
            dirsToSearch.add(baseDir);
        }
    }
    if (parsedDeltas.size() == 0 && dir.getOriginalFiles().size() == 0) {
        // Skip compaction if there are no delta files AND no original files
        String minOpenInfo = ".";
        if (writeIds.getMinOpenWriteId() != null) {
            minOpenInfo = " with min Open " + JavaUtils.writeIdToString(writeIds.getMinOpenWriteId()) + ".  Compaction cannot compact above this writeId";
        }
        LOG.error("No delta files or original files found to compact in " + sd.getLocation() + " for compactionId=" + ci.id + minOpenInfo);
        return;
    }
    launchCompactionJob(job, baseDir, ci.type, dirsToSearch, dir.getCurrentDirectories(), dir.getCurrentDirectories().size(), dir.getObsolete().size(), conf, txnHandler, ci.id, jobName);
    su.gatherStats();
}
Also used : Path(org.apache.hadoop.fs.Path) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) JobConf(org.apache.hadoop.mapred.JobConf) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)
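
To make the batching arithmetic above concrete: with 2,500 deltas and COMPACTOR_MAX_NUM_DELTA at 1,000, the integer division yields two full minor-compaction batches, and the remaining 500 deltas are handled by the recomputed compaction that follows. Below is a generic sketch of the same slicing, with illustrative names only.

import java.util.ArrayList;
import java.util.List;

class DeltaBatchingSketch {

    // Mirrors the subList slicing above: each full batch of maxDeltasToHandle deltas gets its
    // own minor-compaction job; the remainder (size % maxDeltasToHandle) is left for the
    // follow-up compaction. subList returns views, which is fine for read-only iteration.
    static <T> List<List<T>> fullBatches(List<T> parsedDeltas, int maxDeltasToHandle) {
        List<List<T>> batches = new ArrayList<>();
        int numMinorCompactions = parsedDeltas.size() / maxDeltasToHandle;  // integer division
        for (int jobSubId = 0; jobSubId < numMinorCompactions; jobSubId++) {
            batches.add(parsedDeltas.subList(jobSubId * maxDeltasToHandle,
                                             (jobSubId + 1) * maxDeltasToHandle));
        }
        return batches;
    }
}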

Example 5 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class Initiator method determineCompactionType.

private CompactionType determineCompactionType(CompactionInfo ci, ValidWriteIdList writeIds, StorageDescriptor sd, Map<String, String> tblproperties) throws IOException, InterruptedException {
    boolean noBase = false;
    Path location = new Path(sd.getLocation());
    FileSystem fs = location.getFileSystem(conf);
    AcidUtils.Directory dir = AcidUtils.getAcidState(location, conf, writeIds, false, false);
    Path base = dir.getBaseDirectory();
    long baseSize = 0;
    FileStatus stat = null;
    if (base != null) {
        stat = fs.getFileStatus(base);
        if (!stat.isDir()) {
            LOG.error("Was assuming base " + base.toString() + " is directory, but it's a file!");
            return null;
        }
        baseSize = sumDirSize(fs, base);
    }
    List<HdfsFileStatusWithId> originals = dir.getOriginalFiles();
    for (HdfsFileStatusWithId origStat : originals) {
        baseSize += origStat.getFileStatus().getLen();
    }
    long deltaSize = 0;
    List<AcidUtils.ParsedDelta> deltas = dir.getCurrentDirectories();
    for (AcidUtils.ParsedDelta delta : deltas) {
        stat = fs.getFileStatus(delta.getPath());
        if (!stat.isDir()) {
            LOG.error("Was assuming delta " + delta.getPath().toString() + " is a directory, " + "but it's a file!");
            return null;
        }
        deltaSize += sumDirSize(fs, delta.getPath());
    }
    if (baseSize == 0 && deltaSize > 0) {
        noBase = true;
    } else {
        String deltaPctProp = tblproperties.get(COMPACTORTHRESHOLD_PREFIX + HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_PCT_THRESHOLD);
        float deltaPctThreshold = deltaPctProp == null ? HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_PCT_THRESHOLD) : Float.parseFloat(deltaPctProp);
        boolean bigEnough = (float) deltaSize / (float) baseSize > deltaPctThreshold;
        if (LOG.isDebugEnabled()) {
            StringBuilder msg = new StringBuilder("delta size: ");
            msg.append(deltaSize);
            msg.append(" base size: ");
            msg.append(baseSize);
            msg.append(" threshold: ");
            msg.append(deltaPctThreshold);
            msg.append(" will major compact: ");
            msg.append(bigEnough);
            LOG.debug(msg.toString());
        }
        if (bigEnough)
            return CompactionType.MAJOR;
    }
    String deltaNumProp = tblproperties.get(COMPACTORTHRESHOLD_PREFIX + HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD);
    int deltaNumThreshold = deltaNumProp == null ? HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD) : Integer.parseInt(deltaNumProp);
    boolean enough = deltas.size() > deltaNumThreshold;
    if (enough) {
        LOG.debug("Found " + deltas.size() + " delta files, threshold is " + deltaNumThreshold + (enough ? "" : "not") + " and no base, requesting " + (noBase ? "major" : "minor") + " compaction");
        // If there's no base file, do a major compaction
        return noBase ? CompactionType.MAJOR : CompactionType.MINOR;
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) FileSystem(org.apache.hadoop.fs.FileSystem) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)
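
The decision above reduces to two thresholds: the delta-to-base size ratio (HIVE_COMPACTOR_DELTA_PCT_THRESHOLD) and the delta count (HIVE_COMPACTOR_DELTA_NUM_THRESHOLD). A condensed, pure-function sketch of that logic follows; it is illustrative only and skips the per-table property lookups and filesystem checks done above, with sizes and thresholds assumed to be precomputed by the caller.

import org.apache.hadoop.hive.metastore.api.CompactionType;

class CompactionChoiceSketch {

    // Same branch structure as determineCompactionType above.
    static CompactionType choose(long baseSize, long deltaSize, int numDeltas,
                                 float deltaPctThreshold, int deltaNumThreshold) {
        boolean noBase = (baseSize == 0 && deltaSize > 0);
        if (!noBase && (float) deltaSize / (float) baseSize > deltaPctThreshold) {
            return CompactionType.MAJOR;          // deltas are large relative to the base
        }
        if (numDeltas > deltaNumThreshold) {
            return noBase ? CompactionType.MAJOR  // no base yet, so a major compaction builds one
                          : CompactionType.MINOR; // many small deltas: merge them
        }
        return null;                              // neither threshold crossed
    }
}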

Aggregations

HdfsFileStatusWithId (org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId): 22
Path (org.apache.hadoop.fs.Path): 12
FileStatus (org.apache.hadoop.fs.FileStatus): 10
ArrayList (java.util.ArrayList): 8
FileSystem (org.apache.hadoop.fs.FileSystem): 6
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 5
IOException (java.io.IOException): 4
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList): 4
AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils): 4
Configuration (org.apache.hadoop.conf.Configuration): 3
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 3
JobConf (org.apache.hadoop.mapred.JobConf): 3
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 2
ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList): 2
MockFile (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFile): 2
MockFileSystem (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFileSystem): 2
MockPath (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockPath): 2
Test (org.junit.Test): 2
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 1
Preconditions (com.google.common.base.Preconditions): 1