
Example 16 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class AcidUtils method findOriginals.

/**
 * Find the original files (non-ACID layout) recursively under the partition directory.
 * @param fs the file system
 * @param stat the directory to scan for original files
 * @param original the list of original files
 * @throws IOException
 */
private static void findOriginals(FileSystem fs, FileStatus stat, List<HdfsFileStatusWithId> original, Ref<Boolean> useFileIds) throws IOException {
    assert stat.isDir();
    List<HdfsFileStatusWithId> childrenWithId = null;
    Boolean val = useFileIds.value;
    if (val == null || val) {
        try {
            childrenWithId = SHIMS.listLocatedHdfsStatus(fs, stat.getPath(), hiddenFileFilter);
            if (val == null) {
                useFileIds.value = true;
            }
        } catch (Throwable t) {
            LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
            if (val == null && t instanceof UnsupportedOperationException) {
                useFileIds.value = false;
            }
        }
    }
    if (childrenWithId != null) {
        for (HdfsFileStatusWithId child : childrenWithId) {
            if (child.getFileStatus().isDir()) {
                findOriginals(fs, child.getFileStatus(), original, useFileIds);
            } else {
                original.add(child);
            }
        }
    } else {
        List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, stat.getPath(), hiddenFileFilter);
        for (FileStatus child : children) {
            if (child.isDir()) {
                findOriginals(fs, child, original, useFileIds);
            } else {
                original.add(createOriginalObj(null, child));
            }
        }
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId)
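
The Ref&lt;Boolean&gt; useFileIds argument above acts as a tri-state capability cache: null means the file-ID listing API has not been probed yet, true means it worked and should keep being used, and false means it is unsupported and should be skipped. Below is a minimal, self-contained sketch of that pattern (not Hive code): the Ref class is a simplified stand-in for org.apache.hive.common.util.Ref, and listWithIds/listPlain are hypothetical placeholders for the shim and fallback listing calls.

import java.util.Arrays;
import java.util.List;

// Simplified stand-in for org.apache.hive.common.util.Ref: a tiny mutable holder.
class Ref<T> {
    T value;
    Ref(T value) { this.value = value; }
}

public class FileIdCapabilityDemo {

    // Hypothetical "fast" listing that a given FileSystem may not support.
    static List<String> listWithIds(String dir) {
        throw new UnsupportedOperationException("file IDs not supported here");
    }

    // Plain listing that always works (placeholder data).
    static List<String> listPlain(String dir) {
        return Arrays.asList(dir + "/000000_0", dir + "/000001_0");
    }

    // Mirrors the tri-state logic in findOriginals: probe once while the value is
    // null, remember the outcome, and fall back to the plain listing on failure.
    static List<String> list(String dir, Ref<Boolean> useFileIds) {
        Boolean val = useFileIds.value;
        if (val == null || val) {
            try {
                List<String> result = listWithIds(dir);
                if (val == null) {
                    useFileIds.value = true;   // fast path works; keep using it
                }
                return result;
            } catch (UnsupportedOperationException e) {
                if (val == null) {
                    useFileIds.value = false;  // fast path unsupported; never retry
                }
            }
        }
        return listPlain(dir);
    }

    public static void main(String[] args) {
        Ref<Boolean> useFileIds = new Ref<>(null);
        System.out.println(list("/warehouse/t/p=1", useFileIds)); // probes, then falls back
        System.out.println("useFileIds is now: " + useFileIds.value); // false
    }
}

Because the learned value lives in the caller-supplied Ref, the probe happens at most once per directory walk even though findOriginals recurses into subdirectories.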

Example 17 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class AcidUtils method findOriginals.

/**
 * Find the original files (non-ACID layout) recursively under the partition directory.
 * @param fs the file system
 * @param stat the directory to scan for original files
 * @param original the list of original files
 * @throws IOException
 */
private static void findOriginals(FileSystem fs, FileStatus stat, List<HdfsFileStatusWithId> original, Ref<Boolean> useFileIds, boolean ignoreEmptyFiles) throws IOException {
    assert stat.isDir();
    List<HdfsFileStatusWithId> childrenWithId = null;
    Boolean val = useFileIds.value;
    if (val == null || val) {
        try {
            childrenWithId = SHIMS.listLocatedHdfsStatus(fs, stat.getPath(), hiddenFileFilter);
            if (val == null) {
                useFileIds.value = true;
            }
        } catch (Throwable t) {
            LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
            if (val == null && t instanceof UnsupportedOperationException) {
                useFileIds.value = false;
            }
        }
    }
    if (childrenWithId != null) {
        for (HdfsFileStatusWithId child : childrenWithId) {
            if (child.getFileStatus().isDir()) {
                findOriginals(fs, child.getFileStatus(), original, useFileIds, ignoreEmptyFiles);
            } else {
                if (!ignoreEmptyFiles || child.getFileStatus().getLen() > 0) {
                    original.add(child);
                }
            }
        }
    } else {
        List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, stat.getPath(), hiddenFileFilter);
        for (FileStatus child : children) {
            if (child.isDir()) {
                findOriginals(fs, child, original, useFileIds, ignoreEmptyFiles);
            } else {
                if (!ignoreEmptyFiles || child.getLen() > 0) {
                    original.add(createOriginalObj(null, child));
                }
            }
        }
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId)

Example 18 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class OrcInputFormat method determineSplitStrategies.

@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context, FileSystem fs, Path dir, List<AcidBaseFileInfo> baseFiles, List<ParsedDelta> parsedDeltas, List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) throws IOException {
    List<SplitStrategy<?>> splitStrategies = new ArrayList<SplitStrategy<?>>();
    SplitStrategy<?> splitStrategy;
    boolean checkDefaultFs = HiveConf.getBoolVar(context.conf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID);
    boolean isDefaultFs = (!checkDefaultFs) || ((fs instanceof DistributedFileSystem) && HdfsUtils.isDefaultFs((DistributedFileSystem) fs));
    if (baseFiles.isEmpty()) {
        assert false : "acid 2.0 no base?!: " + dir;
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, Collections.emptyList(), false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
        return splitStrategies;
    }
    List<HdfsFileStatusWithId> acidSchemaFiles = new ArrayList<>();
    List<HdfsFileStatusWithId> originalSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
    // Separate the base files into acid schema and non-acid(original) schema files.
    for (AcidBaseFileInfo acidBaseFileInfo : baseFiles) {
        if (acidBaseFileInfo.isOriginal()) {
            originalSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
        } else {
            assert acidBaseFileInfo.isAcidSchema();
            acidSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
        }
    }
    // Generate split strategy for non-acid schema original files, if any.
    if (!originalSchemaFiles.isEmpty()) {
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, originalSchemaFiles, true, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
    }
    // Generate split strategy for acid schema files, if any.
    if (!acidSchemaFiles.isEmpty()) {
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, acidSchemaFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
    }
    return splitStrategies;
}
Also used : AcidBaseFileInfo(org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
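
The loop above routes each base file into either the original-schema or the acid-schema group and then builds one split strategy per non-empty group. The same partitioning step can be sketched in isolation with a stream collector; BaseFileInfo below is a hypothetical stand-in for AcidUtils.AcidBaseFileInfo, reduced to the one property the loop inspects.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Hypothetical stand-in for AcidUtils.AcidBaseFileInfo.
class BaseFileInfo {
    final String path;
    final boolean original;
    BaseFileInfo(String path, boolean original) { this.path = path; this.original = original; }
    boolean isOriginal() { return original; }
    @Override
    public String toString() { return path; }
}

public class BaseFilePartitionDemo {
    public static void main(String[] args) {
        List<BaseFileInfo> baseFiles = Arrays.asList(
            new BaseFileInfo("base_0000005/bucket_00000", false),
            new BaseFileInfo("000000_0", true),
            new BaseFileInfo("000001_0", true));
        // Same split as the loop in determineSplitStrategies: originals in one
        // bucket, acid-schema files in the other; a split strategy is then built
        // for each non-empty bucket.
        Map<Boolean, List<BaseFileInfo>> byOriginal = baseFiles.stream()
            .collect(Collectors.partitioningBy(BaseFileInfo::isOriginal));
        System.out.println("original schema: " + byOriginal.get(true));
        System.out.println("acid schema:     " + byOriginal.get(false));
    }
}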

Example 19 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class OrcInputFormat method determineSplitStrategies.

@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context, FileSystem fs, Path dir, List<FileInfo> baseFiles, List<ParsedDelta> parsedDeltas, List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) throws IOException {
    List<SplitStrategy<?>> splitStrategies = new ArrayList<SplitStrategy<?>>();
    SplitStrategy<?> splitStrategy;
    boolean checkDefaultFs = HiveConf.getBoolVar(context.conf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID);
    boolean forceSynthetic = !HiveConf.getBoolVar(context.conf, ConfVars.LLAP_IO_USE_FILEID_PATH);
    // if forceSynthetic == true, then assume it is not a defaultFS
    boolean isDefaultFs = (forceSynthetic == false) && ((!checkDefaultFs) || ((fs instanceof DistributedFileSystem) && HdfsUtils.isDefaultFs((DistributedFileSystem) fs)));
    if (baseFiles.isEmpty()) {
        assert false : "acid 2.0 no base?!: " + dir;
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, Collections.emptyList(), false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
        return splitStrategies;
    }
    List<HdfsFileStatusWithId> acidSchemaFiles = new ArrayList<>();
    List<HdfsFileStatusWithId> originalSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
    // Separate the base files into acid schema and non-acid(original) schema files.
    for (FileInfo acidBaseFileInfo : baseFiles) {
        if (acidBaseFileInfo.isOriginal()) {
            originalSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
        } else {
            acidSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
        }
    }
    // Generate split strategy for non-acid schema original files, if any.
    if (!originalSchemaFiles.isEmpty()) {
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, originalSchemaFiles, true, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
    }
    // Generate split strategy for acid schema files, if any.
    if (!acidSchemaFiles.isEmpty()) {
        splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, acidSchemaFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (splitStrategy != null) {
            splitStrategies.add(splitStrategy);
        }
    }
    return splitStrategies;
}
Also used : FileInfo(org.apache.hadoop.hive.ql.io.AcidUtils.FileInfo) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
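
Compared with Example 18, the isDefaultFs flag here folds in one extra condition: when LLAP_IO_USE_FILEID_PATH is disabled, synthetic file IDs are forced and the filesystem is treated as non-default regardless of the other checks. A small sketch of that decision, with local boolean parameters standing in for the HiveConf lookups and the instanceof/HdfsUtils checks (names are local stand-ins, not Hive APIs):

// Minimal sketch of the isDefaultFs decision in Example 19.
public class DefaultFsDecisionDemo {

    static boolean isDefaultFs(boolean forceSynthetic, boolean checkDefaultFs,
                               boolean isDistributedFs, boolean isConfiguredDefaultFs) {
        if (forceSynthetic) {
            // LLAP_IO_USE_FILEID_PATH == false: assume it is not the default FS
            return false;
        }
        if (!checkDefaultFs) {
            // LLAP_CACHE_DEFAULT_FS_FILE_ID == false: skip the check entirely
            return true;
        }
        // fs instanceof DistributedFileSystem && HdfsUtils.isDefaultFs(fs)
        return isDistributedFs && isConfiguredDefaultFs;
    }

    public static void main(String[] args) {
        System.out.println(isDefaultFs(true,  true,  true,  true));  // false
        System.out.println(isDefaultFs(false, false, false, false)); // true
        System.out.println(isDefaultFs(false, true,  true,  false)); // false
    }
}

Plugging in the two HiveConf flags and the filesystem checks reproduces the single boolean expression in the method body.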

Example 20 with HdfsFileStatusWithId

use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.

the class AcidUtils method getAcidState.

/**
 * getAcidState implementation which uses the provided dirSnapshots.
 * Generates new snapshots if needed and the provided ones are null.
 * @param fileSystem optional; if it is not provided, it will be derived from the candidateDirectory
 * @param candidateDirectory the partition directory to analyze
 * @param conf the configuration
 * @param writeIdList the list of write ids that we are reading
 * @param useFileIds will be set to true if the FileSystem supports listing with file ids
 * @param ignoreEmptyFiles ignore files with 0 length
 * @param dirSnapshots the listed directory snapshots; if null, new ones will be generated
 * @return the state of the directory
 * @throws IOException on filesystem errors
 */
private static AcidDirectory getAcidState(FileSystem fileSystem, Path candidateDirectory, Configuration conf, ValidWriteIdList writeIdList, Ref<Boolean> useFileIds, boolean ignoreEmptyFiles, Map<Path, HdfsDirSnapshot> dirSnapshots) throws IOException {
    ValidTxnList validTxnList = getValidTxnList(conf);
    FileSystem fs = fileSystem == null ? candidateDirectory.getFileSystem(conf) : fileSystem;
    AcidDirectory directory = new AcidDirectory(candidateDirectory, fs, useFileIds);
    List<HdfsFileStatusWithId> childrenWithId = HdfsUtils.tryListLocatedHdfsStatus(useFileIds, fs, candidateDirectory, hiddenFileFilter);
    if (childrenWithId != null) {
        for (HdfsFileStatusWithId child : childrenWithId) {
            getChildState(directory, child, writeIdList, validTxnList, ignoreEmptyFiles);
        }
    } else {
        if (dirSnapshots == null) {
            dirSnapshots = getHdfsDirSnapshots(fs, candidateDirectory);
        }
        getChildState(directory, dirSnapshots, writeIdList, validTxnList, ignoreEmptyFiles);
    }
    // If we have a base, the original files are obsolete.
    if (directory.getBase() != null) {
        // Add original files to obsolete list if any
        for (HdfsFileStatusWithId fswid : directory.getOriginalFiles()) {
            directory.getObsolete().add(fswid.getFileStatus().getPath());
        }
        // Add original directories to obsolete list if any
        directory.getObsolete().addAll(directory.getOriginalDirectories());
        // remove the entries so we don't get confused later and think we should
        // use them.
        directory.getOriginalFiles().clear();
        directory.getOriginalDirectories().clear();
    } else {
        // If childrenWithId != null, we would have already populated "original"
        if (childrenWithId != null) {
            for (Path origDir : directory.getOriginalDirectories()) {
                directory.getOriginalFiles().addAll(HdfsUtils.listFileStatusWithId(fs, origDir, useFileIds, true, null));
            }
        }
    }
    // Filter out all delta directories that are shadowed by others
    findBestWorkingDeltas(writeIdList, directory);
    if (directory.getOldestBase() != null && directory.getBase() == null && isCompactedBase(directory.getOldestBase(), fs, dirSnapshots)) {
        /*
       * If here, it means there was a base_x (> 1 perhaps) but none were suitable for given
       * {@link writeIdList}.  Note that 'original' files are logically a base_Long.MIN_VALUE and thus
       * cannot have any data for an open txn.  We could check {@link deltas} has files to cover
       * [1,n] w/o gaps but this would almost never happen...
       *
       * We only throw for base_x produced by Compactor since that base erases all history and
       * cannot be used for a client that has a snapshot in which something inside this base is
       * open.  (Nor can we ignore this base of course)  But base_x which is a result of IOW,
       * contains all history so we treat it just like delta wrt visibility.  Imagine, IOW which
       * aborts. It creates a base_x, which can and should just be ignored.*/
        long[] exceptions = writeIdList.getInvalidWriteIds();
        String minOpenWriteId = exceptions != null && exceptions.length > 0 ? Long.toString(exceptions[0]) : "x";
        throw new IOException(ErrorMsg.ACID_NOT_ENOUGH_HISTORY.format(Long.toString(writeIdList.getHighWatermark()), minOpenWriteId, directory.getOldestBase().toString()));
    }
    Path basePath = directory.getBaseDirectory();
    if (basePath != null) {
        boolean isBaseInRawFormat = MetaDataFile.isRawFormat(basePath, fs, dirSnapshots != null ? dirSnapshots.get(basePath) : null);
        directory.getBase().setRawFormat(isBaseInRawFormat);
    }
    LOG.debug("in directory " + candidateDirectory.toUri().toString() + " base = " + basePath + " deltas = " + directory.getCurrentDirectories().size());
    /*
     * If this sort order is changed and there are tables that have been converted to transactional
     * and have had any update/delete/merge operations performed but not yet MAJOR compacted, it
     * may result in data loss since it may change how
     * {@link org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.OriginalReaderPair} assigns
     * {@link RecordIdentifier#rowId} for read (that have happened) and compaction (yet to happen).
     */
    // this does "Path.uri.compareTo(that.uri)"
    directory.getOriginalFiles().sort(Comparator.comparing(HdfsFileStatusWithId::getFileStatus));
    return directory;
}
Also used : Path(org.apache.hadoop.fs.Path) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ValidTxnList(org.apache.hadoop.hive.common.ValidTxnList) FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException)
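
The final sort keeps original files in a deterministic order; as the surrounding comment warns, changing that order could shift the synthetic rowIds assigned to rows in pre-ACID files. Below is a minimal sketch of the same Comparator.comparing shape, with WithId and StatusLike as hypothetical stand-ins for HdfsFileStatusWithId and FileStatus (which compares by path URI); it is an illustration, not Hadoop or Hive code.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

// Stand-in for FileStatus: comparable by its path URI.
class StatusLike implements Comparable<StatusLike> {
    final String pathUri;
    StatusLike(String pathUri) { this.pathUri = pathUri; }
    @Override
    public int compareTo(StatusLike other) { return pathUri.compareTo(other.pathUri); }
    @Override
    public String toString() { return pathUri; }
}

// Stand-in for HdfsFileStatusWithId: wraps a status and exposes it.
class WithId {
    final StatusLike status;
    WithId(StatusLike status) { this.status = status; }
    StatusLike getFileStatus() { return status; }
}

public class OriginalFileOrderDemo {
    public static void main(String[] args) {
        List<WithId> originals = new ArrayList<>();
        originals.add(new WithId(new StatusLike("hdfs://nn/warehouse/t/000001_0")));
        originals.add(new WithId(new StatusLike("hdfs://nn/warehouse/t/000000_0")));
        // Same shape as the last line of getAcidState: sort by the wrapped
        // status, which in turn compares path URIs.
        originals.sort(Comparator.comparing(WithId::getFileStatus));
        originals.forEach(o -> System.out.println(o.getFileStatus()));
    }
}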

Aggregations

HdfsFileStatusWithId (org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId): 22
Path (org.apache.hadoop.fs.Path): 12
FileStatus (org.apache.hadoop.fs.FileStatus): 10
ArrayList (java.util.ArrayList): 8
FileSystem (org.apache.hadoop.fs.FileSystem): 6
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 5
IOException (java.io.IOException): 4
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList): 4
AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils): 4
Configuration (org.apache.hadoop.conf.Configuration): 3
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 3
JobConf (org.apache.hadoop.mapred.JobConf): 3
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 2
ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList): 2
MockFile (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFile): 2
MockFileSystem (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFileSystem): 2
MockPath (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockPath): 2
Test (org.junit.Test): 2
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 1
Preconditions (com.google.common.base.Preconditions): 1