use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.
the class AcidUtils method findOriginals.
/**
* Find the original files (non-ACID layout) recursively under the partition directory.
* @param fs the file system
* @param stat the directory to add
* @param original the list of original files
* @throws IOException
*/
private static void findOriginals(FileSystem fs, FileStatus stat, List<HdfsFileStatusWithId> original,
    Ref<Boolean> useFileIds) throws IOException {
  assert stat.isDir();
  List<HdfsFileStatusWithId> childrenWithId = null;
  Boolean val = useFileIds.value;
  if (val == null || val) {
    try {
      childrenWithId = SHIMS.listLocatedHdfsStatus(fs, stat.getPath(), hiddenFileFilter);
      if (val == null) {
        useFileIds.value = true;
      }
    } catch (Throwable t) {
      LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
      if (val == null && t instanceof UnsupportedOperationException) {
        useFileIds.value = false;
      }
    }
  }
  if (childrenWithId != null) {
    for (HdfsFileStatusWithId child : childrenWithId) {
      if (child.getFileStatus().isDir()) {
        findOriginals(fs, child.getFileStatus(), original, useFileIds);
      } else {
        original.add(child);
      }
    }
  } else {
    List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, stat.getPath(), hiddenFileFilter);
    for (FileStatus child : children) {
      if (child.isDir()) {
        findOriginals(fs, child, original, useFileIds);
      } else {
        original.add(createOriginalObj(null, child));
      }
    }
  }
}
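The Ref<Boolean> useFileIds argument acts as a tri-state flag shared across the recursive calls: null means the fileId-aware listing has not been probed yet, true means the shim call succeeded, and false means it threw UnsupportedOperationException and the regular listing API should be used from then on. Below is a minimal, self-contained sketch of that probe-and-cache pattern in plain Java; it is not Hive code, and TriStateProbe, fastList and slowList are hypothetical names introduced only for illustration.

import java.util.concurrent.atomic.AtomicReference;

public class TriStateProbe {
  // Simplified stand-in for the shared Ref<Boolean> useFileIds flag:
  // null = not probed yet, TRUE = fast path works, FALSE = permanently fall back.
  static final AtomicReference<Boolean> useFastPath = new AtomicReference<>(null);

  static String list(String dir) {
    Boolean val = useFastPath.get();
    if (val == null || val) {
      try {
        String result = fastList(dir);   // may throw if the API is unsupported
        if (val == null) {
          useFastPath.set(true);         // remember that the fast path works
        }
        return result;
      } catch (UnsupportedOperationException e) {
        if (val == null) {
          useFastPath.set(false);        // never try the fast path again
        }
      }
    }
    return slowList(dir);                // regular API fallback
  }

  static String fastList(String dir) {
    throw new UnsupportedOperationException("fileId listing not supported");
  }

  static String slowList(String dir) {
    return "listed " + dir + " via regular API";
  }

  public static void main(String[] args) {
    System.out.println(list("/warehouse/t/p=1"));  // first call probes the fast path and falls back
    System.out.println(list("/warehouse/t/p=1"));  // later calls skip the probe entirely
  }
}

Because the flag is shared, the failed probe is paid at most once rather than on every recursive call into a subdirectory.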
use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.
the class AcidUtils method findOriginals.
/**
* Find the original files (non-ACID layout) recursively under the partition directory.
* @param fs the file system
* @param stat the directory to add
* @param original the list of original files
* @throws IOException
*/
private static void findOriginals(FileSystem fs, FileStatus stat, List<HdfsFileStatusWithId> original,
    Ref<Boolean> useFileIds, boolean ignoreEmptyFiles) throws IOException {
  assert stat.isDir();
  List<HdfsFileStatusWithId> childrenWithId = null;
  Boolean val = useFileIds.value;
  if (val == null || val) {
    try {
      childrenWithId = SHIMS.listLocatedHdfsStatus(fs, stat.getPath(), hiddenFileFilter);
      if (val == null) {
        useFileIds.value = true;
      }
    } catch (Throwable t) {
      LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
      if (val == null && t instanceof UnsupportedOperationException) {
        useFileIds.value = false;
      }
    }
  }
  if (childrenWithId != null) {
    for (HdfsFileStatusWithId child : childrenWithId) {
      if (child.getFileStatus().isDir()) {
        findOriginals(fs, child.getFileStatus(), original, useFileIds, ignoreEmptyFiles);
      } else {
        if (!ignoreEmptyFiles || child.getFileStatus().getLen() > 0) {
          original.add(child);
        }
      }
    }
  } else {
    List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, stat.getPath(), hiddenFileFilter);
    for (FileStatus child : children) {
      if (child.isDir()) {
        findOriginals(fs, child, original, useFileIds, ignoreEmptyFiles);
      } else {
        if (!ignoreEmptyFiles || child.getLen() > 0) {
          original.add(createOriginalObj(null, child));
        }
      }
    }
  }
}
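This overload differs from the previous one only in the ignoreEmptyFiles flag: when it is true, zero-length files are dropped with the predicate !ignoreEmptyFiles || len > 0. A small stand-alone sketch of that predicate follows; FileEntry is a hypothetical stand-in for FileStatus (not a Hive type), and Java 16+ is assumed for records.

import java.util.List;
import java.util.stream.Collectors;

public class EmptyFileFilterSketch {
  // Hypothetical stand-in for FileStatus: just a path and a length in bytes.
  record FileEntry(String path, long len) {}

  static List<FileEntry> filter(List<FileEntry> children, boolean ignoreEmptyFiles) {
    return children.stream()
        .filter(f -> !ignoreEmptyFiles || f.len() > 0)  // same predicate as in the method above
        .collect(Collectors.toList());
  }

  public static void main(String[] args) {
    List<FileEntry> files = List.of(new FileEntry("000000_0", 1024), new FileEntry("000001_0", 0));
    System.out.println(filter(files, true));   // keeps only the non-empty 000000_0
    System.out.println(filter(files, false));  // keeps both files
  }
}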
use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.
the class OrcInputFormat method determineSplitStrategies.
@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context,
    FileSystem fs, Path dir, List<AcidBaseFileInfo> baseFiles, List<ParsedDelta> parsedDeltas,
    List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) throws IOException {
  List<SplitStrategy<?>> splitStrategies = new ArrayList<SplitStrategy<?>>();
  SplitStrategy<?> splitStrategy;
  boolean checkDefaultFs = HiveConf.getBoolVar(context.conf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID);
  boolean isDefaultFs = (!checkDefaultFs) || ((fs instanceof DistributedFileSystem)
      && HdfsUtils.isDefaultFs((DistributedFileSystem) fs));
  if (baseFiles.isEmpty()) {
    assert false : "acid 2.0 no base?!: " + dir;
    splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, Collections.emptyList(), false,
        parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
    if (splitStrategy != null) {
      splitStrategies.add(splitStrategy);
    }
    return splitStrategies;
  }
  List<HdfsFileStatusWithId> acidSchemaFiles = new ArrayList<>();
  List<HdfsFileStatusWithId> originalSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
  // Separate the base files into acid schema and non-acid(original) schema files.
  for (AcidBaseFileInfo acidBaseFileInfo : baseFiles) {
    if (acidBaseFileInfo.isOriginal()) {
      originalSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
    } else {
      assert acidBaseFileInfo.isAcidSchema();
      acidSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
    }
  }
  // Generate split strategy for non-acid schema original files, if any.
  if (!originalSchemaFiles.isEmpty()) {
    splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, originalSchemaFiles, true,
        parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
    if (splitStrategy != null) {
      splitStrategies.add(splitStrategy);
    }
  }
  // Generate split strategy for acid schema files, if any.
  if (!acidSchemaFiles.isEmpty()) {
    splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, acidSchemaFiles, false,
        parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
    if (splitStrategy != null) {
      splitStrategies.add(splitStrategy);
    }
  }
  return splitStrategies;
}
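Before choosing strategies, the method splits the base files into two buckets, pre-ACID "original" schema files and ACID schema files, and each non-empty bucket then gets its own determineSplitStrategy call. The sketch below shows the same bucketing with the JDK's partitioningBy collector; FileRecord is a hypothetical stand-in for AcidBaseFileInfo, and Java 16+ is assumed for records.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class SplitBucketsSketch {
  // Hypothetical stand-in for AcidBaseFileInfo: a path plus an "is original schema" flag.
  record FileRecord(String path, boolean original) {}

  public static void main(String[] args) {
    List<FileRecord> baseFiles = List.of(
        new FileRecord("/warehouse/t/000000_0", true),              // pre-ACID original file
        new FileRecord("/warehouse/t/base_5/bucket_00000", false)); // ACID schema base file

    Map<Boolean, List<FileRecord>> buckets =
        baseFiles.stream().collect(Collectors.partitioningBy(FileRecord::original));

    List<FileRecord> originalSchemaFiles = buckets.get(true);
    List<FileRecord> acidSchemaFiles = buckets.get(false);

    // Each non-empty bucket would get its own split strategy, as in the method above.
    System.out.println("original schema: " + originalSchemaFiles);
    System.out.println("acid schema: " + acidSchemaFiles);
  }
}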
use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.
the class OrcInputFormat method determineSplitStrategies.
@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context,
    FileSystem fs, Path dir, List<FileInfo> baseFiles, List<ParsedDelta> parsedDeltas,
    List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) throws IOException {
  List<SplitStrategy<?>> splitStrategies = new ArrayList<SplitStrategy<?>>();
  SplitStrategy<?> splitStrategy;
  boolean checkDefaultFs = HiveConf.getBoolVar(context.conf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID);
  boolean forceSynthetic = !HiveConf.getBoolVar(context.conf, ConfVars.LLAP_IO_USE_FILEID_PATH);
  // if forceSynthetic == true, then assume it is not a defaultFS
  boolean isDefaultFs = (forceSynthetic == false) && ((!checkDefaultFs)
      || ((fs instanceof DistributedFileSystem) && HdfsUtils.isDefaultFs((DistributedFileSystem) fs)));
  if (baseFiles.isEmpty()) {
    assert false : "acid 2.0 no base?!: " + dir;
    splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, Collections.emptyList(), false,
        parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
    if (splitStrategy != null) {
      splitStrategies.add(splitStrategy);
    }
    return splitStrategies;
  }
  List<HdfsFileStatusWithId> acidSchemaFiles = new ArrayList<>();
  List<HdfsFileStatusWithId> originalSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
  // Separate the base files into acid schema and non-acid(original) schema files.
  for (FileInfo acidBaseFileInfo : baseFiles) {
    if (acidBaseFileInfo.isOriginal()) {
      originalSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
    } else {
      acidSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
    }
  }
  // Generate split strategy for non-acid schema original files, if any.
  if (!originalSchemaFiles.isEmpty()) {
    splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, originalSchemaFiles, true,
        parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
    if (splitStrategy != null) {
      splitStrategies.add(splitStrategy);
    }
  }
  // Generate split strategy for acid schema files, if any.
  if (!acidSchemaFiles.isEmpty()) {
    splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, acidSchemaFiles, false,
        parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
    if (splitStrategy != null) {
      splitStrategies.add(splitStrategy);
    }
  }
  return splitStrategies;
}
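This variant additionally reads LLAP_IO_USE_FILEID_PATH: when fileId-based paths are disabled, forceSynthetic is true and isDefaultFs is forced to false regardless of the filesystem check. Below is a dependency-free sketch of that boolean derivation, with isDefaultFs written as a pure function of the three inputs (the class name DefaultFsFlagSketch is made up for illustration).

public class DefaultFsFlagSketch {
  // Mirrors: isDefaultFs = !forceSynthetic && (!checkDefaultFs || fsIsDefaultDfs)
  static boolean isDefaultFs(boolean forceSynthetic, boolean checkDefaultFs, boolean fsIsDefaultDfs) {
    return !forceSynthetic && (!checkDefaultFs || fsIsDefaultDfs);
  }

  public static void main(String[] args) {
    System.out.println(isDefaultFs(true, false, true));   // false: synthetic file IDs are forced
    System.out.println(isDefaultFs(false, true, false));  // false: check requested, fs is not the default DFS
    System.out.println(isDefaultFs(false, false, false)); // true: no check requested, trust the fs
  }
}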
use of org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId in project hive by apache.
the class AcidUtils method getAcidState.
/**
* getAcidState implementation which uses the provided dirSnapshots.
* Generates new snapshots if needed and the provided map is null.
* @param fileSystem optional; if it is not provided, it will be derived from the candidateDirectory
* @param candidateDirectory the partition directory to analyze
* @param conf the configuration
* @param writeIdList the list of write ids that we are reading
* @param useFileIds will be set to true if the FileSystem supports listing with fileIds
* @param ignoreEmptyFiles ignore files with 0 length
* @param dirSnapshots the listed directory snapshots; if null, new ones will be generated
* @return the state of the directory
* @throws IOException on filesystem errors
*/
private static AcidDirectory getAcidState(FileSystem fileSystem, Path candidateDirectory, Configuration conf,
    ValidWriteIdList writeIdList, Ref<Boolean> useFileIds, boolean ignoreEmptyFiles,
    Map<Path, HdfsDirSnapshot> dirSnapshots) throws IOException {
  ValidTxnList validTxnList = getValidTxnList(conf);
  FileSystem fs = fileSystem == null ? candidateDirectory.getFileSystem(conf) : fileSystem;
  AcidDirectory directory = new AcidDirectory(candidateDirectory, fs, useFileIds);
  List<HdfsFileStatusWithId> childrenWithId =
      HdfsUtils.tryListLocatedHdfsStatus(useFileIds, fs, candidateDirectory, hiddenFileFilter);
  if (childrenWithId != null) {
    for (HdfsFileStatusWithId child : childrenWithId) {
      getChildState(directory, child, writeIdList, validTxnList, ignoreEmptyFiles);
    }
  } else {
    if (dirSnapshots == null) {
      dirSnapshots = getHdfsDirSnapshots(fs, candidateDirectory);
    }
    getChildState(directory, dirSnapshots, writeIdList, validTxnList, ignoreEmptyFiles);
  }
  // If we have a base, the original files are obsolete.
  if (directory.getBase() != null) {
    // Add original files to obsolete list if any
    for (HdfsFileStatusWithId fswid : directory.getOriginalFiles()) {
      directory.getObsolete().add(fswid.getFileStatus().getPath());
    }
    // Add original directories to obsolete list if any
    directory.getObsolete().addAll(directory.getOriginalDirectories());
    // remove the entries so we don't get confused later and think we should
    // use them.
    directory.getOriginalFiles().clear();
    directory.getOriginalDirectories().clear();
  } else {
    // If childrenWithId != null, we would have already populated "original"
    if (childrenWithId != null) {
      for (Path origDir : directory.getOriginalDirectories()) {
        directory.getOriginalFiles().addAll(HdfsUtils.listFileStatusWithId(fs, origDir, useFileIds, true, null));
      }
    }
  }
  // Filter out all delta directories that are shadowed by others
  findBestWorkingDeltas(writeIdList, directory);
  if (directory.getOldestBase() != null && directory.getBase() == null
      && isCompactedBase(directory.getOldestBase(), fs, dirSnapshots)) {
    /*
     * If here, it means there was a base_x (> 1 perhaps) but none were suitable for given
     * {@link writeIdList}. Note that 'original' files are logically a base_Long.MIN_VALUE and thus
     * cannot have any data for an open txn. We could check {@link deltas} has files to cover
     * [1,n] w/o gaps but this would almost never happen...
     *
     * We only throw for base_x produced by Compactor since that base erases all history and
     * cannot be used for a client that has a snapshot in which something inside this base is
     * open. (Nor can we ignore this base of course) But base_x which is a result of IOW,
     * contains all history so we treat it just like delta wrt visibility. Imagine, IOW which
     * aborts. It creates a base_x, which can and should just be ignored.
     */
    long[] exceptions = writeIdList.getInvalidWriteIds();
    String minOpenWriteId = exceptions != null && exceptions.length > 0 ? Long.toString(exceptions[0]) : "x";
    throw new IOException(ErrorMsg.ACID_NOT_ENOUGH_HISTORY.format(Long.toString(writeIdList.getHighWatermark()),
        minOpenWriteId, directory.getOldestBase().toString()));
  }
  Path basePath = directory.getBaseDirectory();
  if (basePath != null) {
    boolean isBaseInRawFormat = MetaDataFile.isRawFormat(basePath, fs,
        dirSnapshots != null ? dirSnapshots.get(basePath) : null);
    directory.getBase().setRawFormat(isBaseInRawFormat);
  }
  LOG.debug("in directory " + candidateDirectory.toUri().toString() + " base = " + basePath
      + " deltas = " + directory.getCurrentDirectories().size());
  /*
   * If this sort order is changed and there are tables that have been converted to transactional
   * and have had any update/delete/merge operations performed but not yet MAJOR compacted, it
   * may result in data loss since it may change how
   * {@link org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.OriginalReaderPair} assigns
   * {@link RecordIdentifier#rowId} for read (that have happened) and compaction (yet to happen).
   */
  // this does "Path.uri.compareTo(that.uri)"
  directory.getOriginalFiles().sort(Comparator.comparing(HdfsFileStatusWithId::getFileStatus));
  return directory;
}
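Once a usable base is found, everything written in the pre-ACID layout is already covered by that base, so getAcidState moves the original files and directories to the obsolete list and clears them from the directory state. A minimal sketch of just that rule, using plain strings instead of Hadoop paths (not Hive code; the names are illustrative):

import java.util.ArrayList;
import java.util.List;

public class BaseObsoletesOriginalsSketch {
  public static void main(String[] args) {
    String base = "base_5";  // assume a usable base directory was selected
    List<String> originalFiles = new ArrayList<>(List.of("000000_0", "000001_0"));
    List<String> obsolete = new ArrayList<>();

    if (base != null) {
      obsolete.addAll(originalFiles);  // the base already contains this data
      originalFiles.clear();           // make sure nothing reads the originals later
    }
    System.out.println("obsolete = " + obsolete + ", originals = " + originalFiles);
  }
}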