Search in sources :

Example 1 with FileInfo

Use of org.apache.hadoop.hive.ql.io.AcidUtils.FileInfo in the Apache Hive project.

The method determineSplitStrategies of the class OrcInputFormat.

/**
 * Builds the list of split strategies for the files found under {@code dir}.
 *
 * <p>The base files are partitioned by {@code FileInfo.isOriginal()} into
 * original-schema (non-acid) files and acid-schema files. For each non-empty
 * group, {@code determineSplitStrategy} is invoked once (original-schema group
 * first, then the acid-schema group) and every non-null result is appended to
 * the returned list.
 *
 * <p>When {@code baseFiles} is empty the method asserts (so it fails if
 * assertions are enabled); otherwise a single strategy is requested with an
 * empty file list and returned if non-null.
 *
 * <p>The {@code isDefaultFs} value handed to {@code determineSplitStrategy} is
 * {@code false} whenever {@code LLAP_IO_USE_FILEID_PATH} is disabled, {@code true}
 * when {@code LLAP_CACHE_DEFAULT_FS_FILE_ID} is disabled, and otherwise the result
 * of {@code HdfsUtils.isDefaultFs} for a {@code DistributedFileSystem} (and
 * {@code false} for any other file system type).
 *
 * @param combinedCtx           passed through to {@code determineSplitStrategy}
 * @param context               supplies the configuration ({@code context.conf})
 * @param fs                    file system the files live on
 * @param dir                   directory being processed
 * @param baseFiles             base files to partition by schema kind
 * @param parsedDeltas          passed through to {@code determineSplitStrategy}
 * @param readerTypes           passed through to {@code determineSplitStrategy}
 * @param ugi                   passed through to {@code determineSplitStrategy}
 * @param allowSyntheticFileIds passed through to {@code determineSplitStrategy}
 * @return the non-null strategies produced, in the order they were computed; possibly empty
 * @throws IOException propagated from the underlying calls
 */
@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context, FileSystem fs, Path dir, List<FileInfo> baseFiles, List<ParsedDelta> parsedDeltas, List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) throws IOException {
    final List<SplitStrategy<?>> strategies = new ArrayList<>();

    final boolean checkDefaultFs = HiveConf.getBoolVar(context.conf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID);
    final boolean forceSynthetic = !HiveConf.getBoolVar(context.conf, ConfVars.LLAP_IO_USE_FILEID_PATH);

    final boolean isDefaultFs;
    if (forceSynthetic) {
        // Synthetic file ids are forced, so never treat the file system as the default FS.
        isDefaultFs = false;
    } else if (!checkDefaultFs) {
        // No default-FS check requested.
        isDefaultFs = true;
    } else {
        isDefaultFs = fs instanceof DistributedFileSystem && HdfsUtils.isDefaultFs((DistributedFileSystem) fs);
    }

    if (baseFiles.isEmpty()) {
        assert false : "acid 2.0 no base?!: " + dir;
        // Reached only when assertions are disabled: ask for a strategy over an empty file list.
        final SplitStrategy<?> emptyBaseStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, Collections.emptyList(), false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (emptyBaseStrategy != null) {
            strategies.add(emptyBaseStrategy);
        }
        return strategies;
    }

    // Split the base files into original-schema (non-acid) and acid-schema groups.
    final List<HdfsFileStatusWithId> originalSchemaFiles = new ArrayList<>();
    final List<HdfsFileStatusWithId> acidSchemaFiles = new ArrayList<>();
    for (FileInfo info : baseFiles) {
        (info.isOriginal() ? originalSchemaFiles : acidSchemaFiles).add(info.getHdfsFileStatusWithId());
    }

    // Original-schema files are handled first, if there are any.
    if (!originalSchemaFiles.isEmpty()) {
        final SplitStrategy<?> originalStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, originalSchemaFiles, true, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (originalStrategy != null) {
            strategies.add(originalStrategy);
        }
    }

    // Then the acid-schema files, if there are any.
    if (!acidSchemaFiles.isEmpty()) {
        final SplitStrategy<?> acidStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, acidSchemaFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (acidStrategy != null) {
            strategies.add(acidStrategy);
        }
    }

    return strategies;
}
Also used : FileInfo(org.apache.hadoop.hive.ql.io.AcidUtils.FileInfo) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1, ArrayList (java.util.ArrayList): 1, DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 1, FileInfo (org.apache.hadoop.hive.ql.io.AcidUtils.FileInfo): 1, HdfsFileStatusWithId (org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId): 1