Search in sources :

Example 1 with AcidBaseFileInfo

use of org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo in project hive by apache.

In the class OrcInputFormat, the method determineSplitStrategies.

/**
 * Collects the split strategies for {@code dir}.
 *
 * <p>The entries of {@code baseFiles} are partitioned into original-schema files
 * ({@code isOriginal()} is true) and ACID-schema files (everything else), and
 * {@code determineSplitStrategy} is asked for one strategy per non-empty group, originals
 * first. When {@code baseFiles} is empty, a single request is made with an empty file list
 * and the "original" flag set to {@code false}. A {@code null} strategy is never added.
 *
 * <p>All other arguments are forwarded unchanged to {@code determineSplitStrategy}.
 *
 * @return the strategies obtained, in the order described above; possibly empty, never null
 */
@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context, FileSystem fs, Path dir, AcidUtils.Directory dirInfo, List<AcidBaseFileInfo> baseFiles, List<ParsedDelta> parsedDeltas, List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) {
    List<SplitStrategy<?>> result = new ArrayList<>();
    // Stays empty when there are no base files; that empty list is still handed to the callee.
    List<HdfsFileStatusWithId> acidFiles = new ArrayList<>();

    if (!baseFiles.isEmpty()) {
        List<HdfsFileStatusWithId> originalFiles = new ArrayList<>();
        // Route every base file to the bucket matching its schema flavour.
        for (AcidBaseFileInfo baseFile : baseFiles) {
            List<HdfsFileStatusWithId> bucket = baseFile.isOriginal() ? originalFiles : acidFiles;
            bucket.add(baseFile.getHdfsFileStatusWithId());
        }

        // Original (non-acid schema) files come first, if there are any.
        if (!originalFiles.isEmpty()) {
            SplitStrategy<?> originalStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo, originalFiles, true, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
            if (originalStrategy != null) {
                result.add(originalStrategy);
            }
        }

        // Base files existed but none of them had the acid schema: nothing more to ask for.
        if (acidFiles.isEmpty()) {
            return result;
        }
    }

    // Reached either with no base files at all (empty acidFiles) or with acid schema files.
    SplitStrategy<?> acidStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo, acidFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
    if (acidStrategy != null) {
        result.add(acidStrategy);
    }
    return result;
}
Also used : AcidBaseFileInfo(org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with AcidBaseFileInfo

use of org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo in project hive by apache.

In the class OrcInputFormat, the method determineSplitStrategies.

/**
 * Collects the split strategies for {@code dir}.
 *
 * <p>The entries of {@code baseFiles} are partitioned into original-schema files
 * ({@code isOriginal()} is true) and ACID-schema files, and {@code determineSplitStrategy}
 * is asked for one strategy per non-empty group, originals first. An empty {@code baseFiles}
 * trips an assertion when assertions are enabled; otherwise a single request is made with an
 * empty file list. A {@code null} strategy is never added.
 *
 * <p>The default-filesystem flag passed to {@code determineSplitStrategy} is {@code true}
 * when the {@code LLAP_CACHE_DEFAULT_FS_FILE_ID} setting is off; otherwise it is true only
 * if {@code fs} is a {@code DistributedFileSystem} for which {@code HdfsUtils.isDefaultFs}
 * returns true.
 *
 * @return the strategies obtained, in the order described above; possibly empty, never null
 * @throws IOException declared by this method's signature
 */
@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context, FileSystem fs, Path dir, List<AcidBaseFileInfo> baseFiles, List<ParsedDelta> parsedDeltas, List<OrcProto.Type> readerTypes, UserGroupInformation ugi, boolean allowSyntheticFileIds) throws IOException {
    List<SplitStrategy<?>> result = new ArrayList<>();

    boolean isDefaultFs;
    if (HiveConf.getBoolVar(context.conf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID)) {
        // The check is requested: only a DistributedFileSystem can qualify.
        isDefaultFs = (fs instanceof DistributedFileSystem) && HdfsUtils.isDefaultFs((DistributedFileSystem) fs);
    } else {
        // The check is switched off: treat every filesystem as the default one.
        isDefaultFs = true;
    }

    if (baseFiles.isEmpty()) {
        assert false : "acid 2.0 no base?!: " + dir;
        // With assertions disabled, fall back to a single request over an empty file list.
        SplitStrategy<?> fallbackStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, Collections.emptyList(), false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (fallbackStrategy != null) {
            result.add(fallbackStrategy);
        }
        return result;
    }

    // Split the base files by schema flavour.
    List<HdfsFileStatusWithId> originalFiles = new ArrayList<>();
    List<HdfsFileStatusWithId> acidFiles = new ArrayList<>();
    for (AcidBaseFileInfo baseFile : baseFiles) {
        if (!baseFile.isOriginal()) {
            assert baseFile.isAcidSchema();
            acidFiles.add(baseFile.getHdfsFileStatusWithId());
        } else {
            originalFiles.add(baseFile.getHdfsFileStatusWithId());
        }
    }

    // Original (non-acid schema) files first, if there are any.
    if (!originalFiles.isEmpty()) {
        SplitStrategy<?> originalStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, originalFiles, true, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (originalStrategy != null) {
            result.add(originalStrategy);
        }
    }

    // Then the acid schema files, if there are any.
    if (!acidFiles.isEmpty()) {
        SplitStrategy<?> acidStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, acidFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds, isDefaultFs);
        if (acidStrategy != null) {
            result.add(acidStrategy);
        }
    }
    return result;
}
Also used : AcidBaseFileInfo(org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo) HdfsFileStatusWithId(org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) ArrayList(java.util.ArrayList) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

VisibleForTesting (com.google.common.annotations.VisibleForTesting): 2; ArrayList (java.util.ArrayList): 2; AcidBaseFileInfo (org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo): 2; HdfsFileStatusWithId (org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId): 2; DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 1