
Example 1 with CombineFileInputFormatShim

Use of org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim in project hive by apache.

Class CombineHiveInputFormat, method getCombineSplits:

/**
   * Create Hive splits based on CombineFileSplit.
   */
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
    init(job);
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
    InputSplit[] splits = null;
    if (combine == null) {
        splits = super.getSplits(job, numSplits);
        return splits;
    }
    if (combine.getInputPathsShim(job).length == 0) {
        throw new IOException("No input paths specified in job");
    }
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    // combine splits only from same tables and same partitions. Do not combine splits from multiple
    // tables or multiple partitions.
    Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();
    for (Path path : paths) {
        PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
        TableDesc tableDesc = part.getTableDesc();
        if ((tableDesc != null) && tableDesc.isNonNative()) {
            return super.getSplits(job, numSplits);
        }
        // Use HiveInputFormat if any of the paths is not splittable
        Class inputFormatClass = part.getInputFileFormatClass();
        String inputFormatClassName = inputFormatClass.getName();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        String deserializerClassName = null;
        try {
            deserializerClassName = part.getDeserializer(job).getClass().getName();
        } catch (Exception e) {
        // ignore
        }
        FileSystem inpFs = path.getFileSystem(job);
        // Don't combine if the input format is a SymlinkTextInputFormat.
        if (inputFormat instanceof SymlinkTextInputFormat) {
            splits = super.getSplits(job, numSplits);
            return splits;
        }
        Path filterPath = path;
        // Does a pool exist for this path already
        CombineFilter f = null;
        List<Operator<? extends OperatorDesc>> opList = null;
        if (!mrwork.isMapperCannotSpanPartns()) {
            // If the mapper can span partitions, make sure a split does not mix multiple
            // (opList, inputFormatClassName, deserializerClassName) combinations.
            // This is done using the map of CombinePathInputFormat to PathFilter.
            opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
            CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
            f = poolMap.get(combinePathInputFormat);
            if (f == null) {
                f = new CombineFilter(filterPath);
                LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
                combine.createPool(job, f);
                poolMap.put(combinePathInputFormat, f);
            } else {
                LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
                f.addPath(filterPath);
            }
        } else {
            // Mapper cannot span partitions: pool files by their parent directory and
            // handle directories individually, so a split never crosses a partition.
            if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
                // path is not directory
                filterPath = path.getParent();
                inpFiles.add(path);
                poolSet.add(filterPath);
            } else {
                inpDirs.add(path);
            }
        }
    }
    // Processing directories
    List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
    if (!mrwork.isMapperCannotSpanPartns()) {
        //mapper can span partitions
        //combine into as few as one split, subject to the PathFilters set
        // using combine.createPool.
        iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
        for (Path path : inpDirs) {
            processPaths(job, combine, iss, path);
        }
        if (inpFiles.size() > 0) {
            // Processing files
            for (Path filterPath : poolSet) {
                combine.createPool(job, new CombineFilter(filterPath));
            }
            processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
        }
    }
    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
        iss = sampleSplits(iss);
    }
    for (CombineFileSplit is : iss) {
        CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
        result.add(csplit);
    }
    LOG.info("number of splits " + result.size());
    return result.toArray(new CombineHiveInputSplit[result.size()]);
}
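
The pooling above hinges on CombineFilter, a PathFilter that accepts any path under one of its registered filter paths, so that combine.getSplits only groups files belonging to the same pool. The following is a minimal sketch of that idea, not the actual Hive inner class (which additionally deals with path normalization and URI schemes); PrefixPathFilter is a hypothetical name for illustration:

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Sketch of a prefix-based pool filter in the spirit of CombineHiveInputFormat.CombineFilter.
class PrefixPathFilter implements PathFilter {
    private final List<String> prefixes = new ArrayList<String>();

    PrefixPathFilter(Path p) {
        addPath(p);
    }

    void addPath(Path p) {
        // Trailing separator prevents /warehouse/t from matching /warehouse/t2.
        prefixes.add(p.toUri().getPath() + Path.SEPARATOR);
    }

    @Override
    public boolean accept(Path path) {
        String candidate = path.toUri().getPath() + Path.SEPARATOR;
        for (String prefix : prefixes) {
            if (candidate.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }
}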
Also used:
  java.io.IOException
  java.util.ArrayList
  java.util.HashMap
  java.util.HashSet
  java.util.concurrent.ExecutionException
  org.apache.hadoop.fs.FileSystem
  org.apache.hadoop.fs.Path
  org.apache.hadoop.hive.ql.exec.Operator
  org.apache.hadoop.hive.ql.plan.OperatorDesc
  org.apache.hadoop.hive.ql.plan.PartitionDesc
  org.apache.hadoop.hive.ql.plan.TableDesc
  org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim
  org.apache.hadoop.mapred.FileInputFormat
  org.apache.hadoop.mapred.InputFormat
  org.apache.hadoop.mapred.InputSplit
  org.apache.hadoop.mapred.lib.CombineFileSplit
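
For context, this code path is exercised when Hive's input format is set to CombineHiveInputFormat, which is the default in recent Hive releases:

set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;

The framework then calls getSplits, which delegates to the getCombineSplits method shown above; as the code itself shows, non-native tables and SymlinkTextInputFormat paths fall back to plain HiveInputFormat behavior via super.getSplits.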
