Example 1 with CombineFileSplit

Use of org.apache.hadoop.mapred.lib.CombineFileSplit in project hive by apache.

From the class CombineHiveInputFormat, the method getCombineSplits.

/**
   * Create Hive splits based on CombineFileSplit.
   */
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
    init(job);
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
    InputSplit[] splits = null;
    if (combine == null) {
        splits = super.getSplits(job, numSplits);
        return splits;
    }
    if (combine.getInputPathsShim(job).length == 0) {
        throw new IOException("No input paths specified in job");
    }
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    // Combine splits only from the same table and the same partition. Do not combine
    // splits from multiple tables or multiple partitions.
    Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();
    for (Path path : paths) {
        PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
        TableDesc tableDesc = part.getTableDesc();
        if ((tableDesc != null) && tableDesc.isNonNative()) {
            return super.getSplits(job, numSplits);
        }
        // Use HiveInputFormat if any of the paths is not splittable
        Class inputFormatClass = part.getInputFileFormatClass();
        String inputFormatClassName = inputFormatClass.getName();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        String deserializerClassName = null;
        try {
            deserializerClassName = part.getDeserializer(job).getClass().getName();
        } catch (Exception e) {
        // ignore
        }
        FileSystem inpFs = path.getFileSystem(job);
        // don't combine if the input format is a SymlinkTextInputFormat
        if (inputFormat instanceof SymlinkTextInputFormat) {
            splits = super.getSplits(job, numSplits);
            return splits;
        }
        Path filterPath = path;
        // Does a pool already exist for this path?
        CombineFilter f = null;
        List<Operator<? extends OperatorDesc>> opList = null;
        if (!mrwork.isMapperCannotSpanPartns()) {
            // if mapper can span partitions, make sure a split does not contain paths with
            // different (opList, inputFormatClassName, deserializerClassName) combinations.
            // This is done using the map from CombinePathInputFormat to PathFilter.
            opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
            CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
            f = poolMap.get(combinePathInputFormat);
            if (f == null) {
                f = new CombineFilter(filterPath);
                LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
                combine.createPool(job, f);
                poolMap.put(combinePathInputFormat, f);
            } else {
                LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
                f.addPath(filterPath);
            }
        } else {
            // mapper cannot span partitions: pool files by their parent directory (the
            // partition) so a combined split won't cross multiple partitions, as the
            // user has asked.
            if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
                // path is not a directory
                filterPath = path.getParent();
                inpFiles.add(path);
                poolSet.add(filterPath);
            } else {
                inpDirs.add(path);
            }
        }
    }
    // Processing directories
    List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
    if (!mrwork.isMapperCannotSpanPartns()) {
        // mapper can span partitions: combine into as few as one split, subject to the
        // PathFilters set using combine.createPool.
        iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
        for (Path path : inpDirs) {
            processPaths(job, combine, iss, path);
        }
        if (inpFiles.size() > 0) {
            // Processing files
            for (Path filterPath : poolSet) {
                combine.createPool(job, new CombineFilter(filterPath));
            }
            processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
        }
    }
    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
        iss = sampleSplits(iss);
    }
    for (CombineFileSplit is : iss) {
        CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
        result.add(csplit);
    }
    LOG.info("number of splits " + result.size());
    return result.toArray(new CombineHiveInputSplit[result.size()]);
}
Also used: Operator(org.apache.hadoop.hive.ql.exec.Operator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CombineFileSplit(org.apache.hadoop.mapred.lib.CombineFileSplit) CombineFileInputFormatShim(org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
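
The pool mechanism that getCombineSplits relies on comes from Hadoop's CombineFileInputFormat: each PathFilter registered through createPool defines a pool, and files from different pools are never combined into the same CombineFileSplit. Below is a minimal standalone sketch of that mechanism, assuming the Hadoop 2.x mapred API; PooledCombineInputFormat and prefixFilter are illustrative names, not part of Hive or Hadoop.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;

// Illustrative subclass: one pool per directory, so files under different
// directories (e.g. different partitions) never share a CombineFileSplit.
public class PooledCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    public PooledCombineInputFormat(Path... poolDirs) {
        for (Path dir : poolDirs) {
            // Mirrors combine.createPool(job, new CombineFilter(filterPath)) above.
            createPool(prefixFilter(dir));
        }
    }

    private static PathFilter prefixFilter(final Path dir) {
        final String prefix = dir.toString();
        return new PathFilter() {
            @Override
            public boolean accept(Path p) {
                // Accept the directory itself and everything underneath it,
                // which is essentially what Hive's CombineFilter does.
                return p.toString().startsWith(prefix);
            }
        };
    }

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter) {
        // Record reading is not the point of this sketch; only the inherited
        // getSplits(), which honors the pools created above, matters here.
        throw new UnsupportedOperationException("split computation demo only");
    }
}

Even when getSplits(job, 1) is asked for a single split, at least one split per pool comes back; this is how getCombineSplits keeps paths with different input formats, deserializers, or operator plans from landing in the same split.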

Example 2 with CombineFileSplit

Use of org.apache.hadoop.mapred.lib.CombineFileSplit in project hive by apache.

From the class CombineHiveInputFormat, the method sampleSplits.

/**
   * This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)"
   *
   * First, splits are grouped by the alias they serve. If a split serves more than one
   * alias, or serves an alias that is not sampled, it is added directly to the returned
   * list. This yields a list of exclusive splits for every alias to be sampled.
   * For each alias, we start at position seedNumber % totalNumber and keep adding
   * splits until the total size reaches the target percentage.
   * @param splits
   * @return the sampled splits
   */
private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
    HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
    List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
    Map<String, ArrayList<CombineFileSplit>> aliasToSplitList = new HashMap<String, ArrayList<CombineFileSplit>>();
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);
    // group each split under the single sampled alias it exclusively serves
    for (CombineFileSplit split : splits) {
        String alias = null;
        for (Path path : split.getPaths()) {
            boolean schemeless = path.toUri().getScheme() == null;
            List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(schemeless ? pathToAliasesNoScheme : pathToAliases, path);
            // a path disqualifies the split from being sampled if:
            // 1. it serves more than one alias,
            // 2. the alias it serves is not sampled, or
            // 3. it serves a different alias than another path in the same split
            if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) || (alias != null && l.get(0) != alias)) {
                alias = null;
                break;
            }
            alias = l.get(0);
        }
        if (alias != null) {
            // add it to the split list of the alias.
            if (!aliasToSplitList.containsKey(alias)) {
                aliasToSplitList.put(alias, new ArrayList<CombineFileSplit>());
            }
            aliasToSplitList.get(alias).add(split);
        } else {
            // The split doesn't exclusively serve one alias
            retLists.add(split);
        }
    }
    // for every sampled alias, pick the splits to sample and add them to the return list
    for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
        ArrayList<CombineFileSplit> splitList = entry.getValue();
        long totalSize = 0;
        for (CombineFileSplit split : splitList) {
            totalSize += split.getLength();
        }
        SplitSample splitSample = nameToSamples.get(entry.getKey());
        long targetSize = splitSample.getTargetSize(totalSize);
        int startIndex = splitSample.getSeedNum() % splitList.size();
        long size = 0;
        for (int i = 0; i < splitList.size(); i++) {
            CombineFileSplit split = splitList.get((startIndex + i) % splitList.size());
            retLists.add(split);
            long splitLength = split.getLength();
            if (size + splitLength >= targetSize) {
                LOG.info("Sample alias " + entry.getKey() + " using " + (i + 1) + " splits");
                if (size + splitLength > targetSize) {
                    ((InputSplitShim) split).shrinkSplit(targetSize - size);
                }
                break;
            }
            size += splitLength;
        }
    }
    return retLists;
}
Also used: Path(org.apache.hadoop.fs.Path) SplitSample(org.apache.hadoop.hive.ql.parse.SplitSample) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CombineFileSplit(org.apache.hadoop.mapred.lib.CombineFileSplit) InputSplitShim(org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim) Map(java.util.Map)
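
The arithmetic of the sampling loop is easy to lose in the Hive plumbing. The following self-contained model reproduces it on plain split lengths, under two stated assumptions: SplitSampleModel and sample are hypothetical names, and getTargetSize is modeled as a straight percentage of the total size (the real SplitSample supports other modes as well).

import java.util.ArrayList;
import java.util.List;

// Hypothetical standalone model of the sampling loop in sampleSplits:
// walk the splits round-robin from seedNum % n, taking whole splits until
// the running size would reach targetSize, then shrink the last one.
public class SplitSampleModel {

    /** Returns the lengths taken from each chosen split, with the last one shrunk. */
    static List<Long> sample(long[] splitLengths, int seedNum, double percent) {
        long totalSize = 0;
        for (long len : splitLengths) {
            totalSize += len;
        }
        // Assumed percentage semantics for SplitSample.getTargetSize(totalSize).
        long targetSize = (long) (totalSize * percent / 100.0);
        int startIndex = seedNum % splitLengths.length;
        List<Long> taken = new ArrayList<Long>();
        long size = 0;
        for (int i = 0; i < splitLengths.length; i++) {
            long len = splitLengths[(startIndex + i) % splitLengths.length];
            if (size + len >= targetSize) {
                // Equivalent of ((InputSplitShim) split).shrinkSplit(targetSize - size):
                // the final split contributes only the bytes still needed.
                taken.add(targetSize - size);
                return taken;
            }
            taken.add(len);
            size += len;
        }
        return taken;
    }

    public static void main(String[] args) {
        long[] lengths = { 100, 100, 100, 100 };
        // 30% of 400 bytes is 120; seed 5 starts at index 5 % 4 = 1, so the
        // sample takes split 1 whole (100) and 20 bytes of split 2.
        System.out.println(sample(lengths, 5, 30.0)); // prints [100, 20]
    }
}

As in the original, only the final split is trimmed; every earlier split is taken whole, so the sample lands exactly on the target size.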

Aggregations

ArrayList (java.util.ArrayList) 2
HashMap (java.util.HashMap) 2
Path (org.apache.hadoop.fs.Path) 2
CombineFileSplit (org.apache.hadoop.mapred.lib.CombineFileSplit) 2
IOException (java.io.IOException) 1
HashSet (java.util.HashSet) 1
Map (java.util.Map) 1
ExecutionException (java.util.concurrent.ExecutionException) 1
FileSystem (org.apache.hadoop.fs.FileSystem) 1
Operator (org.apache.hadoop.hive.ql.exec.Operator) 1
SplitSample (org.apache.hadoop.hive.ql.parse.SplitSample) 1
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 1
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc) 1
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 1
CombineFileInputFormatShim (org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim) 1
InputSplitShim (org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim) 1
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat) 1
InputFormat (org.apache.hadoop.mapred.InputFormat) 1
InputSplit (org.apache.hadoop.mapred.InputSplit) 1