
Example 1 with InputSplitShim

Use of org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim in project hive by apache.

From the class CombineHiveInputFormat, method sampleSplits:

/**
   * This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)".
   *
   * First, splits are grouped by the alias they serve. A split that serves more than one
   * alias, or that serves no sampled alias, is added directly to the returned list.
   * This leaves a list of exclusive splits for each alias to be sampled.
   * For each such alias, we start at position seedNumber % totalNumber and keep adding
   * splits until the accumulated size reaches the requested percentage of the total.
   * @param splits the candidate splits
   * @return the sampled splits
   */
private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
    HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
    List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
    Map<String, ArrayList<CombineFileSplit>> aliasToSplitList = new HashMap<String, ArrayList<CombineFileSplit>>();
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);
    // First pass: group each split under the single sampled alias it serves exclusively;
    // splits that do not qualify go straight into the result unsampled.
    for (CombineFileSplit split : splits) {
        String alias = null;
        for (Path path : split.getPaths()) {
            boolean schemeless = path.toUri().getScheme() == null;
            List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(schemeless ? pathToAliasesNoScheme : pathToAliases, path);
            // A path disqualifies its split from sampling if:
            //   1. it maps to more than one alias,
            //   2. the alias it maps to is not being sampled, or
            //   3. it maps to a different alias than another path of the same split.
            if (l.size() != 1 || !nameToSamples.containsKey(l.get(0)) || (alias != null && !l.get(0).equals(alias))) {
                alias = null;
                break;
            }
            alias = l.get(0);
        }
        if (alias != null) {
            // add it to the split list of the alias.
            if (!aliasToSplitList.containsKey(alias)) {
                aliasToSplitList.put(alias, new ArrayList<CombineFileSplit>());
            }
            aliasToSplitList.get(alias).add(split);
        } else {
            // The split doesn't exclusively serve a single sampled alias
            retLists.add(split);
        }
    }
    // Second pass: for every sampled alias, walk its splits starting at a seed-based
    // offset and keep adding them until the accumulated size reaches the target.
    for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
        ArrayList<CombineFileSplit> splitList = entry.getValue();
        long totalSize = 0;
        for (CombineFileSplit split : splitList) {
            totalSize += split.getLength();
        }
        SplitSample splitSample = nameToSamples.get(entry.getKey());
        long targetSize = splitSample.getTargetSize(totalSize);
        int startIndex = splitSample.getSeedNum() % splitList.size();
        long size = 0;
        for (int i = 0; i < splitList.size(); i++) {
            CombineFileSplit split = splitList.get((startIndex + i) % splitList.size());
            retLists.add(split);
            long splitLength = split.getLength();
            if (size + splitLength >= targetSize) {
                LOG.info("Sample alias " + entry.getKey() + " using " + (i + 1) + " splits");
                if (size + splitLength > targetSize) {
                    // shrink the last split so the sample does not overshoot the target size
                    ((InputSplitShim) split).shrinkSplit(targetSize - size);
                }
                break;
            }
            size += splitLength;
        }
    }
    return retLists;
}
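
To make the seed-based selection concrete, the sketch below re-implements just the wrap-around pick-and-shrink step on plain long lengths instead of CombineFileSplit objects. It is a minimal illustration under that simplification; the class and method names (SamplingSketch, pickSampledLengths) and the numbers in main are made up for the example and are not part of Hive.

import java.util.ArrayList;
import java.util.List;

// Minimal, self-contained illustration of the selection loop in sampleSplits:
// start at seedNum % n, wrap around the list, stop once the accumulated size
// reaches the target, and model the shrinkSplit(targetSize - size) call by
// recording the shrunk length of the last pick. All names here are hypothetical.
public class SamplingSketch {

    static List<Long> pickSampledLengths(long[] lengths, int seedNum, long targetSize) {
        List<Long> picked = new ArrayList<>();
        int n = lengths.length;
        int startIndex = seedNum % n; // same start position as the Hive code
        long size = 0;
        for (int i = 0; i < n; i++) {
            long len = lengths[(startIndex + i) % n];
            if (size + len >= targetSize) {
                // the last pick contributes only targetSize - size bytes
                // (equal to len when the target is hit exactly, so no shrink needed)
                picked.add(targetSize - size);
                break;
            }
            picked.add(len);
            size += len;
        }
        return picked;
    }

    public static void main(String[] args) {
        // Three 100-byte splits, seed 2, target 150 bytes: selection starts at
        // index 2, wraps to index 0, and the second pick is shrunk from 100 to 50.
        System.out.println(pickSampledLengths(new long[] { 100, 100, 100 }, 2, 150));
        // prints [100, 50]
    }
}
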
Also used : Path (org.apache.hadoop.fs.Path), SplitSample (org.apache.hadoop.hive.ql.parse.SplitSample), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), CombineFileSplit (org.apache.hadoop.mapred.lib.CombineFileSplit), InputSplitShim (org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim), Map (java.util.Map)
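
Of the types listed above, Path explains a detail that is easy to miss in the first loop: sampleSplits keeps a scheme-stripped copy of pathToAliases because a split's path may come back without a URI scheme while the keys in pathToAliases may carry one, and Path equality follows the URI. The snippet below illustrates that with org.apache.hadoop.fs.Path only; the class name PathSchemeSketch is made up, and the last step is only a rough stand-in for what a removeScheme-style transformation would produce, not the actual Hive helper.

import org.apache.hadoop.fs.Path;

// Why sampleSplits checks path.toUri().getScheme() and keeps a scheme-less
// copy of the alias map: the same file can be referenced with or without a scheme.
public class PathSchemeSketch {
    public static void main(String[] args) {
        Path qualified = new Path("hdfs://namenode:8020/warehouse/t1/part-0");
        Path schemeless = new Path("/warehouse/t1/part-0");

        System.out.println(qualified.toUri().getScheme());  // hdfs
        System.out.println(schemeless.toUri().getScheme()); // null

        // Path equality follows the URI, so a map keyed by the qualified form
        // would not match a lookup done with the scheme-less form.
        System.out.println(qualified.equals(schemeless));   // false

        // Stripping the scheme and authority makes the two comparable
        // (a rough stand-in for a removeScheme-style transformation).
        Path stripped = new Path(qualified.toUri().getPath());
        System.out.println(stripped.equals(schemeless));    // true
    }
}
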

Aggregations

ArrayList (java.util.ArrayList) 1 · HashMap (java.util.HashMap) 1 · Map (java.util.Map) 1 · Path (org.apache.hadoop.fs.Path) 1 · SplitSample (org.apache.hadoop.hive.ql.parse.SplitSample) 1 · InputSplitShim (org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim) 1 · CombineFileSplit (org.apache.hadoop.mapred.lib.CombineFileSplit) 1