Use of org.apache.hadoop.hive.shims.HadoopShimsSecure.InputSplitShim in project hive by apache.
The class CombineHiveInputFormat, method sampleSplits.
/**
 * Samples input splits for clauses like "TABLESAMPLE(1 PERCENT)".
 *
 * <p>First, splits are grouped by the alias they serve. A split that serves more
 * than one alias, or that does not serve any sampled alias, is added to the
 * returned list unchanged. Every remaining split belongs exclusively to one
 * sampled alias.
 *
 * <p>Then, for each sampled alias, splits are taken starting at position
 * {@code seedNumber % numberOfSplits} (wrapping around) until their accumulated
 * length reaches the target size computed by the alias's {@code SplitSample}.
 * If the last split taken would overshoot the target, it is shrunk to the
 * number of bytes still missing.
 *
 * @param splits the candidate splits
 * @return the sampled splits
 */
private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
  HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
  List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
  Map<String, ArrayList<CombineFileSplit>> aliasToSplitList =
      new HashMap<String, ArrayList<CombineFileSplit>>();
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);

  // Populate the list of exclusive splits for every sampled alias.
  for (CombineFileSplit split : splits) {
    String alias = null;
    for (Path path : split.getPaths()) {
      // Paths without a scheme are looked up in the scheme-less copy of the map.
      boolean schemeless = path.toUri().getScheme() == null;
      List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(
          schemeless ? pathToAliasesNoScheme : pathToAliases, path);
      // The split is not exclusive to one sampled alias if, for any of its paths:
      // 1. the path serves more than one alias (or none),
      // 2. the alias it serves is not sampled, or
      // 3. it serves a different alias than another path of the same split.
      // Aliases are compared by value: equal names may be distinct String instances.
      if (l.size() != 1
          || !nameToSamples.containsKey(l.get(0))
          || (alias != null && !alias.equals(l.get(0)))) {
        alias = null;
        break;
      }
      alias = l.get(0);
    }

    if (alias != null) {
      // Add it to the split list of the alias.
      ArrayList<CombineFileSplit> splitsOfAlias = aliasToSplitList.get(alias);
      if (splitsOfAlias == null) {
        splitsOfAlias = new ArrayList<CombineFileSplit>();
        aliasToSplitList.put(alias, splitsOfAlias);
      }
      splitsOfAlias.add(split);
    } else {
      // The split doesn't exclusively serve one alias.
      retLists.add(split);
    }
  }

  // For every sampled alias, pick the splits to be sampled and add them to the result.
  for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
    ArrayList<CombineFileSplit> splitList = entry.getValue();
    int splitCount = splitList.size();

    long totalSize = 0;
    for (CombineFileSplit split : splitList) {
      totalSize += split.getLength();
    }

    SplitSample splitSample = nameToSamples.get(entry.getKey());
    long targetSize = splitSample.getTargetSize(totalSize);

    // Java's % keeps the sign of the dividend, so a negative seed would yield a
    // negative index; fold it back into [0, splitCount).
    int startIndex = splitSample.getSeedNum() % splitCount;
    if (startIndex < 0) {
      startIndex += splitCount;
    }

    long size = 0;
    for (int i = 0; i < splitCount; i++) {
      CombineFileSplit split = splitList.get((startIndex + i) % splitCount);
      retLists.add(split);
      long splitLength = split.getLength();
      if (size + splitLength >= targetSize) {
        LOG.info("Sample alias " + entry.getKey() + " using " + (i + 1) + " splits");
        if (size + splitLength > targetSize) {
          // Only the bytes still missing from the target are needed from this split.
          ((InputSplitShim) split).shrinkSplit(targetSize - size);
        }
        break;
      }
      size += splitLength;
    }
  }
  return retLists;
}
Aggregations