Use of org.apache.hadoop.mapred.lib.CombineFileSplit in project hive by apache: the class CombineHiveInputFormat, method getCombineSplits.
/**
 * Create Hive splits based on CombineFileSplit.
 */
private InputSplit[] getCombineSplits(JobConf job, int numSplits,
    Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  init(job);
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
  CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
  InputSplit[] splits = null;
  if (combine == null) {
    splits = super.getSplits(job, numSplits);
    return splits;
  }
  if (combine.getInputPathsShim(job).length == 0) {
    throw new IOException("No input paths specified in job");
  }
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // Combine splits only from the same table and the same partition. Do not combine
  // splits from multiple tables or multiple partitions.
  Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
  List<Path> inpDirs = new ArrayList<Path>();
  List<Path> inpFiles = new ArrayList<Path>();
  Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
  Set<Path> poolSet = new HashSet<Path>();
  for (Path path : paths) {
    PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(
        pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
    TableDesc tableDesc = part.getTableDesc();
    if ((tableDesc != null) && tableDesc.isNonNative()) {
      return super.getSplits(job, numSplits);
    }
    // Fall back to HiveInputFormat if any of the paths is not splittable
    Class inputFormatClass = part.getInputFileFormatClass();
    String inputFormatClassName = inputFormatClass.getName();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    String deserializerClassName = null;
    try {
      deserializerClassName = part.getDeserializer(job).getClass().getName();
    } catch (Exception e) {
      // ignore
    }
    FileSystem inpFs = path.getFileSystem(job);
    // Don't combine if the input format is a SymlinkTextInputFormat
    if (inputFormat instanceof SymlinkTextInputFormat) {
      splits = super.getSplits(job, numSplits);
      return splits;
    }
    Path filterPath = path;
    // Does a pool already exist for this path?
    CombineFilter f = null;
    List<Operator<? extends OperatorDesc>> opList = null;
    if (!mrwork.isMapperCannotSpanPartns()) {
      // If the mapper can span partitions, make sure a split does not contain multiple
      // opList + inputFormatClassName + deserializerClassName combinations.
      // This is done using the map of CombinePathInputFormat to PathFilter.
      opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
      CombinePathInputFormat combinePathInputFormat =
          new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
      f = poolMap.get(combinePathInputFormat);
      if (f == null) {
        f = new CombineFilter(filterPath);
        LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
        combine.createPool(job, f);
        poolMap.put(combinePathInputFormat, f);
      } else {
        LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
        f.addPath(filterPath);
      }
    } else {
      // The mapper cannot span partitions: pool each input path separately so that
      // a combined split never crosses partition boundaries.
      if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
        // path is not a directory; pool on its parent directory instead
        filterPath = path.getParent();
        inpFiles.add(path);
        poolSet.add(filterPath);
      } else {
        inpDirs.add(path);
      }
    }
  }
  // Process directories
  List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
  if (!mrwork.isMapperCannotSpanPartns()) {
    // The mapper can span partitions: combine into as few as one split, subject to
    // the PathFilters set up via combine.createPool.
    iss = Arrays.asList(combine.getSplits(job, 1));
  } else {
    for (Path path : inpDirs) {
      processPaths(job, combine, iss, path);
    }
    if (inpFiles.size() > 0) {
      // Process files: create one pool per parent directory collected above
      for (Path filterPath : poolSet) {
        combine.createPool(job, new CombineFilter(filterPath));
      }
      processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
    }
  }
  if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
    iss = sampleSplits(iss);
  }
  for (CombineFileSplit is : iss) {
    CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
    result.add(csplit);
  }
  LOG.info("number of splits " + result.size());
  return result.toArray(new CombineHiveInputSplit[result.size()]);
}
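
The pool mechanism above comes from Hadoop's CombineFileInputFormat: every PathFilter registered through createPool defines a pool, and files are only combined with other files from the same pool. Below is a minimal, hypothetical sketch of that mechanism (PerDirectoryCombineInputFormat and DirFilter are illustrative names, not Hive code): a subclass that creates one pool per input directory, so combined splits never cross directory boundaries, analogous to the mapperCannotSpanPartns branch above.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.CombineFileInputFormat;

// Hypothetical subclass showing the pooling that getCombineSplits relies on.
public class PerDirectoryCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {

  // Simplified stand-in for Hive's CombineFilter: accepts only paths under one directory.
  private static final class DirFilter implements PathFilter {
    private final String prefix;

    DirFilter(Path dir) {
      this.prefix = dir.toUri().getPath();
    }

    @Override
    public boolean accept(Path p) {
      return p.toUri().getPath().startsWith(prefix);
    }
  }

  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // One pool per input directory: splits are combined only within a pool.
    for (Path dir : FileInputFormat.getInputPaths(job)) {
      createPool(job, new DirFilter(dir));
    }
    return super.getSplits(job, numSplits);
  }

  @Override
  public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job,
      Reporter reporter) throws IOException {
    // Record reading is out of scope for this sketch; a real subclass would return a
    // CombineFileRecordReader that delegates to a per-file reader.
    throw new UnsupportedOperationException("pooling sketch only");
  }
}

Because pools are keyed by filter, Hive's poolMap keyed on opList + inputFormatClassName + deserializerClassName guarantees that only paths with identical processing plans end up sharing a pool.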
Use of org.apache.hadoop.mapred.lib.CombineFileSplit in project hive by apache: the class CombineHiveInputFormat, method sampleSplits.
/**
 * This function is used to sample inputs for clauses like "TABLESAMPLE(1 PERCENT)".
 *
 * First, splits are grouped by the alias they serve. If a split serves more than one
 * alias, or serves an alias that is not sampled, it is added directly to the returned
 * list. This leaves a list of exclusive splits for every alias to be sampled.
 * For each such alias, we start from position seedNumber % totalNumberOfSplits and
 * keep adding splits until their total size reaches the sampling percentage.
 * @param splits the input splits to sample from
 * @return the sampled splits
 */
private List<CombineFileSplit> sampleSplits(List<CombineFileSplit> splits) {
  HashMap<String, SplitSample> nameToSamples = mrwork.getNameToSplitSample();
  List<CombineFileSplit> retLists = new ArrayList<CombineFileSplit>();
  Map<String, ArrayList<CombineFileSplit>> aliasToSplitList = new HashMap<String, ArrayList<CombineFileSplit>>();
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<Path, ArrayList<String>> pathToAliasesNoScheme = removeScheme(pathToAliases);
  // Group each split under the single sampled alias it serves, if there is one
  for (CombineFileSplit split : splits) {
    String alias = null;
    for (Path path : split.getPaths()) {
      boolean schemeless = path.toUri().getScheme() == null;
      List<String> l = HiveFileFormatUtils.doGetAliasesFromPath(
          schemeless ? pathToAliasesNoScheme : pathToAliases, path);
      // A path disqualifies its split from being sampled if:
      // 1. it serves more than one alias,
      // 2. the alias it serves is not sampled, or
      // 3. it serves a different alias than another path in the same split.
      if (l.size() != 1 || !nameToSamples.containsKey(l.get(0))
          || (alias != null && !l.get(0).equals(alias))) {
        alias = null;
        break;
      }
      alias = l.get(0);
    }
    if (alias != null) {
      // The split exclusively serves one sampled alias: add it to that alias's list
      if (!aliasToSplitList.containsKey(alias)) {
        aliasToSplitList.put(alias, new ArrayList<CombineFileSplit>());
      }
      aliasToSplitList.get(alias).add(split);
    } else {
      // The split doesn't exclusively serve one sampled alias
      retLists.add(split);
    }
  }
  // For each sampled alias, pick splits round-robin from a seeded start position
  // until the accumulated size reaches the alias's target size
  for (Map.Entry<String, ArrayList<CombineFileSplit>> entry : aliasToSplitList.entrySet()) {
    ArrayList<CombineFileSplit> splitList = entry.getValue();
    long totalSize = 0;
    for (CombineFileSplit split : splitList) {
      totalSize += split.getLength();
    }
    SplitSample splitSample = nameToSamples.get(entry.getKey());
    long targetSize = splitSample.getTargetSize(totalSize);
    int startIndex = splitSample.getSeedNum() % splitList.size();
    long size = 0;
    for (int i = 0; i < splitList.size(); i++) {
      CombineFileSplit split = splitList.get((startIndex + i) % splitList.size());
      retLists.add(split);
      long splitLength = split.getLength();
      if (size + splitLength >= targetSize) {
        LOG.info("Sample alias " + entry.getKey() + " using " + (i + 1) + " splits");
        if (size + splitLength > targetSize) {
          // Shrink the last split so the sampled size matches the target exactly
          ((InputSplitShim) split).shrinkSplit(targetSize - size);
        }
        break;
      }
      size += splitLength;
    }
  }
  return retLists;
}
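
To make the selection arithmetic concrete, here is a small, self-contained walk-through with made-up numbers (it assumes getTargetSize simply applies the sampling percentage, as it does for TABLESAMPLE(n PERCENT)): given four splits of 400, 300, 200, and 100 bytes and a 50 percent sample, targetSize is 500 and a seed of 6 starts at index 6 % 4 = 2.

// Hypothetical standalone demo of the round-robin selection loop above.
public class SampleSplitsDemo {
  public static void main(String[] args) {
    long[] lengths = {400, 300, 200, 100};  // four splits for one alias, 1000 bytes total
    double percent = 50.0;                  // as in TABLESAMPLE(50 PERCENT)
    int seedNum = 6;                        // stand-in for SplitSample.getSeedNum()

    long totalSize = 0;
    for (long len : lengths) {
      totalSize += len;
    }
    long targetSize = (long) (totalSize * percent / 100.0);  // 500 bytes
    int startIndex = seedNum % lengths.length;               // 6 % 4 = 2

    long size = 0;
    for (int i = 0; i < lengths.length; i++) {
      int idx = (startIndex + i) % lengths.length;
      long splitLength = lengths[idx];
      if (size + splitLength >= targetSize) {
        // Last split taken; like shrinkSplit, trim any overshoot past the target.
        long keep = Math.min(splitLength, targetSize - size);
        System.out.println("take split " + idx + " trimmed to " + keep + " bytes; done");
        break;
      }
      System.out.println("take split " + idx + " in full (" + splitLength + " bytes)");
      size += splitLength;
    }
    // Prints: splits 2 (200 bytes) and 3 (100 bytes) in full, then split 0 trimmed
    // to 200 bytes, for exactly 500 sampled bytes.
  }
}

The seeded start index means different seed values sample different regions of the input, and only the final split is shrunk, so the sampled bytes match the target exactly.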