Use of org.apache.crunch.io.impl.InputBundle in the Crunch project by Cloudera.
Class CrunchInputs, method getFormatNodeMap.
/**
 * Rebuilds the bundle -> node index -> paths layout stored in the job configuration
 * under {@code RuntimeParameters.MULTI_INPUTS}.
 *
 * <p>The property value is split into records on {@code RECORD_SEP}, and each record is
 * split into fields with {@code SPLITTER}. Field 0 is a serialized {@link InputBundle},
 * field 1 is the integer node index and field 2 is the input path.
 *
 * @param job the job context whose configuration is read
 * @return a mutable map from input bundle to node index to the paths recorded for that
 *         node, in the order they appear in the property; empty when the property is
 *         unset or empty
 * @throws NumberFormatException if a record's node index field is not an integer
 * @throws IndexOutOfBoundsException if a record has fewer than three fields
 */
public static Map<InputBundle, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
  Map<InputBundle, Map<Integer, List<Path>>> formatNodeMap = Maps.newHashMap();
  Configuration conf = job.getConfiguration();
  String serializedInputs = conf.get(RuntimeParameters.MULTI_INPUTS);
  // Nothing registered: an unset property would otherwise hand null to the splitter,
  // and an empty one would be treated as a single (blank) record.
  if (serializedInputs == null || serializedInputs.isEmpty()) {
    return formatNodeMap;
  }
  for (String input : Splitter.on(RECORD_SEP).split(serializedInputs)) {
    List<String> fields = Lists.newArrayList(SPLITTER.split(input));
    InputBundle inputBundle = InputBundle.fromSerialized(fields.get(0));
    // Look each level up once and create it lazily; stored values are never null,
    // so a null result means the key is absent.
    Map<Integer, List<Path>> nodeMap = formatNodeMap.get(inputBundle);
    if (nodeMap == null) {
      nodeMap = Maps.newHashMap();
      formatNodeMap.put(inputBundle, nodeMap);
    }
    Integer nodeIndex = Integer.valueOf(fields.get(1));
    List<Path> paths = nodeMap.get(nodeIndex);
    if (paths == null) {
      paths = Lists.newLinkedList();
      nodeMap.put(nodeIndex, paths);
    }
    paths.add(new Path(fields.get(2)));
  }
  return formatNodeMap;
}
Use of org.apache.crunch.io.impl.InputBundle in the Crunch project by Cloudera.
Class CrunchInputFormat, method getSplits.
/**
 * Produces the splits for every input registered with the job.
 *
 * <p>The inputs are read via {@code CrunchInputs.getFormatNodeMap(job)}. For each input
 * bundle a fresh {@link Job} copy of the configuration is made and the bundle's input
 * format is instantiated against it. For each node index under that bundle, the node's
 * paths are set as the copy's input paths, the format is asked for its splits, and each
 * one is wrapped in a {@code CrunchInputSplit}.
 *
 * @param job the job context describing the inputs
 * @return all wrapped splits, grouped by bundle and then by node in map iteration order
 * @throws IOException if creating the job copy or computing splits fails
 * @throws InterruptedException if the underlying input format is interrupted
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  List<InputSplit> allSplits = Lists.newArrayList();
  Configuration baseConf = job.getConfiguration();
  Map<InputBundle, Map<Integer, List<Path>>> inputsByBundle = CrunchInputs.getFormatNodeMap(job);

  for (Map.Entry<InputBundle, Map<Integer, List<Path>>> bundleEntry : inputsByBundle.entrySet()) {
    InputBundle bundle = bundleEntry.getKey();
    Map<Integer, List<Path>> pathsByNode = bundleEntry.getValue();

    // One job copy per bundle; its input paths are overwritten for each node below.
    Job bundleJob = new Job(baseConf);
    InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(
        bundle.getInputFormatClass(), bundleJob.getConfiguration());

    for (Map.Entry<Integer, List<Path>> nodePaths : pathsByNode.entrySet()) {
      Integer node = nodePaths.getKey();
      List<Path> inputPaths = nodePaths.getValue();
      Path[] pathArray = inputPaths.toArray(new Path[inputPaths.size()]);
      FileInputFormat.setInputPaths(bundleJob, pathArray);

      // Wrap each raw split with the bundle's format class, its extra
      // configuration, and the node index it was computed for.
      for (InputSplit rawSplit : inputFormat.getSplits(bundleJob)) {
        InputSplit wrapped = new CrunchInputSplit(
            rawSplit,
            bundle.getInputFormatClass(),
            bundle.getExtraConfiguration(),
            node,
            bundleJob.getConfiguration());
        allSplits.add(wrapped);
      }
    }
  }
  return allSplits;
}
Aggregations