use of org.apache.hadoop.mapred.InputFormat in project incubator-systemml by apache.
the class DelegatingInputFormat method getRecordReader.
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
    .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  InputSplit inputSplit = taggedInputSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) inputSplit;
    conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
    conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
  }
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
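The three MR_MAP_INPUT_* properties restore the per-file metadata that a plain FileInputFormat would otherwise provide, so code running inside the task can still discover which file and byte range it is reading. A minimal read-back sketch from a mapper's configure method, assuming the MRv2 property names (mapreduce.map.input.file, mapreduce.map.input.start, mapreduce.map.input.length) that MRConfigurationNames resolves to on Hadoop 2.x:

public void configure(JobConf job) {
  // Which file, and which byte range of it, this task's split covers.
  // Property names are the MRv2 variants; MRv1 used e.g. map.input.file.
  String inputFile = job.get("mapreduce.map.input.file");
  long start = job.getLong("mapreduce.map.input.start", 0L);
  long length = job.getLong("mapreduce.map.input.length", -1L);
}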
use of org.apache.hadoop.mapred.InputFormat in project incubator-systemml by apache.
the class DelegatingInputFormat method getSplits.
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  JobConf confCopy = new JobConf(conf);
  List<InputSplit> splits = new ArrayList<>();
  Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(conf);
  Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(conf);
  Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<>();
  // First, build a map of InputFormats to Paths
  for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
    if (!formatPaths.containsKey(entry.getValue().getClass())) {
      formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
    }
    formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
  }
  for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
    Class<? extends InputFormat> formatClass = formatEntry.getKey();
    InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
    List<Path> paths = formatEntry.getValue();
    Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<>();
    // Now, for each set of paths that have the same InputFormat, build
    // a map of Mappers to the paths they're used for
    for (Path path : paths) {
      Class<? extends Mapper> mapperClass = mapperMap.get(path);
      if (!mapperPaths.containsKey(mapperClass)) {
        mapperPaths.put(mapperClass, new LinkedList<Path>());
      }
      mapperPaths.get(mapperClass).add(path);
    }
    // Now each set of paths that has the same InputFormat and Mapper can
    // be added to the same job, and split together.
    for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
      paths = mapEntry.getValue();
      Class<? extends Mapper> mapperClass = mapEntry.getKey();
      if (mapperClass == null) {
        mapperClass = conf.getMapperClass();
      }
      FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
      }
    }
  }
  return splits.toArray(new InputSplit[splits.size()]);
}
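For context, a sketch of the job setup that feeds this method, assuming the Hadoop-style MultipleInputs API that this class mirrors; TextMapper and SeqMapper are hypothetical mapper classes, not part of the source:

JobConf job = new JobConf();
// Register two inputs with different formats and mappers; addInputPath also
// installs DelegatingInputFormat as the job's InputFormat behind the scenes.
MultipleInputs.addInputPath(job, new Path("/data/text"),
  TextInputFormat.class, TextMapper.class);
MultipleInputs.addInputPath(job, new Path("/data/seq"),
  SequenceFileInputFormat.class, SeqMapper.class);
// getSplits then groups the paths by (InputFormat, Mapper) pair and emits
// one TaggedInputSplit per underlying split of each group.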
use of org.apache.hadoop.mapred.InputFormat in project incubator-systemml by apache.
the class MultipleInputs method getInputFormatMap.
/**
 * Retrieves a map of {@link Path}s to the {@link InputFormat} class
 * that should be used for them.
 *
 * @param conf The configuration of the job
 * @return A map of paths to inputformats for the job
 * @see #addInputPath(JobConf, Path, Class)
 */
static Map<Path, InputFormat> getInputFormatMap(JobConf conf) {
  Map<Path, InputFormat> m = new HashMap<>();
  String[] pathMappings = conf.get(MRConfigurationNames.MR_INPUT_MULTIPLEINPUTS_DIR_FORMATS).split(",");
  for (String pathMapping : pathMappings) {
    String[] split = pathMapping.split(";");
    InputFormat inputFormat;
    try {
      inputFormat = (InputFormat) ReflectionUtils.newInstance(conf.getClassByName(split[1]), conf);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    m.put(new Path(split[0]), inputFormat);
  }
  return m;
}
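The parsing above implies a wire format of comma-separated "path;inputFormatClassName" pairs. An illustration of that encoding, assuming the MRv2 property name the constant resolves to; note getInputFormatMap is package-private, so this call is shown as if made from within the same package, purely to illustrate the parsed result:

JobConf conf = new JobConf();
// Each entry is "path;inputFormatClassName"; entries are comma-separated.
conf.set("mapreduce.input.multipleinputs.dir.formats",
  "/data/a;org.apache.hadoop.mapred.TextInputFormat,"
  + "/data/b;org.apache.hadoop.mapred.SequenceFileInputFormat");
Map<Path, InputFormat> m = MultipleInputs.getInputFormatMap(conf);
// m now maps /data/a -> a TextInputFormat instance and
//            /data/b -> a SequenceFileInputFormat instance.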
use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
the class HiveIndexedInputFormat method doGetSplits.
public InputSplit[] doGetSplits(JobConf job, int numSplits) throws IOException {
  super.init(job);
  Path[] dirs = FileInputFormat.getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }
  JobConf newjob = new JobConf(job);
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // For each dir, get the InputFormat and do getSplits.
  PartitionDesc part;
  for (Path dir : dirs) {
    part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, dir,
      IOPrepareCache.get().allocatePartitionDescMap(), true);
    // Create a new InputFormat instance if this is the first time this
    // class is seen.
    Class inputFormatClass = part.getInputFileFormatClass();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), newjob);
    FileInputFormat.setInputPaths(newjob, dir);
    newjob.setInputFormat(inputFormat.getClass());
    InputSplit[] iss = inputFormat.getSplits(newjob, numSplits / dirs.length);
    for (InputSplit is : iss) {
      result.add(new HiveInputSplit(is, inputFormatClass.getName()));
    }
  }
  return result.toArray(new HiveInputSplit[result.size()]);
}
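Note that the per-directory split count is numSplits / dirs.length, an integer division. A toy illustration of the consequence (not from the Hive source):

int numSplits = 10;
int nDirs = 3;
int perDir = numSplits / nDirs; // 3 splits requested per directory, 9 overall
// With more directories than numSplits the hint drops to 0; per the
// InputFormat contract numSplits is only a hint, so each delegate
// getSplits falls back to its own defaults rather than producing nothing.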
use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
the class BucketizedHiveInputFormat method getRecordReader.
@Override
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  BucketizedHiveInputSplit hsplit = (BucketizedHiveInputSplit) split;
  String inputFormatClassName = null;
  Class inputFormatClass = null;
  try {
    inputFormatClassName = hsplit.inputFormatClassName();
    inputFormatClass = job.getClassByName(inputFormatClassName);
  } catch (Exception e) {
    throw new IOException("cannot find class " + inputFormatClassName);
  }
  pushProjectionsAndFilters(job, inputFormatClass, hsplit.getPath());
  InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
  BucketizedHiveRecordReader<K, V> rr = new BucketizedHiveRecordReader(inputFormat, hsplit, job, reporter);
  rr.initIOContext(hsplit, job, inputFormatClass);
  return rr;
}
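A BucketizedHiveInputSplit groups several underlying splits (typically one per file of a bucket), and BucketizedHiveRecordReader chains a reader over each in turn. A minimal sketch of that chaining idea, deliberately simplified and hypothetical rather than the actual Hive implementation:

public boolean next(K key, V value) throws IOException {
  // Keep pulling from the current underlying reader; when it is exhausted,
  // open a reader for the next split in the bucket, until none remain.
  while (recordReader == null || !recordReader.next(key, value)) {
    if (!initNextRecordReader()) { // hypothetical helper: opens split i+1
      return false; // all splits in the bucket consumed
    }
  }
  return true;
}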