use of org.apache.hadoop.mapreduce.InputFormat in project druid by druid-io.
the class DruidOrcInputFormatTest method testRead.
@Test
public void testRead() throws IOException, InterruptedException {
  InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  OrcHadoopInputRowParser parser = (OrcHadoopInputRowParser) config.getParser();
  reader.initialize(split, context);
  reader.nextKeyValue();
  OrcStruct data = (OrcStruct) reader.getCurrentValue();
  MapBasedInputRow row = (MapBasedInputRow) parser.parse(data);
  Assert.assertEquals(4, row.getEvent().keySet().size());
  Assert.assertEquals(new DateTime(timestamp), row.getTimestamp());
  Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
  Assert.assertEquals(col1, row.getEvent().get("col1"));
  Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));
  reader.close();
}
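The pattern in this test generalizes to driving any mapreduce InputFormat by hand outside a running job: instantiate the format reflectively, create a reader per split, initialize it, then iterate. Below is a minimal sketch of that loop using the stock TextInputFormat; the input path /tmp/input is a placeholder, not part of the original test.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;

public class InputFormatReadSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    // Placeholder path; getSplits() will fail if nothing exists here.
    FileInputFormat.setInputPaths(job, new Path("/tmp/input"));
    InputFormat<?, ?> inputFormat =
        ReflectionUtils.newInstance(TextInputFormat.class, job.getConfiguration());
    TaskAttemptContextImpl context =
        new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    // Outside a running job, splits and readers must be driven by hand,
    // exactly as the test above does for OrcNewInputFormat.
    List<InputSplit> splits = inputFormat.getSplits(job);
    for (InputSplit split : splits) {
      RecordReader<?, ?> reader = inputFormat.createRecordReader(split, context);
      reader.initialize(split, context);
      while (reader.nextKeyValue()) {
        System.out.println(reader.getCurrentValue());
      }
      reader.close();
    }
  }
}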
use of org.apache.hadoop.mapreduce.InputFormat in project crunch by cloudera.
the class CrunchInputFormat method getSplits.
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  List<InputSplit> splits = Lists.newArrayList();
  Configuration conf = job.getConfiguration();
  Map<InputBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);
  // For each InputFormat bundle, compute splits for the paths feeding each node
  for (Map.Entry<InputBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
    InputBundle inputBundle = entry.getKey();
    Job jobCopy = new Job(conf);
    InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(inputBundle.getInputFormatClass(), jobCopy.getConfiguration());
    for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
      Integer nodeIndex = nodeEntry.getKey();
      List<Path> paths = nodeEntry.getValue();
      FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
      // Get splits for each input path and tag them with their InputFormat class,
      // extra configuration, and node index by wrapping in a CrunchInputSplit.
      List<InputSplit> pathSplits = format.getSplits(jobCopy);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new CrunchInputSplit(pathSplit, inputBundle.getInputFormatClass(), inputBundle.getExtraConfiguration(), nodeIndex, jobCopy.getConfiguration()));
      }
    }
  }
  return splits;
}
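The wrapping step at the end is the crux: each child split is decorated with enough metadata (its InputFormat class, extra configuration, and target node index) to rebuild the right reader later. The sketch below illustrates the delegation idea with a hypothetical TaggedSplitSketch class; it is not Crunch's actual CrunchInputSplit, which additionally handles serialization and reader creation.

import java.io.IOException;

import org.apache.hadoop.mapreduce.InputSplit;

/** Hypothetical sketch of the delegation idea behind CrunchInputSplit. */
public class TaggedSplitSketch extends InputSplit {
  private final InputSplit delegate;
  private final Class<?> inputFormatClass; // used later to rebuild the reader
  private final int nodeIndex;             // which plan node consumes this split

  public TaggedSplitSketch(InputSplit delegate, Class<?> inputFormatClass, int nodeIndex) {
    this.delegate = delegate;
    this.inputFormatClass = inputFormatClass;
    this.nodeIndex = nodeIndex;
  }

  @Override
  public long getLength() throws IOException, InterruptedException {
    return delegate.getLength(); // size comes straight from the wrapped split
  }

  @Override
  public String[] getLocations() throws IOException, InterruptedException {
    return delegate.getLocations(); // so locality-aware scheduling still works
  }

  // A real wrapper must also be serializable (framework splits implement
  // Writable) so it survives the trip from the client to each task.
}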
use of org.apache.hadoop.mapreduce.InputFormat in project hadoop by apache.
the class MultipleInputs method getInputFormatMap.
/**
 * Retrieves a map of {@link Path}s to the {@link InputFormat} class
 * that should be used for them.
 *
 * @param job The {@link JobContext}
 * @see #addInputPath(Job, Path, Class)
 * @return A map of paths to InputFormats for the job
 */
@SuppressWarnings("unchecked")
static Map<Path, InputFormat> getInputFormatMap(JobContext job) {
  Map<Path, InputFormat> m = new HashMap<Path, InputFormat>();
  Configuration conf = job.getConfiguration();
  // DIR_FORMATS holds comma-separated "path;inputFormatClass" pairs.
  String[] pathMappings = conf.get(DIR_FORMATS).split(",");
  for (String pathMapping : pathMappings) {
    String[] split = pathMapping.split(";");
    InputFormat inputFormat;
    try {
      inputFormat = (InputFormat) ReflectionUtils.newInstance(conf.getClassByName(split[1]), conf);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    m.put(new Path(split[0]), inputFormat);
  }
  return m;
}
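The string being parsed here is written by MultipleInputs.addInputPath, which appends one "path;inputFormatClass" pair per call, comma-separated. A short configuration sketch with placeholder paths:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MultipleInputsSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    // Each call records a "path;inputFormatClass" pair in the job configuration;
    // getInputFormatMap later splits those pairs back apart.
    MultipleInputs.addInputPath(job, new Path("/data/logs"), TextInputFormat.class);
    MultipleInputs.addInputPath(job, new Path("/data/kv"), KeyValueTextInputFormat.class);
  }
}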
use of org.apache.hadoop.mapreduce.InputFormat in project hadoop by apache.
the class DelegatingInputFormat method getSplits.
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  Configuration conf = job.getConfiguration();
  Job jobCopy = Job.getInstance(conf);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job);
  Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job);
  Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<Class<? extends InputFormat>, List<Path>>();
  // First, build a map of InputFormats to Paths
  for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
    if (!formatPaths.containsKey(entry.getValue().getClass())) {
      formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
    }
    formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
  }
  for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
    Class<? extends InputFormat> formatClass = formatEntry.getKey();
    InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
    List<Path> paths = formatEntry.getValue();
    Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<Class<? extends Mapper>, List<Path>>();
    // For the paths sharing this InputFormat, build
    // a map of Mappers to the paths they're used for
    for (Path path : paths) {
      Class<? extends Mapper> mapperClass = mapperMap.get(path);
      if (!mapperPaths.containsKey(mapperClass)) {
        mapperPaths.put(mapperClass, new LinkedList<Path>());
      }
      mapperPaths.get(mapperClass).add(path);
    }
    // All paths that share a Mapper (and this InputFormat) will
    // be added to the same job, and split together.
    for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
      paths = mapEntry.getValue();
      Class<? extends Mapper> mapperClass = mapEntry.getKey();
      if (mapperClass == null) {
        try {
          mapperClass = job.getMapperClass();
        } catch (ClassNotFoundException e) {
          throw new IOException("Mapper class is not found", e);
        }
      }
      FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      List<InputSplit> pathSplits = format.getSplits(jobCopy);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
      }
    }
  }
  return splits;
}
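In practice this method is reached by registering per-path mappers through MultipleInputs; the four-argument addInputPath overload switches the job over to DelegatingInputFormat, whose getSplits (above) then tags each split. A sketch with placeholder paths and hypothetical mapper classes:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class DelegatingSetupSketch {
  // Hypothetical mappers standing in for real per-source parsing logic.
  static class LogMapper extends Mapper<LongWritable, Text, Text, Text> { }
  static class CsvMapper extends Mapper<LongWritable, Text, Text, Text> { }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    // The four-argument overload records the mapper per path, so paths with
    // the same InputFormat but different Mappers end up in separate groups
    // in the getSplits method shown above.
    MultipleInputs.addInputPath(job, new Path("/data/logs"), TextInputFormat.class, LogMapper.class);
    MultipleInputs.addInputPath(job, new Path("/data/csv"), TextInputFormat.class, CsvMapper.class);
  }
}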
use of org.apache.hadoop.mapreduce.InputFormat in project hadoop by apache.
the class TestCombineFileInputFormat method testReinit.
@Test
public void testReinit() throws Exception {
  // Test that a split containing multiple files works correctly,
  // with the child RecordReader getting its initialize() method
  // called a second time.
  TaskAttemptID taskId = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
  Configuration conf = new Configuration();
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);
  // This will create a CombineFileRecordReader that itself contains a
  // DummyRecordReader.
  InputFormat inputFormat = new ChildRRInputFormat();
  Path[] files = { new Path("file1"), new Path("file2") };
  long[] lengths = { 1, 1 };
  CombineFileSplit split = new CombineFileSplit(files, lengths);
  RecordReader rr = inputFormat.createRecordReader(split, context);
  assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);
  // The first initialize() call comes from MapTask; we do it here.
  rr.initialize(split, context);
  // First value is the first filename.
  assertTrue(rr.nextKeyValue());
  assertEquals("file1", rr.getCurrentValue().toString());
  // The inner RR will return false, because it only emits one (k, v) pair.
  // But there's another sub-split to process. This returns true to us.
  assertTrue(rr.nextKeyValue());
  // And the second RR will have its initialize method called correctly.
  assertEquals("file2", rr.getCurrentValue().toString());
  // After both child RRs have returned their singleton (k, v) pairs, this
  // should also return false.
  assertFalse(rr.nextKeyValue());
}
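ChildRRInputFormat and DummyRecordReader are test-local classes, but the production shape they imitate is a CombineFileInputFormat subclass that hands CombineFileRecordReader a per-file reader class. A minimal sketch using only stock Hadoop classes; PerFileReader is illustrative, not the test's reader:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CombineSketchInputFormat extends CombineFileInputFormat<LongWritable, Text> {

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    // CombineFileRecordReader instantiates one PerFileReader per file in the
    // combined split and calls initialize() on each, which is exactly the
    // re-initialization behavior the test above verifies.
    return new CombineFileRecordReader<LongWritable, Text>(
        (CombineFileSplit) split, context, PerFileReader.class);
  }

  /** Per-file reader; must expose this (split, context, index) constructor. */
  public static class PerFileReader extends RecordReader<LongWritable, Text> {
    private final Text value;
    private boolean done;

    public PerFileReader(CombineFileSplit split, TaskAttemptContext context, Integer index) {
      // Emit the file name once, mirroring the test's DummyRecordReader.
      this.value = new Text(split.getPath(index).getName());
    }

    @Override public void initialize(InputSplit split, TaskAttemptContext context) { }
    @Override public boolean nextKeyValue() {
      if (done) return false;
      done = true;
      return true;
    }
    @Override public LongWritable getCurrentKey() { return new LongWritable(0); }
    @Override public Text getCurrentValue() { return value; }
    @Override public float getProgress() { return done ? 1.0f : 0.0f; }
    @Override public void close() { }
  }
}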