use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.
the class TestMultipleOutputs method setUp.
/**
 * Prepares the file system for a test: runs the superclass set-up, recursively
 * deletes the test root directory and then creates the input directory.
 *
 * @throws Exception if the superclass set-up or a file system call fails; an
 *     {@link IOException} is thrown when the input directory cannot be created
 */
@Before
public void setUp() throws Exception {
  super.setUp();
  final Path root = getDir(ROOT_DIR);
  final Path input = getDir(IN_DIR);
  final FileSystem fileSystem = FileSystem.get(createJobConf());
  // Start from a clean slate; the result of the delete is intentionally not checked.
  fileSystem.delete(root, true);
  final boolean created = fileSystem.mkdirs(input);
  if (created) {
    return;
  }
  throw new IOException("Mkdirs failed to create " + input.toString());
}
use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.
the class TestMultipleOutputs method _testMOWithJavaSerialization.
/**
 * Runs a MultipleOutputs job configured for Java serialization ({@code Long} keys,
 * {@code String} values) and verifies the named-output files and counters it produces.
 *
 * <p>The job reads a single input file, {@code part-1}: the {@code part-0} file that is
 * written first is removed again by the recursive delete of the input directory.
 *
 * @param withCounters passed to {@code MultipleOutputs.setCountersEnabled}; when
 *     {@code true} the MultipleOutputs counter group must hold exactly one counter,
 *     {@code text}, with value 2, otherwise the group must be empty
 * @throws Exception if file system access, job submission or job polling fails
 */
protected void _testMOWithJavaSerialization(boolean withCounters) throws Exception {
  Path inDir = getDir(IN_DIR);
  Path outDir = getDir(OUT_DIR);
  JobConf conf = createJobConf();
  FileSystem fs = FileSystem.get(conf);

  // NOTE(review): part-0 is deleted together with inDir just below, so it never
  // reaches the job. Kept to preserve the original sequence of file system calls;
  // confirm whether it is still needed.
  DataOutputStream file = fs.create(new Path(inDir, "part-0"));
  try {
    file.writeBytes("a\nb\n\nc\nd\ne");
  } finally {
    file.close();
  }

  fs.delete(inDir, true);
  fs.delete(outDir, true);

  file = fs.create(new Path(inDir, "part-1"));
  try {
    file.writeBytes("a\nb\n\nc\nd\ne");
  } finally {
    file.close();
  }

  conf.setJobName("mo");
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setInputFormat(TextInputFormat.class);
  conf.setMapOutputKeyClass(Long.class);
  conf.setMapOutputValueClass(String.class);
  conf.setOutputKeyComparatorClass(JavaSerializationComparator.class);
  conf.setOutputKeyClass(Long.class);
  conf.setOutputValueClass(String.class);
  conf.setOutputFormat(TextOutputFormat.class);
  MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class, Long.class, String.class);
  MultipleOutputs.setCountersEnabled(conf, withCounters);
  conf.setMapperClass(MOJavaSerDeMap.class);
  conf.setReducerClass(MOJavaSerDeReduce.class);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);

  JobClient jc = new JobClient(conf);
  RunningJob job = jc.submitJob(conf);
  while (!job.isComplete()) {
    Thread.sleep(100);
  }
  // Fail here, with a clear cause, rather than on a missing output file further down.
  assertTrue(job.isSuccessful());

  // assert number of named output part files
  int namedOutputCount = 0;
  for (FileStatus status : fs.listStatus(outDir)) {
    String name = status.getPath().getName();
    if (name.equals("text-m-00000") || name.equals("text-r-00000")) {
      namedOutputCount++;
    }
  }
  assertEquals(2, namedOutputCount);

  // assert TextOutputFormat files correctness
  int count = 0;
  BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(FileOutputFormat.getOutputPath(conf), "text-r-00000"))));
  try {
    String line = reader.readLine();
    while (line != null) {
      assertTrue(line.endsWith("text"));
      line = reader.readLine();
      count++;
    }
  } finally {
    // Close even when an assertion above fails, so the stream is not leaked.
    reader.close();
  }
  assertFalse(count == 0);

  Counters.Group counters = job.getCounters().getGroup(MultipleOutputs.class.getName());
  if (!withCounters) {
    assertEquals(0, counters.size());
  } else {
    assertEquals(1, counters.size());
    assertEquals(2, counters.getCounter("text"));
  }
}
use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.
the class TestMultipleOutputs method _testMultipleOutputs.
/**
 * Runs a MultipleOutputs job with one named output ({@code text}, written with
 * {@code TextOutputFormat}) and one multi named output ({@code sequence}, written with
 * {@code SequenceFileOutputFormat}) over two input files, then verifies the produced
 * part files, their contents and the MultipleOutputs counters.
 *
 * @param withCounters passed to {@code MultipleOutputs.setCountersEnabled}; when
 *     {@code true} the MultipleOutputs counter group must hold the four counters
 *     {@code text}, {@code sequence_A}, {@code sequence_B} and {@code sequence_C},
 *     otherwise the group must be empty
 * @throws Exception if file system access, job submission or job polling fails
 */
protected void _testMultipleOutputs(boolean withCounters) throws Exception {
  Path inDir = getDir(IN_DIR);
  Path outDir = getDir(OUT_DIR);
  JobConf conf = createJobConf();
  FileSystem fs = FileSystem.get(conf);

  DataOutputStream file = fs.create(new Path(inDir, "part-0"));
  try {
    file.writeBytes("a\nb\n\nc\nd\ne");
  } finally {
    file.close();
  }
  file = fs.create(new Path(inDir, "part-1"));
  try {
    file.writeBytes("a\nb\n\nc\nd\ne");
  } finally {
    file.close();
  }

  conf.setJobName("mo");
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapOutputKeyClass(LongWritable.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputFormat(TextOutputFormat.class);
  MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class, LongWritable.class, Text.class);
  MultipleOutputs.addMultiNamedOutput(conf, "sequence", SequenceFileOutputFormat.class, LongWritable.class, Text.class);
  MultipleOutputs.setCountersEnabled(conf, withCounters);
  conf.setMapperClass(MOMap.class);
  conf.setReducerClass(MOReduce.class);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);

  JobClient jc = new JobClient(conf);
  RunningJob job = jc.submitJob(conf);
  while (!job.isComplete()) {
    Thread.sleep(100);
  }
  // Fail here, with a clear cause, rather than on a missing output file further down.
  assertTrue(job.isSuccessful());

  // assert number of named output part files
  int namedOutputCount = 0;
  for (FileStatus status : fs.listStatus(outDir)) {
    String name = status.getPath().getName();
    if (name.equals("text-m-00000")
        || name.equals("text-m-00001")
        || name.equals("text-r-00000")
        || name.equals("sequence_A-m-00000")
        || name.equals("sequence_A-m-00001")
        || name.equals("sequence_B-m-00000")
        || name.equals("sequence_B-m-00001")
        || name.equals("sequence_B-r-00000")
        || name.equals("sequence_C-r-00000")) {
      namedOutputCount++;
    }
  }
  assertEquals(9, namedOutputCount);

  // assert TextOutputFormat files correctness
  int count = 0;
  BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(FileOutputFormat.getOutputPath(conf), "text-r-00000"))));
  try {
    String line = reader.readLine();
    while (line != null) {
      assertTrue(line.endsWith("text"));
      line = reader.readLine();
      count++;
    }
  } finally {
    // Close even when an assertion above fails, so the stream is not leaked.
    reader.close();
  }
  assertFalse(count == 0);

  // assert SequenceOutputFormat files correctness
  count = 0;
  SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, new Path(FileOutputFormat.getOutputPath(conf), "sequence_B-r-00000"), conf);
  try {
    assertEquals(LongWritable.class, seqReader.getKeyClass());
    assertEquals(Text.class, seqReader.getValueClass());
    LongWritable key = new LongWritable();
    Text value = new Text();
    while (seqReader.next(key, value)) {
      assertEquals("sequence", value.toString());
      count++;
    }
  } finally {
    // Close even when an assertion above fails, so the reader is not leaked.
    seqReader.close();
  }
  assertFalse(count == 0);

  Counters.Group counters = job.getCounters().getGroup(MultipleOutputs.class.getName());
  if (!withCounters) {
    assertEquals(0, counters.size());
  } else {
    assertEquals(4, counters.size());
    assertEquals(4, counters.getCounter("text"));
    assertEquals(2, counters.getCounter("sequence_A"));
    assertEquals(4, counters.getCounter("sequence_B"));
    assertEquals(2, counters.getCounter("sequence_C"));
  }
}
use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.
the class TestNewCombinerGrouping method testCombiner.
/**
 * Runs a local-mode job with a combiner and a combiner key grouping comparator over
 * five input records, then checks the combiner counters and the reducer output.
 *
 * <p>The combiner must have consumed at least one record and emitted fewer records
 * than it consumed. The output file {@code part-r-00000} must contain exactly two
 * lines; the first character of each line joined with the character at index 4 must
 * yield {@code A2} and {@code B5}.
 *
 * @throws Exception if the test directories cannot be created, or if writing the
 *     input, running the job or reading the output fails
 */
@Test
public void testCombiner() throws Exception {
  if (!new File(TEST_ROOT_DIR).mkdirs()) {
    throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
  }
  File in = new File(TEST_ROOT_DIR, "input");
  if (!in.mkdirs()) {
    throw new RuntimeException("Could not create test dir: " + in);
  }
  File out = new File(TEST_ROOT_DIR, "output");

  PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
  try {
    pw.println("A|a,1");
    pw.println("A|b,2");
    pw.println("B|a,3");
    pw.println("B|b,4");
    pw.println("B|c,5");
  } finally {
    pw.close();
  }

  JobConf conf = new JobConf();
  conf.set("mapreduce.framework.name", "local");
  Job job = new Job(conf);
  TextInputFormat.setInputPaths(job, new Path(in.getPath()));
  TextOutputFormat.setOutputPath(job, new Path(out.getPath()));
  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setGroupingComparatorClass(GroupComparator.class);
  job.setCombinerKeyGroupingComparatorClass(GroupComparator.class);
  job.setCombinerClass(Combiner.class);
  // NOTE(review): presumably lowers the spill threshold so the combiner always runs
  // during the merge; confirm against the framework's handling of this key.
  job.getConfiguration().setInt("min.num.spills.for.combine", 0);
  job.submit();
  job.waitForCompletion(false);
  Assert.assertTrue("Job failed", job.isSuccessful());

  Counters counters = job.getCounters();
  long combinerInputRecords = counters.findCounter("org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_INPUT_RECORDS").getValue();
  long combinerOutputRecords = counters.findCounter("org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_OUTPUT_RECORDS").getValue();
  Assert.assertTrue(combinerInputRecords > 0);
  Assert.assertTrue(combinerInputRecords > combinerOutputRecords);

  Set<String> output = new HashSet<String>();
  BufferedReader br = new BufferedReader(new FileReader(new File(out, "part-r-00000")));
  try {
    // Exactly two output lines are expected.
    for (int i = 0; i < 2; i++) {
      String line = br.readLine();
      Assert.assertNotNull(line);
      output.add(line.substring(0, 1) + line.substring(4, 5));
    }
    Assert.assertNull(br.readLine());
  } finally {
    // Close even when an assertion above fails, so the file handle is not leaked.
    br.close();
  }

  Set<String> expected = new HashSet<String>();
  expected.add("A2");
  expected.add("B5");
  Assert.assertEquals(expected, output);
}
use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.
the class TestPipes method runProgram.
/**
 * Copies a pipes executable into the DFS, runs it as a job and verifies the counters
 * and output files the job produces.
 *
 * <p>With {@code numReduces == 0} the job is submitted through
 * {@code Submitter.jobSubmit} and polled once a second until complete; otherwise it is
 * run through {@code Submitter.runJob}. The job must succeed and must report at least
 * one counter in the {@code WORDCOUNT} group.
 *
 * @param mr cluster used to create the job configuration when {@code conf} is null
 * @param dfs cluster whose file system holds the executable and the job output
 * @param program local path of the executable to copy into the DFS
 * @param inputPath input path of the job
 * @param outputPath output path of the job
 * @param numMaps number of map tasks set on the job
 * @param numReduces number of reduce tasks set on the job
 * @param expectedResults expected content of each output file, compared index by
 *     index with the files listed under {@code outputPath}
 * @param conf base configuration to copy, or null to obtain one from {@code mr}
 * @throws IOException if a file system or job operation fails
 */
static void runProgram(MiniMRCluster mr, MiniDFSCluster dfs, Path program, Path inputPath, Path outputPath, int numMaps, int numReduces, String[] expectedResults, JobConf conf) throws IOException {
  Path wordExec = new Path("testing/bin/application");
  JobConf job = (conf == null) ? mr.createJobConf() : new JobConf(conf);
  job.setNumMapTasks(numMaps);
  job.setNumReduceTasks(numReduces);

  FileSystem fs = dfs.getFileSystem();
  fs.delete(wordExec.getParent(), true);
  fs.copyFromLocalFile(program, wordExec);
  Submitter.setExecutable(job, fs.makeQualified(wordExec).toString());
  Submitter.setIsJavaRecordReader(job, true);
  Submitter.setIsJavaRecordWriter(job, true);
  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  RunningJob rJob;
  if (numReduces == 0) {
    rJob = Submitter.jobSubmit(job);
    while (!rJob.isComplete()) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException ie) {
        // Restore the interrupt status so code further up can still observe it.
        Thread.currentThread().interrupt();
        throw new RuntimeException(ie);
      }
    }
  } else {
    rJob = Submitter.runJob(job);
  }
  assertTrue("pipes job failed", rJob.isSuccessful());

  Counters counters = rJob.getCounters();
  Counters.Group wordCountCounters = counters.getGroup("WORDCOUNT");
  int numCounters = 0;
  for (Counter c : wordCountCounters) {
    System.out.println(c);
    ++numCounters;
  }
  assertTrue("No counters found!", (numCounters > 0));

  List<String> results = new ArrayList<String>();
  for (Path p : FileUtil.stat2Paths(dfs.getFileSystem().listStatus(outputPath, new Utils.OutputFileUtils.OutputFilesFilter()))) {
    results.add(MapReduceTestUtil.readOutput(p, job));
  }
  assertEquals("number of reduces is wrong", expectedResults.length, results.size());
  for (int i = 0; i < results.size(); i++) {
    assertEquals("pipes program " + program + " output " + i + " wrong", expectedResults[i], results.get(i));
  }
}
Aggregations