
Example 81 with JobConf

Use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.

From the class TestMultipleOutputs, method setUp().

@Before
public void setUp() throws Exception {
    super.setUp();
    Path rootDir = getDir(ROOT_DIR);
    Path inDir = getDir(IN_DIR);
    JobConf conf = createJobConf();
    FileSystem fs = FileSystem.get(conf);
    // wipe any leftovers from a previous run before recreating the input dir
    fs.delete(rootDir, true);
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Mkdirs failed to create " + inDir.toString());
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FileSystem (org.apache.hadoop.fs.FileSystem), IOException (java.io.IOException), JobConf (org.apache.hadoop.mapred.JobConf), Before (org.junit.Before)
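
The snippet leans on test-harness members that this excerpt does not show: createJobConf() comes from HadoopTestCase, while ROOT_DIR, IN_DIR, and getDir() belong to TestMultipleOutputs. A minimal sketch of those members, with illustrative path values that are assumptions rather than the actual constants:

// Hypothetical stand-ins for the members the snippet assumes; the real
// values live in TestMultipleOutputs and may differ.
private static final Path ROOT_DIR = new Path("testing/mo");     // illustrative
private static final Path IN_DIR = new Path(ROOT_DIR, "input");  // illustrative

// Resolve a test path against the local build area (illustrative logic).
private Path getDir(Path dir) {
    String root = System.getProperty("test.build.data", "/tmp");
    return new Path(root, dir);
}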

Example 82 with JobConf

Use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.

From the class TestMultipleOutputs, method _testMOWithJavaSerialization().

protected void _testMOWithJavaSerialization(boolean withCounters) throws Exception {
    Path inDir = getDir(IN_DIR);
    Path outDir = getDir(OUT_DIR);
    JobConf conf = createJobConf();
    FileSystem fs = FileSystem.get(conf);
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes("a\nb\n\nc\nd\ne");
    file.close();
    // note: this recursive delete removes the part-0 file just written, so the
    // job only ever sees part-1 as input (a single map task, hence the
    // text-m-00000/text-r-00000 pair asserted below)
    fs.delete(inDir, true);
    fs.delete(outDir, true);
    file = fs.create(new Path(inDir, "part-1"));
    file.writeBytes("a\nb\n\nc\nd\ne");
    file.close();
    conf.setJobName("mo");
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInputFormat(TextInputFormat.class);
    conf.setMapOutputKeyClass(Long.class);
    conf.setMapOutputValueClass(String.class);
    conf.setOutputKeyComparatorClass(JavaSerializationComparator.class);
    conf.setOutputKeyClass(Long.class);
    conf.setOutputValueClass(String.class);
    conf.setOutputFormat(TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class, Long.class, String.class);
    MultipleOutputs.setCountersEnabled(conf, withCounters);
    conf.setMapperClass(MOJavaSerDeMap.class);
    conf.setReducerClass(MOJavaSerDeReduce.class);
    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    JobClient jc = new JobClient(conf);
    RunningJob job = jc.submitJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(100);
    }
    // assert number of named output part files
    int namedOutputCount = 0;
    FileStatus[] statuses = fs.listStatus(outDir);
    for (FileStatus status : statuses) {
        if (status.getPath().getName().equals("text-m-00000") || status.getPath().getName().equals("text-r-00000")) {
            namedOutputCount++;
        }
    }
    assertEquals(2, namedOutputCount);
    // assert TextOutputFormat files correctness
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(FileOutputFormat.getOutputPath(conf), "text-r-00000"))));
    int count = 0;
    String line = reader.readLine();
    while (line != null) {
        assertTrue(line.endsWith("text"));
        line = reader.readLine();
        count++;
    }
    reader.close();
    assertFalse(count == 0);
    Counters.Group counters = job.getCounters().getGroup(MultipleOutputs.class.getName());
    if (!withCounters) {
        assertEquals(0, counters.size());
    } else {
        assertEquals(1, counters.size());
        assertEquals(2, counters.getCounter("text"));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), InputStreamReader (java.io.InputStreamReader), DataOutputStream (java.io.DataOutputStream), JobClient (org.apache.hadoop.mapred.JobClient), FileSystem (org.apache.hadoop.fs.FileSystem), RunningJob (org.apache.hadoop.mapred.RunningJob), BufferedReader (java.io.BufferedReader), Counters (org.apache.hadoop.mapred.Counters), JobConf (org.apache.hadoop.mapred.JobConf)
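
MOJavaSerDeMap and MOJavaSerDeReduce are referenced but not shown in this excerpt. Below is a minimal sketch of a mapper that routes records to the "text" named output through the old mapred MultipleOutputs API; the class body and routing logic are illustrative assumptions, not the actual Hadoop test code:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.MultipleOutputs;

// Illustrative mapper: emits Long/String pairs (serialized via
// JavaSerialization, as configured above) and copies each record to the
// "text" named output registered with addNamedOutput.
class MOJavaSerDeMapSketch implements Mapper<LongWritable, Text, Long, String> {
    private MultipleOutputs mos;

    public void configure(JobConf conf) {
        mos = new MultipleOutputs(conf);
    }

    @SuppressWarnings("unchecked")
    public void map(LongWritable key, Text value,
                    OutputCollector<Long, String> output, Reporter reporter)
            throws IOException {
        output.collect(key.get(), value.toString());
        // getCollector(name, reporter) obtains the writer for a named output
        mos.getCollector("text", reporter).collect(key.get(), "text");
    }

    public void close() throws IOException {
        mos.close();  // flushes and closes all named-output writers
    }
}

The assertion that every line of text-r-00000 ends with "text" matches this shape: whatever reaches the "text" collector carries the literal string "text" as its value.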

Example 83 with JobConf

Use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.

From the class TestMultipleOutputs, method _testMultipleOutputs().

protected void _testMultipleOutputs(boolean withCounters) throws Exception {
    Path inDir = getDir(IN_DIR);
    Path outDir = getDir(OUT_DIR);
    JobConf conf = createJobConf();
    FileSystem fs = FileSystem.get(conf);
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes("a\nb\n\nc\nd\ne");
    file.close();
    file = fs.create(new Path(inDir, "part-1"));
    file.writeBytes("a\nb\n\nc\nd\ne");
    file.close();
    conf.setJobName("mo");
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(LongWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class, LongWritable.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, "sequence", SequenceFileOutputFormat.class, LongWritable.class, Text.class);
    MultipleOutputs.setCountersEnabled(conf, withCounters);
    conf.setMapperClass(MOMap.class);
    conf.setReducerClass(MOReduce.class);
    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    JobClient jc = new JobClient(conf);
    RunningJob job = jc.submitJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(100);
    }
    // assert number of named output part files
    int namedOutputCount = 0;
    FileStatus[] statuses = fs.listStatus(outDir);
    for (FileStatus status : statuses) {
        String name = status.getPath().getName();
        if (name.equals("text-m-00000") || name.equals("text-m-00001")
                || name.equals("text-r-00000")
                || name.equals("sequence_A-m-00000") || name.equals("sequence_A-m-00001")
                || name.equals("sequence_B-m-00000") || name.equals("sequence_B-m-00001")
                || name.equals("sequence_B-r-00000") || name.equals("sequence_C-r-00000")) {
            namedOutputCount++;
        }
    }
    assertEquals(9, namedOutputCount);
    // assert TextOutputFormat files correctness
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(FileOutputFormat.getOutputPath(conf), "text-r-00000"))));
    int count = 0;
    String line = reader.readLine();
    while (line != null) {
        assertTrue(line.endsWith("text"));
        line = reader.readLine();
        count++;
    }
    reader.close();
    assertFalse(count == 0);
    // assert SequenceOutputFormat files correctness
    SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, new Path(FileOutputFormat.getOutputPath(conf), "sequence_B-r-00000"), conf);
    assertEquals(LongWritable.class, seqReader.getKeyClass());
    assertEquals(Text.class, seqReader.getValueClass());
    count = 0;
    LongWritable key = new LongWritable();
    Text value = new Text();
    while (seqReader.next(key, value)) {
        assertEquals("sequence", value.toString());
        count++;
    }
    seqReader.close();
    assertFalse(count == 0);
    Counters.Group counters = job.getCounters().getGroup(MultipleOutputs.class.getName());
    if (!withCounters) {
        assertEquals(0, counters.size());
    } else {
        assertEquals(4, counters.size());
        assertEquals(4, counters.getCounter("text"));
        assertEquals(2, counters.getCounter("sequence_A"));
        assertEquals(4, counters.getCounter("sequence_B"));
        assertEquals(2, counters.getCounter("sequence_C"));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), InputStreamReader (java.io.InputStreamReader), DataOutputStream (java.io.DataOutputStream), BufferedReader (java.io.BufferedReader), Text (org.apache.hadoop.io.Text), JobClient (org.apache.hadoop.mapred.JobClient), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), RunningJob (org.apache.hadoop.mapred.RunningJob), Counters (org.apache.hadoop.mapred.Counters), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf)
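
The multi named output is the interesting part here: addMultiNamedOutput registers "sequence", and the part name supplied per record becomes a file-name infix, yielding the sequence_A-*, sequence_B-*, and sequence_C-* files asserted above. MOMap and MOReduce are not shown; the following reducer sketch is an illustrative assumption of how the reduce side might feed sequence_B and sequence_C:

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.MultipleOutputs;

// Illustrative reducer: writes to the main output plus two sub-names of the
// "sequence" multi named output, producing sequence_B-r-00000 and
// sequence_C-r-00000 alongside the regular part files.
class MOReduceSketch implements Reducer<LongWritable, Text, LongWritable, Text> {
    private MultipleOutputs mos;

    public void configure(JobConf conf) {
        mos = new MultipleOutputs(conf);
    }

    @SuppressWarnings("unchecked")
    public void reduce(LongWritable key, Iterator<Text> values,
                       OutputCollector<LongWritable, Text> output,
                       Reporter reporter) throws IOException {
        while (values.hasNext()) {
            output.collect(key, values.next());
            // the middle argument selects the sub-name of the multi named output
            mos.getCollector("sequence", "B", reporter).collect(key, new Text("sequence"));
            mos.getCollector("sequence", "C", reporter).collect(key, new Text("sequence"));
        }
    }

    public void close() throws IOException {
        mos.close();
    }
}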

Example 84 with JobConf

Use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.

From the class TestNewCombinerGrouping, method testCombiner().

@Test
public void testCombiner() throws Exception {
    if (!new File(TEST_ROOT_DIR).mkdirs()) {
        throw new RuntimeException("Could not create test dir: " + TEST_ROOT_DIR);
    }
    File in = new File(TEST_ROOT_DIR, "input");
    if (!in.mkdirs()) {
        throw new RuntimeException("Could not create test dir: " + in);
    }
    File out = new File(TEST_ROOT_DIR, "output");
    PrintWriter pw = new PrintWriter(new FileWriter(new File(in, "data.txt")));
    pw.println("A|a,1");
    pw.println("A|b,2");
    pw.println("B|a,3");
    pw.println("B|b,4");
    pw.println("B|c,5");
    pw.close();
    JobConf conf = new JobConf();
    conf.set("mapreduce.framework.name", "local");
    Job job = new Job(conf);
    TextInputFormat.setInputPaths(job, new Path(in.getPath()));
    TextOutputFormat.setOutputPath(job, new Path(out.getPath()));
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setCombinerKeyGroupingComparatorClass(GroupComparator.class);
    job.setCombinerClass(Combiner.class);
    job.getConfiguration().setInt("min.num.spills.for.combine", 0);
    job.submit();
    job.waitForCompletion(false);
    if (job.isSuccessful()) {
        Counters counters = job.getCounters();
        long combinerInputRecords = counters.findCounter("org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_INPUT_RECORDS").getValue();
        long combinerOutputRecords = counters.findCounter("org.apache.hadoop.mapreduce.TaskCounter", "COMBINE_OUTPUT_RECORDS").getValue();
        Assert.assertTrue(combinerInputRecords > 0);
        Assert.assertTrue(combinerInputRecords > combinerOutputRecords);
        BufferedReader br = new BufferedReader(new FileReader(new File(out, "part-r-00000")));
        Set<String> output = new HashSet<String>();
        String line = br.readLine();
        Assert.assertNotNull(line);
        // combine the group letter (first char) with the reduced value, which
        // sits right after the tab when keys look like "A|a"
        output.add(line.substring(0, 1) + line.substring(4, 5));
        line = br.readLine();
        Assert.assertNotNull(line);
        output.add(line.substring(0, 1) + line.substring(4, 5));
        line = br.readLine();
        Assert.assertNull(line);
        br.close();
        Set<String> expected = new HashSet<String>();
        expected.add("A2");
        expected.add("B5");
        Assert.assertEquals(expected, output);
    } else {
        Assert.fail("Job failed");
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FileWriter (java.io.FileWriter), BufferedReader (java.io.BufferedReader), FileReader (java.io.FileReader), File (java.io.File), JobConf (org.apache.hadoop.mapred.JobConf), PrintWriter (java.io.PrintWriter), HashSet (java.util.HashSet), Test (org.junit.Test)
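
GroupComparator, used here both as the reduce-side grouping comparator and as the combiner key grouping comparator, is not shown. Given keys like "A|a", a comparator that groups on the prefix before '|' produces exactly the per-letter groups the assertions rely on. A sketch under that assumption:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Illustrative grouping comparator: two Text keys fall into the same group
// when the segment before '|' matches, so "A|a" and "A|b" both group as "A".
class GroupComparatorSketch extends WritableComparator {
    protected GroupComparatorSketch() {
        super(Text.class, true);  // true: instantiate keys for deserialization
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        String s1 = a.toString();
        String s2 = b.toString();
        return s1.substring(0, s1.indexOf('|'))
                 .compareTo(s2.substring(0, s2.indexOf('|')));
    }
}

With per-letter grouping in the combiner, COMBINE_INPUT_RECORDS exceeds COMBINE_OUTPUT_RECORDS as the test asserts, and the expected set {"A2", "B5"} is consistent with a reducer that keeps the maximum value per group (A: 1,2 -> 2; B: 3,4,5 -> 5).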

Example 85 with JobConf

Use of org.apache.hadoop.mapred.JobConf in project hadoop by apache.

From the class TestPipes, method runProgram().

static void runProgram(MiniMRCluster mr, MiniDFSCluster dfs, Path program,
        Path inputPath, Path outputPath, int numMaps, int numReduces,
        String[] expectedResults, JobConf conf) throws IOException {
    Path wordExec = new Path("testing/bin/application");
    JobConf job = null;
    if (conf == null) {
        job = mr.createJobConf();
    } else {
        job = new JobConf(conf);
    }
    job.setNumMapTasks(numMaps);
    job.setNumReduceTasks(numReduces);
    {
        FileSystem fs = dfs.getFileSystem();
        fs.delete(wordExec.getParent(), true);
        fs.copyFromLocalFile(program, wordExec);
        Submitter.setExecutable(job, fs.makeQualified(wordExec).toString());
        Submitter.setIsJavaRecordReader(job, true);
        Submitter.setIsJavaRecordWriter(job, true);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        RunningJob rJob = null;
        if (numReduces == 0) {
            rJob = Submitter.jobSubmit(job);
            while (!rJob.isComplete()) {
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                    throw new RuntimeException(ie);
                }
            }
        } else {
            rJob = Submitter.runJob(job);
        }
        assertTrue("pipes job failed", rJob.isSuccessful());
        Counters counters = rJob.getCounters();
        Counters.Group wordCountCounters = counters.getGroup("WORDCOUNT");
        int numCounters = 0;
        for (Counter c : wordCountCounters) {
            System.out.println(c);
            ++numCounters;
        }
        assertTrue("No counters found!", (numCounters > 0));
    }
    List<String> results = new ArrayList<String>();
    for (Path p : FileUtil.stat2Paths(dfs.getFileSystem().listStatus(outputPath, new Utils.OutputFileUtils.OutputFilesFilter()))) {
        results.add(MapReduceTestUtil.readOutput(p, job));
    }
    assertEquals("number of reduces is wrong", expectedResults.length, results.size());
    for (int i = 0; i < results.size(); i++) {
        assertEquals("pipes program " + program + " output " + i + " wrong", expectedResults[i], results.get(i));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), Counter (org.apache.hadoop.mapred.Counters.Counter), Utils (org.apache.hadoop.mapred.Utils), StringUtils (org.apache.hadoop.util.StringUtils), FileSystem (org.apache.hadoop.fs.FileSystem), RunningJob (org.apache.hadoop.mapred.RunningJob), Counters (org.apache.hadoop.mapred.Counters), JobConf (org.apache.hadoop.mapred.JobConf)
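
For context, a call site for runProgram might look like the following; the binary path, directories, and expected strings are purely illustrative assumptions, not values from the actual TestPipes setup:

// Hypothetical invocation, assuming a running MiniDFSCluster (dfs) and
// MiniMRCluster (mr), and a compiled pipes binary at the local path below.
Path program = new Path("build/c++-examples/bin/wordcount-simple");  // illustrative
Path inputPath = new Path("testing/in");    // must already hold input files
Path outputPath = new Path("testing/out");
String[] expected = { "contents of part-00000", "contents of part-00001" };  // illustrative
runProgram(mr, dfs, program, inputPath, outputPath, 3, 2, expected, null);

Passing null for the trailing JobConf makes the method fall back to mr.createJobConf(), as the null check at the top of the method shows.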

Aggregations

Classes most often used together with JobConf across the indexed sources (occurrence counts):

JobConf (org.apache.hadoop.mapred.JobConf): 1037
Path (org.apache.hadoop.fs.Path): 510
Test (org.junit.Test): 317
FileSystem (org.apache.hadoop.fs.FileSystem): 264
IOException (java.io.IOException): 204
Configuration (org.apache.hadoop.conf.Configuration): 163
InputSplit (org.apache.hadoop.mapred.InputSplit): 110
ArrayList (java.util.ArrayList): 89
Text (org.apache.hadoop.io.Text): 82
File (java.io.File): 81
RunningJob (org.apache.hadoop.mapred.RunningJob): 67
Properties (java.util.Properties): 58
List (java.util.List): 49
HashMap (java.util.HashMap): 47
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 47
SequenceFile (org.apache.hadoop.io.SequenceFile): 45
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 44
Map (java.util.Map): 42
Job (org.apache.hadoop.mapreduce.Job): 42
LongWritable (org.apache.hadoop.io.LongWritable): 41