Example 6 with JobClient

Use of org.apache.hadoop.mapred.JobClient in the Apache Hadoop project.

The class RandomTextWriter, method run.

/**
   * This is the main routine for launching a distributed random write job.
   * It runs 10 maps per node, and each map writes 1 gig of data to a DFS file.
   * The reduce doesn't do anything.
   * 
   * @throws IOException 
   */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        return printUsage();
    }
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
    Job job = Job.getInstance(conf);
    job.setJarByClass(RandomTextWriter.class);
    job.setJobName("random-text-writer");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
    job.setMapperClass(RandomTextMapper.class);
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            // print usage and return a non-zero code
            return printUsage();
        }
    }
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
    System.out.println("Running " + numMaps + " maps.");
    // reducer NONE
    job.setNumReduceTasks(0);
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
Also used : SequenceFileOutputFormat(org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat) Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) JobClient(org.apache.hadoop.mapred.JobClient) Date(java.util.Date) ClusterStatus(org.apache.hadoop.mapred.ClusterStatus)
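
RandomTextWriter implements the Tool interface, so run() above receives only the arguments left over after generic option parsing. A minimal hypothetical driver, assuming nothing beyond a default Configuration:

public static void main(String[] args) throws Exception {
    // ToolRunner parses generic options (-D, -fs, -jt, ...) into the
    // Configuration before handing the remaining args to run().
    int res = ToolRunner.run(new Configuration(), new RandomTextWriter(), args);
    System.exit(res);
}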

Example 7 with JobClient

Use of org.apache.hadoop.mapred.JobClient in the Apache Hadoop project.

The class RandomWriter, method run.

/**
   * This is the main routine for launching a distributed random write job.
   * It runs 10 maps per node, and each map writes 1 gig of data to a DFS file.
   * The reduce doesn't do anything.
   * 
   * @throws IOException 
   */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.out.println("Usage: writer <out-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }
    Path outDir = new Path(args[0]);
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES, numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);
    Job job = Job.getInstance(conf);
    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setInputFormatClass(RandomInputFormat.class);
    job.setMapperClass(RandomMapper.class);
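    // Reducer.class here is the identity reducer; with setNumReduceTasks(0)
    // below, the reduce phase is skipped entirely and this setting is unused.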
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    System.out.println("Running " + numMaps + " maps.");
    // reducer NONE
    job.setNumReduceTasks(0);
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) JobClient(org.apache.hadoop.mapred.JobClient) ClusterStatus(org.apache.hadoop.mapred.ClusterStatus) Date(java.util.Date)
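
The per-map and total sizes above are ordinary configuration knobs. A hedged sketch of overriding them before launch; the property names are assumptions matching the usual values of the BYTES_PER_MAP and TOTAL_BYTES constants referenced in run():

Configuration conf = new Configuration();
// Assumed property names (the values of BYTES_PER_MAP / TOTAL_BYTES above).
conf.setLong("mapreduce.randomwriter.bytespermap", 128L * 1024 * 1024);  // 128 MiB per map
conf.setLong("mapreduce.randomwriter.totalbytes", 1024L * 1024 * 1024); // 1 GiB total, so 8 maps
int res = ToolRunner.run(conf, new RandomWriter(), new String[] { "/tmp/random-out" });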

Example 8 with JobClient

Use of org.apache.hadoop.mapred.JobClient in the Apache Hadoop project.

The class TestChainMapReduce, method testChain.

@Test
public void testChain() throws Exception {
    Path inDir = new Path("testing/chain/input");
    Path outDir = new Path("testing/chain/output");
    // Hack for local FS that does not have the concept of a 'mounting point'
    if (isLocalFS()) {
        String localPathRoot = System.getProperty("test.build.data", "/tmp").replace(' ', '+');
        inDir = new Path(localPathRoot, inDir);
        outDir = new Path(localPathRoot, outDir);
    }
    JobConf conf = createJobConf();
    conf.setBoolean("localFS", isLocalFS());
    conf.setInt("mapreduce.job.maps", 1);
    cleanFlags(conf);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(outDir, true);
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Mkdirs failed to create " + inDir.toString());
    }
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes("1\n2\n");
    file.close();
    conf.setJobName("chain");
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.set("a", "X");
    JobConf mapAConf = new JobConf(false);
    mapAConf.set("a", "A");
    ChainMapper.addMapper(conf, AMap.class, LongWritable.class, Text.class, LongWritable.class, Text.class, true, mapAConf);
    ChainMapper.addMapper(conf, BMap.class, LongWritable.class, Text.class, LongWritable.class, Text.class, false, null);
    JobConf reduceConf = new JobConf(false);
    reduceConf.set("a", "C");
    ChainReducer.setReducer(conf, CReduce.class, LongWritable.class, Text.class, LongWritable.class, Text.class, true, reduceConf);
    ChainReducer.addMapper(conf, DMap.class, LongWritable.class, Text.class, LongWritable.class, Text.class, false, null);
    JobConf mapEConf = new JobConf(false);
    mapEConf.set("a", "E");
    ChainReducer.addMapper(conf, EMap.class, LongWritable.class, Text.class, LongWritable.class, Text.class, true, mapEConf);
    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    JobClient jc = new JobClient(conf);
    RunningJob job = jc.submitJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(100);
    }
    assertTrue(getFlag(conf, "configure.A"));
    assertTrue(getFlag(conf, "configure.B"));
    assertTrue(getFlag(conf, "configure.C"));
    assertTrue(getFlag(conf, "configure.D"));
    assertTrue(getFlag(conf, "configure.E"));
    assertTrue(getFlag(conf, "map.A.value.1"));
    assertTrue(getFlag(conf, "map.A.value.2"));
    assertTrue(getFlag(conf, "map.B.value.1"));
    assertTrue(getFlag(conf, "map.B.value.2"));
    assertTrue(getFlag(conf, "reduce.C.value.2"));
    assertTrue(getFlag(conf, "reduce.C.value.1"));
    assertTrue(getFlag(conf, "map.D.value.1"));
    assertTrue(getFlag(conf, "map.D.value.2"));
    assertTrue(getFlag(conf, "map.E.value.1"));
    assertTrue(getFlag(conf, "map.E.value.2"));
    assertTrue(getFlag(conf, "close.A"));
    assertTrue(getFlag(conf, "close.B"));
    assertTrue(getFlag(conf, "close.C"));
    assertTrue(getFlag(conf, "close.D"));
    assertTrue(getFlag(conf, "close.E"));
}
Also used : Path(org.apache.hadoop.fs.Path) DataOutputStream(java.io.DataOutputStream) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) IOException(java.io.IOException) JobConf(org.apache.hadoop.mapred.JobConf) JobClient(org.apache.hadoop.mapred.JobClient) Test(org.junit.Test)
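
The submit-and-sleep loop above polls RunningJob.isComplete() by hand. With the same mapred API, JobClient.runJob is the blocking equivalent: it submits the job, reports progress, and throws IOException if the job fails. A minimal sketch against the conf built above:

// Instead of jc.submitJob(conf) plus the polling loop:
RunningJob job = JobClient.runJob(conf);
// runJob blocks until completion and throws IOException on failure,
// so the flag assertions would only run for a successful job.
System.out.println("Job " + job.getID() + " succeeded: " + job.isSuccessful());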

Example 9 with JobClient

Use of org.apache.hadoop.mapred.JobClient in the Apache Hadoop project.

The class TestKeyFieldBasedComparator, method configure.

public void configure(String keySpec, int expect) throws Exception {
    Path testdir = new Path(TEST_DIR.getAbsolutePath());
    Path inDir = new Path(testdir, "in");
    Path outDir = new Path(testdir, "out");
    FileSystem fs = getFileSystem();
    fs.delete(testdir, true);
    conf.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
    conf.setKeyFieldComparatorOptions(keySpec);
    conf.setKeyFieldPartitionerOptions("-k1.1,1.1");
    conf.set(JobContext.MAP_OUTPUT_KEY_FIELD_SEPERATOR, " ");
    conf.setMapperClass(InverseMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    if (!fs.mkdirs(testdir)) {
        throw new IOException("Mkdirs failed to create " + testdir.toString());
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Mkdirs failed to create " + inDir.toString());
    }
    // set up the input data: one file containing the two test lines
    Path inFile = new Path(inDir, "part0");
    FileOutputStream fos = new FileOutputStream(inFile.toString());
    fos.write((line1 + "\n").getBytes());
    fos.write((line2 + "\n").getBytes());
    fos.close();
    JobClient jc = new JobClient(conf);
    RunningJob r_job = jc.submitJob(conf);
    while (!r_job.isComplete()) {
        Thread.sleep(1000);
    }
    if (!r_job.isSuccessful()) {
        fail("Oops! The job broke due to an unexpected error");
    }
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outDir, new Utils.OutputFileUtils.OutputFilesFilter()));
    if (outputFiles.length > 0) {
        InputStream is = getFileSystem().open(outputFiles[0]);
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
        String line = reader.readLine();
        // make sure the first line is the one we expect (and that there are two lines)
        if (expect == 1) {
            assertTrue(line.startsWith(line1));
        } else if (expect == 2) {
            assertTrue(line.startsWith(line2));
        }
        line = reader.readLine();
        if (expect == 1) {
            assertTrue(line.startsWith(line2));
        } else if (expect == 2) {
            assertTrue(line.startsWith(line1));
        }
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) IOException(java.io.IOException) JobClient(org.apache.hadoop.mapred.JobClient) Utils(org.apache.hadoop.mapred.Utils) FileSystem(org.apache.hadoop.fs.FileSystem) FileOutputStream(java.io.FileOutputStream) RunningJob(org.apache.hadoop.mapred.RunningJob) BufferedReader(java.io.BufferedReader)
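
The keySpec strings accepted by setKeyFieldComparatorOptions follow Unix sort(1) -k syntax: field ranges plus n (numeric) and r (reverse) flags. A short sketch with assumed example specs (the test's actual keySpec values are not shown here):

JobConf conf = new JobConf();
conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
// "-k2,2"   : compare on the second field only, lexicographically
// "-k2,2n"  : same field, numeric order
// "-k2,2nr" : numeric order, reversed
conf.setKeyFieldComparatorOptions("-k2,2nr");
// Partition on the first character of the first field, as configure() does above.
conf.setKeyFieldPartitionerOptions("-k1.1,1.1");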

Example 10 with JobClient

Use of org.apache.hadoop.mapred.JobClient in the Apache Hadoop project.

The class TestMultipleOutputs, method _testMOWithJavaSerialization.

protected void _testMOWithJavaSerialization(boolean withCounters) throws Exception {
    Path inDir = getDir(IN_DIR);
    Path outDir = getDir(OUT_DIR);
    JobConf conf = createJobConf();
    FileSystem fs = FileSystem.get(conf);
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes("a\nb\n\nc\nd\ne");
    file.close();
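    // Note: the delete below also removes the part-0 file just written,
    // so only part-1 actually reaches the job as input.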
    fs.delete(inDir, true);
    fs.delete(outDir, true);
    file = fs.create(new Path(inDir, "part-1"));
    file.writeBytes("a\nb\n\nc\nd\ne");
    file.close();
    conf.setJobName("mo");
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInputFormat(TextInputFormat.class);
    conf.setMapOutputKeyClass(Long.class);
    conf.setMapOutputValueClass(String.class);
    conf.setOutputKeyComparatorClass(JavaSerializationComparator.class);
    conf.setOutputKeyClass(Long.class);
    conf.setOutputValueClass(String.class);
    conf.setOutputFormat(TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(conf, "text", TextOutputFormat.class, Long.class, String.class);
    MultipleOutputs.setCountersEnabled(conf, withCounters);
    conf.setMapperClass(MOJavaSerDeMap.class);
    conf.setReducerClass(MOJavaSerDeReduce.class);
    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    JobClient jc = new JobClient(conf);
    RunningJob job = jc.submitJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(100);
    }
    // assert number of named output part files
    int namedOutputCount = 0;
    FileStatus[] statuses = fs.listStatus(outDir);
    for (FileStatus status : statuses) {
        if (status.getPath().getName().equals("text-m-00000") || status.getPath().getName().equals("text-r-00000")) {
            namedOutputCount++;
        }
    }
    assertEquals(2, namedOutputCount);
    // assert TextOutputFormat files correctness
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(FileOutputFormat.getOutputPath(conf), "text-r-00000"))));
    int count = 0;
    String line = reader.readLine();
    while (line != null) {
        assertTrue(line.endsWith("text"));
        line = reader.readLine();
        count++;
    }
    reader.close();
    assertFalse(count == 0);
    Counters.Group counters = job.getCounters().getGroup(MultipleOutputs.class.getName());
    if (!withCounters) {
        assertEquals(0, counters.size());
    } else {
        assertEquals(1, counters.size());
        assertEquals(2, counters.getCounter("text"));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) InputStreamReader(java.io.InputStreamReader) DataOutputStream(java.io.DataOutputStream) JobClient(org.apache.hadoop.mapred.JobClient) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) BufferedReader(java.io.BufferedReader) Counters(org.apache.hadoop.mapred.Counters) JobConf(org.apache.hadoop.mapred.JobConf)
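
The test's MOJavaSerDeReduce class is not shown. With the old mapred lib.MultipleOutputs API, a reducer feeding the "text" named output would look roughly like this hypothetical sketch (the class body is an assumption, shaped so every emitted line ends with "text" as the assertion above requires):

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.MultipleOutputs;

public class MOJavaSerDeReduceSketch extends MapReduceBase
        implements Reducer<Long, String, Long, String> {

    private MultipleOutputs mos;

    public void configure(JobConf conf) {
        mos = new MultipleOutputs(conf);
    }

    @SuppressWarnings("unchecked")
    public void reduce(Long key, Iterator<String> values,
            OutputCollector<Long, String> output, Reporter reporter)
            throws IOException {
        while (values.hasNext()) {
            // Write to the named output registered via addNamedOutput(conf, "text", ...);
            // appending "text" satisfies the line.endsWith("text") assertion above.
            mos.getCollector("text", reporter).collect(key, values.next() + "text");
        }
    }

    public void close() throws IOException {
        // Flushes and closes all named-output record writers.
        mos.close();
    }
}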

Aggregations

JobClient (org.apache.hadoop.mapred.JobClient): 47 usages
Path (org.apache.hadoop.fs.Path): 25 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 20 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 18 usages
JobConf (org.apache.hadoop.mapred.JobConf): 18 usages
IOException (java.io.IOException): 16 usages
Configuration (org.apache.hadoop.conf.Configuration): 16 usages
ClusterStatus (org.apache.hadoop.mapred.ClusterStatus): 11 usages
Date (java.util.Date): 7 usages
Text (org.apache.hadoop.io.Text): 6 usages
Counters (org.apache.hadoop.mapred.Counters): 6 usages
Test (org.junit.Test): 6 usages
DataOutputStream (java.io.DataOutputStream): 5 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 5 usages
BufferedReader (java.io.BufferedReader): 4 usages
InputStreamReader (java.io.InputStreamReader): 4 usages
CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext): 4 usages
Context (org.apache.hadoop.hive.ql.Context): 4 usages
DriverContext (org.apache.hadoop.hive.ql.DriverContext): 4 usages
FileOutputFormat (org.apache.hadoop.mapreduce.lib.output.FileOutputFormat): 4 usages