Example 16 with Counters

use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

the class WikipediaForwardIndexBuilder method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);
    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }
    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }
    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);
    if (language != null) {
        conf.set("wiki.language", language);
    }
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    // Clean up.
    fs.delete(new Path(tmpPath), true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
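
Example 16 only reads the Blocks.Total counter after the job completes; the counter itself is filled in on the map side through the Reporter handed to the map runner. Below is a minimal sketch of that producing side, assuming a simple per-record mapper (hypothetical; the actual Cloud9 MyMapRunner counts compressed SequenceFile blocks, not records):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class BlockCountingMapper extends MapReduceBase implements Mapper<IntWritable, Text, IntWritable, Text> {

    // Enum-backed counter; the driver reads it back as counters.getCounter(Blocks.Total).
    public enum Blocks { Total }

    @Override
    public void map(IntWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException {
        // Each increment is aggregated by the framework across all map tasks.
        reporter.incrCounter(Blocks.Total, 1);
        output.collect(key, value);
    }
}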

Example 17 with Counters

use of org.apache.hadoop.mapred.Counters in project hadoop by apache.

the class TestStreamingBadRecords method validateOutput.

private void validateOutput(RunningJob runningJob, boolean validateCount) throws Exception {
    LOG.info(runningJob.getCounters().toString());
    assertTrue(runningJob.isSuccessful());
    if (validateCount) {
        //validate counters
        String counterGrp = "org.apache.hadoop.mapred.Task$Counter";
        Counters counters = runningJob.getCounters();
        assertEquals(counters.findCounter(counterGrp, "MAP_SKIPPED_RECORDS").getCounter(), MAPPER_BAD_RECORDS.size());
        int mapRecs = INPUTSIZE - MAPPER_BAD_RECORDS.size();
        assertEquals(counters.findCounter(counterGrp, "MAP_INPUT_RECORDS").getCounter(), mapRecs);
        assertEquals(counters.findCounter(counterGrp, "MAP_OUTPUT_RECORDS").getCounter(), mapRecs);
        int redRecs = mapRecs - REDUCER_BAD_RECORDS.size();
        assertEquals(counters.findCounter(counterGrp, "REDUCE_SKIPPED_RECORDS").getCounter(), REDUCER_BAD_RECORDS.size());
        assertEquals(counters.findCounter(counterGrp, "REDUCE_SKIPPED_GROUPS").getCounter(), REDUCER_BAD_RECORDS.size());
        assertEquals(counters.findCounter(counterGrp, "REDUCE_INPUT_GROUPS").getCounter(), redRecs);
        assertEquals(counters.findCounter(counterGrp, "REDUCE_INPUT_RECORDS").getCounter(), redRecs);
        assertEquals(counters.findCounter(counterGrp, "REDUCE_OUTPUT_RECORDS").getCounter(), redRecs);
    }
    List<String> badRecs = new ArrayList<String>();
    badRecs.addAll(MAPPER_BAD_RECORDS);
    badRecs.addAll(REDUCER_BAD_RECORDS);
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(getOutputDir(), new Utils.OutputFileUtils.OutputFilesFilter()));
    if (outputFiles.length > 0) {
        InputStream is = getFileSystem().open(outputFiles[0]);
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
        String line = reader.readLine();
        int counter = 0;
        while (line != null) {
            counter++;
            StringTokenizer tokeniz = new StringTokenizer(line, "\t");
            String value = tokeniz.nextToken();
            int index = value.indexOf("hey");
            assertTrue(index > -1);
            if (index > -1) {
                String heyStr = value.substring(index);
                assertTrue(!badRecs.contains(heyStr));
            }
            line = reader.readLine();
        }
        reader.close();
        if (validateCount) {
            assertEquals(INPUTSIZE - badRecs.size(), counter);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) StringTokenizer(java.util.StringTokenizer) Utils(org.apache.hadoop.mapred.Utils) BufferedReader(java.io.BufferedReader) Counters(org.apache.hadoop.mapred.Counters)
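
The MAP_SKIPPED_RECORDS and REDUCE_SKIPPED_RECORDS counters checked above are only populated when skip mode is enabled on the job. A minimal sketch of that configuration with the old mapred API, using illustrative values rather than the actual test's setup:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SkipBadRecords;

public class SkipModeSetup {
    static void enableSkipping(JobConf conf) {
        // Start skipping bad records after two failed attempts of the same task.
        SkipBadRecords.setAttemptsToStartSkipping(conf, 2);
        // Let the framework skip any number of bad map records and reduce groups.
        SkipBadRecords.setMapperMaxSkipRecords(conf, Long.MAX_VALUE);
        SkipBadRecords.setReducerMaxSkipGroups(conf, Long.MAX_VALUE);
    }
}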

Example 18 with Counters

use of org.apache.hadoop.mapred.Counters in project hadoop by apache.

the class TestStreamingCombiner method testCommandLine.

@Test
public void testCommandLine() throws Exception {
    super.testCommandLine();
    // validate combiner counters
    String counterGrp = "org.apache.hadoop.mapred.Task$Counter";
    Counters counters = job.running_.getCounters();
    assertTrue(counters.findCounter(counterGrp, "COMBINE_INPUT_RECORDS").getValue() != 0);
    assertTrue(counters.findCounter(counterGrp, "COMBINE_OUTPUT_RECORDS").getValue() != 0);
}
Also used : Counters(org.apache.hadoop.mapred.Counters) Test(org.junit.Test)
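
COMBINE_INPUT_RECORDS and COMBINE_OUTPUT_RECORDS stay at zero unless a combiner actually runs; the streaming test above supplies its combiner through the streaming command line, but with the plain mapred API it is registered on the JobConf. A minimal, hypothetical sketch:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.LongSumReducer;

public class CombinerSetup {
    static void addCombiner(JobConf conf) {
        // Any Reducer can serve as the combiner; when the framework runs it,
        // COMBINE_INPUT_RECORDS and COMBINE_OUTPUT_RECORDS are incremented automatically.
        conf.setCombinerClass(LongSumReducer.class);
    }
}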

Example 19 with Counters

use of org.apache.hadoop.mapred.Counters in project hadoop by apache.

the class TestStreamingCounters method validateCounters.

private void validateCounters() throws IOException {
    Counters counters = job.running_.getCounters();
    assertNotNull("Counters", counters);
    Group group = counters.getGroup("UserCounters");
    assertNotNull("Group", group);
    Counter counter = group.getCounterForName("InputLines");
    assertNotNull("Counter", counter);
    assertEquals(3, counter.getCounter());
}
Also used : Group(org.apache.hadoop.mapred.Counters.Group) Counter(org.apache.hadoop.mapred.Counters.Counter) Counters(org.apache.hadoop.mapred.Counters)
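
The "UserCounters" group and "InputLines" counter read here are the kind of counters a streaming task emits itself: the mapper or reducer process writes a reporter line to stderr and the framework folds it into the job's counters. A minimal sketch of a streaming mapper doing this (the class name and pass-through logic are assumptions; only the stderr protocol line is the point):

import java.io.BufferedReader;
import java.io.InputStreamReader;

public class LineCountingStreamMapper {
    public static void main(String[] args) throws Exception {
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        String line;
        while ((line = in.readLine()) != null) {
            // Streaming counter protocol: reporter:counter:<group>,<counter>,<amount> on stderr.
            System.err.println("reporter:counter:UserCounters,InputLines,1");
            // Echo the record so downstream stages still see the data.
            System.out.println(line);
        }
    }
}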

Example 20 with Counters

use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

the class M1ViterbiExtract method main.

@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException {
    int mapTasks = 15;
    JobConf conf = new JobConf(M1ViterbiMapper.class);
    conf.setJobName("m1viterbi");
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(M1ViterbiMapper.class);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(bitext));
    FileOutputFormat.setOutputPath(conf, new Path("somealigns.test"));
    RunningJob rj = JobClient.runJob(conf);
    Counters cs = rj.getCounters();
    double lp = (double) cs.getCounter(CrossEntropyCounters.LOGPROB);
    double wc = (double) cs.getCounter(CrossEntropyCounters.WORDCOUNT);
    double ce = (lp / wc) / Math.log(2.0);
    System.out.println("Viterbi cross-entropy: " + ce + "   perplexity: " + Math.pow(2.0, ce));
}
Also used : Path(org.apache.hadoop.fs.Path) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) JobConf(org.apache.hadoop.mapred.JobConf)

Aggregations

Counters (org.apache.hadoop.mapred.Counters): 23 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 14 usages
Path (org.apache.hadoop.fs.Path): 13 usages
JobConf (org.apache.hadoop.mapred.JobConf): 12 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 11 usages
Counter (org.apache.hadoop.mapred.Counters.Counter): 6 usages
JobClient (org.apache.hadoop.mapred.JobClient): 6 usages
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 5 usages
Text (org.apache.hadoop.io.Text): 5 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 4 usages
LineReader (org.apache.hadoop.util.LineReader): 4 usages
BufferedReader (java.io.BufferedReader): 3 usages
DataOutputStream (java.io.DataOutputStream): 3 usages
IOException (java.io.IOException): 3 usages
InputStreamReader (java.io.InputStreamReader): 3 usages
ArrayList (java.util.ArrayList): 3 usages
CommandLine (org.apache.commons.cli.CommandLine): 3 usages
CommandLineParser (org.apache.commons.cli.CommandLineParser): 3 usages
GnuParser (org.apache.commons.cli.GnuParser): 3 usages
HelpFormatter (org.apache.commons.cli.HelpFormatter): 3 usages