
Example 91 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.

The class ClueWarcForwardIndexBuilder, method run.

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path (must be block-compressed SequenceFiles)").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    Random random = new Random();
    Path outputPath = new Path("tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000));
    conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);
    // Add each file under the collection path individually; otherwise Hadoop
    // thinks it's a MapFile.
    for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
        FileInputFormat.addInputPath(conf, status.getPath());
    }
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    fs.delete(outputPath, true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    fs.delete(outputPath, true);
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Options (org.apache.commons.cli.Options), FileStatus (org.apache.hadoop.fs.FileStatus), GnuParser (org.apache.commons.cli.GnuParser), Text (org.apache.hadoop.io.Text), HelpFormatter (org.apache.commons.cli.HelpFormatter), CommandLine (org.apache.commons.cli.CommandLine), Random (java.util.Random), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), RunningJob (org.apache.hadoop.mapred.RunningJob), Counters (org.apache.hadoop.mapred.Counters), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), JobConf (org.apache.hadoop.mapred.JobConf)
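The index file written by this tool starts with the index class name, the collection path, and the block count, followed by one docno/offset/fileno triple per block. The following is a minimal sketch, not part of Cloud9, of reading that layout back with the mirror-image DataInput calls; dumpForwardIndex is a hypothetical helper name and indexFile stands for the same path passed as the index option.

// Hypothetical reader sketch for the index layout written above (not part of Cloud9).
public static void dumpForwardIndex(Configuration conf, String indexFile) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FSDataInputStream in = fs.open(new Path(indexFile));
    String indexClass = in.readUTF();   // class name written first (ClueWarcForwardIndex)
    String collection = in.readUTF();   // collection path the index refers to
    int numBlocks = in.readInt();       // number of (docno, offset, fileno) entries
    System.out.println(indexClass + " over " + collection + ": " + numBlocks + " blocks");
    for (int i = 0; i < numBlocks; i++) {
        int docno = in.readInt();       // docno recorded for this block
        int offset = in.readInt();      // byte offset of the block
        short fileno = in.readShort();  // which part file the block lives in
        // ... load the triple into whatever lookup structure is needed
    }
    in.close();
}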

Example 92 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.

The class CountClueWarcRecords, method run.

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
    options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("DocnoMapping data path").create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file to write the number of records").create(COUNT_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    boolean repacked;
    if (cmdline.hasOption(REPACKED_OPTION)) {
        repacked = true;
    } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
        repacked = false;
    } else {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Expecting either -original or -repacked");
        return -1;
    }
    if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION) || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String path = cmdline.getOptionValue(PATH_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    int segment = 1;
    if (!repacked) {
        segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
    }
    LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
    LOG.info(" - repacked: " + repacked);
    LOG.info(" - path: " + path);
    LOG.info(" - mapping file: " + mappingFile);
    if (!repacked) {
        LOG.info(" - segment number: " + segment);
    }
    FileSystem fs = FileSystem.get(getConf());
    int mapTasks = 10;
    JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
    conf.setJobName(CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    if (repacked) {
        // Add each file under the path individually; otherwise Hadoop thinks
        // it's a MapFile.
        for (FileStatus status : fs.listStatus(new Path(path))) {
            FileInputFormat.addInputPath(conf, status.getPath());
        }
    } else {
        ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
    }
    DistributedCache.addCacheFile(new URI(mappingFile), conf);
    if (repacked) {
        conf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        conf.setInputFormat(ClueWarcInputFormat.class);
    }
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");
    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FSDataOutputStream out = fs.create(new Path(f));
        out.write(Integer.toString(numDocs).getBytes());
        out.close();
    }
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Options (org.apache.commons.cli.Options), FileStatus (org.apache.hadoop.fs.FileStatus), GnuParser (org.apache.commons.cli.GnuParser), URI (java.net.URI), HelpFormatter (org.apache.commons.cli.HelpFormatter), CommandLine (org.apache.commons.cli.CommandLine), FileSystem (org.apache.hadoop.fs.FileSystem), RunningJob (org.apache.hadoop.mapred.RunningJob), Option (org.apache.commons.cli.Option), Counters (org.apache.hadoop.mapred.Counters), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), JobConf (org.apache.hadoop.mapred.JobConf)
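The MyMapper class configured above is not shown on this page. In the old mapred API, a counter such as Records.PAGES is normally incremented through the Reporter handed to map(); the following is a minimal sketch of that pattern, not the actual Cloud9 mapper, with the input key/value types left as placeholders since the real record classes depend on the chosen input format.

// Hypothetical mapper sketch: one counter tick per input record, no output,
// which matches the NullOutputFormat configured in the driver above.
public static class MyMapper extends MapReduceBase
        implements Mapper<LongWritable, Writable, NullWritable, NullWritable> {
    public void map(LongWritable key, Writable doc,
            OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
            throws IOException {
        // surfaces in the driver via counters.findCounter(Records.PAGES)
        reporter.incrCounter(Records.PAGES, 1);
    }
}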

Example 93 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.

The class M1ViterbiExtract, method main.

@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException {
    int mapTasks = 15;
    JobConf conf = new JobConf(M1ViterbiMapper.class);
    conf.setJobName("m1viterbi");
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(M1ViterbiMapper.class);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(bitext));
    FileOutputFormat.setOutputPath(conf, new Path("somealigns.test"));
    RunningJob rj = JobClient.runJob(conf);
    Counters cs = rj.getCounters();
    double lp = (double) cs.getCounter(CrossEntropyCounters.LOGPROB);
    double wc = (double) cs.getCounter(CrossEntropyCounters.WORDCOUNT);
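    // lp / wc is the average log-probability per word; dividing by Math.log(2.0)
    // converts it from nats to bits (assuming the counters accumulate natural-log
    // values), so Math.pow(2.0, ce) below is the corresponding perplexity.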
    double ce = (lp / wc) / Math.log(2.0);
    System.out.println("Viterbi cross-entropy: " + ce + "   perplexity: " + Math.pow(2.0, ce));
}
Also used: Path (org.apache.hadoop.fs.Path), RunningJob (org.apache.hadoop.mapred.RunningJob), Counters (org.apache.hadoop.mapred.Counters), JobConf (org.apache.hadoop.mapred.JobConf)

Aggregations

RunningJob (org.apache.hadoop.mapred.RunningJob): 93
JobConf (org.apache.hadoop.mapred.JobConf): 65
Path (org.apache.hadoop.fs.Path): 49
JobClient (org.apache.hadoop.mapred.JobClient): 33
IOException (java.io.IOException): 28
FileSystem (org.apache.hadoop.fs.FileSystem): 28
DMLConfig (org.apache.sysml.conf.DMLConfig): 27
Group (org.apache.hadoop.mapred.Counters.Group): 26
Counters (org.apache.hadoop.mapred.Counters): 17
Configuration (org.apache.hadoop.conf.Configuration): 14
MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups): 13
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 10
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 8
File (java.io.File): 6
TaggedMatrixBlock (org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock): 6
DataOutputStream (java.io.DataOutputStream): 5
URI (java.net.URI): 5
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 5
Context (org.apache.hadoop.hive.ql.Context): 5
Text (org.apache.hadoop.io.Text): 5
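The examples above all use the synchronous JobClient.runJob(conf), which blocks until the job finishes and returns a RunningJob whose Counters drive the post-processing. The same classes also support a submit-and-poll variant; the sketch below is a minimal illustration of that pattern, not taken from Cloud9 (runAndMonitor is a hypothetical helper name).

// Minimal sketch (not from Cloud9): submit the job asynchronously and poll the
// RunningJob for progress instead of blocking inside JobClient.runJob().
public static Counters runAndMonitor(JobConf conf) throws IOException, InterruptedException {
    JobClient client = new JobClient(conf);
    RunningJob job = client.submitJob(conf);
    while (!job.isComplete()) {
        System.out.printf("map %.0f%% reduce %.0f%%%n",
                job.mapProgress() * 100, job.reduceProgress() * 100);
        // poll every five seconds
        Thread.sleep(5000);
    }
    if (!job.isSuccessful()) {
        throw new IOException("Job " + conf.getJobName() + " failed");
    }
    // counters can then be read exactly as in the examples above
    return job.getCounters();
}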