
Example 71 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Class WikipediaDocnoMappingBuilder, method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file").create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: unknown language \"" + language + "\"");
            return -1;
        }
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);
    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);
    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikipediaDocnoMappingBuilder.class);
    job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));
    job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }
    job.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(job, false);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);
    job.waitForCompletion(true);
    long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();
    WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-r-00000", (int) cnt, outputFile);
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);
    return 0;
}
Also used: HelpFormatter (org.apache.commons.cli.HelpFormatter), Path (org.apache.hadoop.fs.Path), Options (org.apache.commons.cli.Options), CommandLine (org.apache.commons.cli.CommandLine), GnuParser (org.apache.commons.cli.GnuParser), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException), Job (org.apache.hadoop.mapreduce.Job)
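
For reference, here is a minimal, self-contained sketch of the parse-or-print-help pattern the snippet above follows. The option name "input" is illustrative; the actual values behind INPUT_OPTION and the other constants are not shown in the snippet.

import org.apache.commons.cli.*;

public class CliSketch {
    @SuppressWarnings("static-access")
    public static void main(String[] args) {
        Options options = new Options();
        options.addOption(OptionBuilder.withArgName("path").hasArg()
                .withDescription("XML dump file").create("input"));
        CommandLineParser parser = new GnuParser();
        CommandLine cmdline;
        try {
            cmdline = parser.parse(options, args);
        } catch (ParseException exp) {
            System.err.println("Error parsing command line: " + exp.getMessage());
            return;
        }
        if (!cmdline.hasOption("input")) {
            // No input given: print the generated usage message, as the tools above do.
            new HelpFormatter().printHelp(CliSketch.class.getName(), options);
            return;
        }
        System.out.println("input: " + cmdline.getOptionValue("input"));
    }
}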

Example 72 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Class TrecForwardIndexBuilder, method run.

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data").create(MAPPING_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    String tmpDir = "tmp-" + TrecForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000);
    Job job = new Job(getConf(), TrecForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    job.setJarByClass(TrecForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(getConf());
    LOG.info("Tool name: " + TrecForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - DocnoMapping file: " + mappingFile);
    LOG.info(" - temp output directory: " + tmpDir);
    job.setNumReduceTasks(1);
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker"))) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }
    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);
    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    // delete the output directory if it exists already
    FileSystem.get(getConf()).delete(new Path(tmpDir), true);
    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    String inputFile = tmpDir + "/part-r-00000";
    LOG.info("Writing " + numDocs + " doc offsets to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);
        writer.writeLong(offset);
        writer.writeInt(len);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");
    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }
    fs.delete(new Path(tmpDir), true);
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Options (org.apache.commons.cli.Options), GnuParser (org.apache.commons.cli.GnuParser), Text (org.apache.hadoop.io.Text), URI (java.net.URI), HelpFormatter (org.apache.commons.cli.HelpFormatter), CommandLine (org.apache.commons.cli.CommandLine), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), Counters (org.apache.hadoop.mapreduce.Counters), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), Job (org.apache.hadoop.mapreduce.Job)
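
The writer above lays down a simple binary layout: index class name, collection path, and doc count, followed by one (long offset, int length) pair per document. Below is a hedged sketch of reading that layout back; it opens a local file for brevity, whereas the real index lives on HDFS and would be opened via fs.open.

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

public class ForwardIndexReaderSketch {
    public static void main(String[] args) throws IOException {
        try (DataInputStream in = new DataInputStream(new FileInputStream(args[0]))) {
            String indexClass = in.readUTF();      // written by writer.writeUTF(...) above
            String collectionPath = in.readUTF();  // the original collection path
            int numDocs = in.readInt();            // number of (offset, length) records
            System.out.println(indexClass + " over " + collectionPath + ": " + numDocs + " docs");
            for (int i = 0; i < numDocs; i++) {
                long offset = in.readLong();       // byte offset of document i in the collection
                int length = in.readInt();         // length of document i in bytes
                // offset and length would be used to seek into the collection file
            }
        }
    }
}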

Example 73 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Class CountMedlineCitations, method run.

@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data").create(MAPPING_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    LOG.info("Tool: " + CountMedlineCitations.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);
    Job job = new Job(getConf(), CountMedlineCitations.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(CountMedlineCitations.class);
    job.setNumReduceTasks(0);
    // Pass in the class name as a String; this makes the mapper general: it can load
    // any collection of Indexable objects whose docid/docno mapping is specified by a
    // DocnoMapping object.
    job.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName());
    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);
    job.setInputFormatClass(MedlineCitationInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(MyMapper.class);
    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);
    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    LOG.info("Read " + numDocs + " docs.");
    return numDocs;
}
Also used: Path (org.apache.hadoop.fs.Path), Options (org.apache.commons.cli.Options), GnuParser (org.apache.commons.cli.GnuParser), URI (java.net.URI), HelpFormatter (org.apache.commons.cli.HelpFormatter), CommandLine (org.apache.commons.cli.CommandLine), Counters (org.apache.hadoop.mapreduce.Counters), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException), Job (org.apache.hadoop.mapreduce.Job)
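
The comment in the snippet describes the design: the mapper stays generic because the DocnoMapping implementation is named in the configuration and instantiated reflectively. A sketch of what that might look like on the mapper side, assuming the Cloud9 DocnoMapping interface is on the classpath; this is not the actual MyMapper code.

import org.apache.hadoop.conf.Configuration;
import edu.umd.cloud9.collection.DocnoMapping;

public class DocnoMappingLoaderSketch {
    // Instantiates whatever mapping class the job configuration names, so the
    // same mapper works for Medline, TREC, Wikipedia, and so on.
    public static DocnoMapping loadMapping(Configuration conf) throws Exception {
        // The mapping data file shipped via DistributedCache would still need
        // to be loaded into the returned instance before use.
        String className = conf.get("DocnoMappingClass");
        return (DocnoMapping) Class.forName(className).newInstance();
    }
}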

Example 74 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Class DocumentForwardIndexHttpServer, method run.

@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) forward index path").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data path").create(MAPPING_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    LOG.info("Launching DocumentForwardIndexHttpServer");
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - docno mapping data file: " + mappingFile);
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Random rand = new Random();
    int r = rand.nextInt();
    // This tmp file serves as a rendezvous point.
    Path tmpPath = new Path("/tmp/" + r);
    if (fs.exists(tmpPath)) {
        fs.delete(tmpPath, true);
    }
    Job job = new Job(conf, DocumentForwardIndexHttpServer.class.getSimpleName());
    job.setJarByClass(DocumentForwardIndexHttpServer.class);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx1024m");
    job.getConfiguration().set(INDEX_KEY, indexFile);
    job.getConfiguration().set(DOCNO_MAPPING_KEY, mappingFile);
    job.getConfiguration().set(TMP_KEY, tmpPath.toString());
    job.setNumReduceTasks(0);
    job.setInputFormatClass(NullInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.submit();
    LOG.info("Waiting for server to start up...");
    while (!fs.exists(tmpPath)) {
        Thread.sleep(50000);
        LOG.info("...");
    }
    FSDataInputStream in = fs.open(tmpPath);
    String host = in.readUTF();
    in.close();
    LOG.info("host: " + host);
    LOG.info("port: 8888");
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Options (org.apache.commons.cli.Options), Configuration (org.apache.hadoop.conf.Configuration), GnuParser (org.apache.commons.cli.GnuParser), HelpFormatter (org.apache.commons.cli.HelpFormatter), CommandLine (org.apache.commons.cli.CommandLine), Random (java.util.Random), FileSystem (org.apache.hadoop.fs.FileSystem), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException), Job (org.apache.hadoop.mapreduce.Job)
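
The polling loop above blocks until the rendezvous file appears, then reads the server's hostname back with readUTF(). The counterpart, sketched here as an assumption about what MyMapper does once its HTTP server is up, is a single writeUTF of the local hostname:

import java.net.InetAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RendezvousSketch {
    // Announces this host by writing its name to the agreed-upon tmp file;
    // the client loop in run() unblocks as soon as fs.exists(tmpPath) is true.
    public static void announce(Configuration conf, Path tmpPath) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream out = fs.create(tmpPath, true);
        out.writeUTF(InetAddress.getLocalHost().getHostName());
        out.close();
    }
}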

Example 75 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Class HadoopAlign, method main.

@SuppressWarnings("static-access")
public static void main(String[] args) throws IOException {
    options = new Options();
    options.addOption(OptionBuilder.withDescription("path to XML-formatted parallel corpus").withArgName("path").hasArg().isRequired().create(INPUT_OPTION));
    options.addOption(OptionBuilder.withDescription("path to work/output directory on HDFS").withArgName("path").hasArg().isRequired().create(WORK_OPTION));
    options.addOption(OptionBuilder.withDescription("two-letter source-language code").withArgName("en|de|fr|zh|es|ar|tr").hasArg().isRequired().create(FLANG_OPTION));
    options.addOption(OptionBuilder.withDescription("two-letter target-language code").withArgName("en|de|fr|zh|es|ar|tr").hasArg().isRequired().create(ELANG_OPTION));
    options.addOption(OptionBuilder.withDescription("number of IBM Model 1 iterations").withArgName("positive integer").hasArg().create(MODEL1_OPTION));
    options.addOption(OptionBuilder.withDescription("number of HMM iterations").withArgName("positive integer").hasArg().create(HMM_OPTION));
    options.addOption(OptionBuilder.withDescription("truncate/stem text or not").create(TRUNCATE_OPTION));
    options.addOption(OptionBuilder.withDescription("number of reducers").withArgName("positive integer").hasArg().create(REDUCE_OPTION));
    options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars").withArgName("jar packages").hasArg().create(LIBJARS_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        printUsage();
        System.err.println("Error parsing command line: " + exp.getMessage());
        return;
    }
    String bitextPath = cmdline.getOptionValue(INPUT_OPTION);
    String workDir = cmdline.getOptionValue(WORK_OPTION);
    String srcLang = cmdline.getOptionValue(FLANG_OPTION);
    String trgLang = cmdline.getOptionValue(ELANG_OPTION);
    int model1Iters = cmdline.hasOption(MODEL1_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MODEL1_OPTION)) : 0;
    int hmmIters = cmdline.hasOption(HMM_OPTION) ? Integer.parseInt(cmdline.getOptionValue(HMM_OPTION)) : 0;
    if (model1Iters + hmmIters == 0) {
        System.err.println("Please enter a positive number of iterations for either Model 1 or HMM");
        printUsage();
        return;
    }
    boolean isTruncate = cmdline.hasOption(TRUNCATE_OPTION);
    int numReducers = cmdline.hasOption(REDUCE_OPTION) ? Integer.parseInt(cmdline.getOptionValue(REDUCE_OPTION)) : 50;
    HadoopAlignConfig hac = new HadoopAlignConfig(workDir, trgLang, srcLang, bitextPath,
        model1Iters, hmmIters,
        true,        // use null word
        false,       // use variational Bayes
        isTruncate,  // use word truncation
        0.00f);      // alpha
    hac.setHMMHomogeneous(false);
    hac.set("mapreduce.map.memory.mb", "2048");
    hac.set("mapreduce.map.java.opts", "-Xmx2048m");
    hac.set("mapreduce.reduce.memory.mb", "2048");
    hac.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    hac.setHMMp0(0.2);
    hac.setMaxSentLen(15);
    doAlignment(50, numReducers, hac);
}
Also used: Options (org.apache.commons.cli.Options), CommandLine (org.apache.commons.cli.CommandLine), GnuParser (org.apache.commons.cli.GnuParser), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException)
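
Unlike the earlier examples, the options here are built with isRequired(), so a missing flag surfaces as a ParseException from parse() instead of needing explicit hasOption() checks; that is why this example prints usage in the catch block. A minimal sketch of the behavior (the option name "work" is illustrative):

import org.apache.commons.cli.*;

public class RequiredOptionSketch {
    @SuppressWarnings("static-access")
    public static void main(String[] args) {
        Options options = new Options();
        options.addOption(OptionBuilder.withArgName("path").hasArg().isRequired()
                .withDescription("work directory").create("work"));
        try {
            new GnuParser().parse(options, args);
            System.out.println("all required options present");
        } catch (ParseException exp) {
            // With no arguments this prints e.g. "Missing required option: work"
            System.err.println("Error parsing command line: " + exp.getMessage());
        }
    }
}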

Aggregations

CommandLineParser (org.apache.commons.cli.CommandLineParser): 265 usages
CommandLine (org.apache.commons.cli.CommandLine): 246 usages
Options (org.apache.commons.cli.Options): 206 usages
ParseException (org.apache.commons.cli.ParseException): 186 usages
GnuParser (org.apache.commons.cli.GnuParser): 158 usages
HelpFormatter (org.apache.commons.cli.HelpFormatter): 111 usages
PosixParser (org.apache.commons.cli.PosixParser): 61 usages
Option (org.apache.commons.cli.Option): 52 usages
IOException (java.io.IOException): 48 usages
Path (org.apache.hadoop.fs.Path): 42 usages
File (java.io.File): 41 usages
DefaultParser (org.apache.commons.cli.DefaultParser): 29 usages
Job (org.apache.hadoop.mapreduce.Job): 27 usages
Configuration (org.apache.hadoop.conf.Configuration): 19 usages
FileInputStream (java.io.FileInputStream): 16 usages
Properties (java.util.Properties): 15 usages
ArrayList (java.util.ArrayList): 14 usages
BasicParser (org.apache.commons.cli.BasicParser): 14 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 12 usages
URI (java.net.URI): 10 usages
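
Note that GnuParser, PosixParser, and BasicParser are all deprecated as of Commons CLI 1.3 in favor of DefaultParser (which also appears above, with 29 usages), and Option.builder() replaces the static OptionBuilder. In new code the pattern from these examples reduces to:

import org.apache.commons.cli.*;

public class DefaultParserSketch {
    public static void main(String[] args) throws ParseException {
        Options options = new Options();
        // Option.builder(...) replaces the deprecated static OptionBuilder.
        options.addOption(Option.builder("i").longOpt("input").hasArg()
                .desc("input path").build());
        CommandLine cmdline = new DefaultParser().parse(options, args);
        if (cmdline.hasOption("input")) {
            System.out.println("input: " + cmdline.getOptionValue("input"));
        }
    }
}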