Example 66 with HelpFormatter

Use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

The class TrecForwardIndexBuilder, method run.

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data").create(MAPPING_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    String tmpDir = "tmp-" + TrecForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000);
    Job job = new Job(getConf(), TrecForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    job.setJarByClass(TrecForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(getConf());
    LOG.info("Tool name: " + TrecForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - DocnoMapping file: " + mappingFile);
    LOG.info(" - temp output directory: " + tmpDir);
    job.setNumReduceTasks(1);
    // In local mode the mapping file can be read directly; on a cluster it is
    // shipped to each task through the distributed cache instead.
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker"))) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }
    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);
    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    // delete the output directory if it exists already
    FileSystem.get(getConf()).delete(new Path(tmpDir), true);
    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    String inputFile = tmpDir + "/" + "part-r-00000";
    LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);
        writer.writeLong(offset);
        writer.writeInt(len);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");
    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }
    fs.delete(new Path(tmpDir), true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) URI(java.net.URI) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) Counters(org.apache.hadoop.mapreduce.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Job(org.apache.hadoop.mapreduce.Job)
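
The index file written above has a simple layout: the index class name and the collection path as UTF strings, the document count as an int, then one (long offset, int length) pair per document. A minimal sketch (not part of Cloud9; it reads from a local file for brevity, where the real index would be opened through a Hadoop FileSystem) of how that layout could be read back:

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

public class ForwardIndexReaderSketch {
    public static void main(String[] args) throws IOException {
        try (DataInputStream in = new DataInputStream(new FileInputStream(args[0]))) {
            // Header, mirroring the writeUTF/writeUTF/writeInt calls above.
            String indexClass = in.readUTF();
            String collectionPath = in.readUTF();
            int numDocs = in.readInt();
            System.out.println(indexClass + " over " + collectionPath + ": " + numDocs + " docs");
            // One fixed-size record per document: byte offset, then length.
            for (int i = 0; i < numDocs; i++) {
                long offset = in.readLong();
                int length = in.readInt();
            }
        }
    }
}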

Example 67 with HelpFormatter

Use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

The class CountMedlineCitations, method run.

@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data").create(MAPPING_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    LOG.info("Tool: " + CountMedlineCitations.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);
    Job job = new Job(getConf(), CountMedlineCitations.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(CountMedlineCitations.class);
    job.setNumReduceTasks(0);
    // Pass in the class name as a String; this makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping
    // object.
    job.getConfiguration().set("DocnoMappingClass", MedlineDocnoMapping.class.getCanonicalName());
    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);
    job.setInputFormatClass(MedlineCitationInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(MyMapper.class);
    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);
    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    LOG.info("Read " + numDocs + " docs.");
    return numDocs;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) URI(java.net.URI) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) Counters(org.apache.hadoop.mapreduce.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)
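
On the mapper side, the cached mapping file is usually retrieved in setup(). A hedged sketch (a hypothetical SketchMapper, not the MyMapper used above) with the same deprecated DistributedCache API:

import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SketchMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void setup(Context context) throws IOException {
        // Local paths of the files shipped via DistributedCache.addCacheFile().
        Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (cached == null || cached.length == 0) {
            throw new IOException("Expected the docno mapping file in the distributed cache");
        }
        // cached[0] is the local copy of the mapping data; instantiate the
        // DocnoMapping implementation named by "DocnoMappingClass" and load it here.
    }
}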

Example 68 with HelpFormatter

Use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

The class DocumentForwardIndexHttpServer, method run.

@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) forward index path").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data path").create(MAPPING_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    LOG.info("Launching DocumentForwardIndexHttpServer");
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - docno mapping data file: " + mappingFile);
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Random rand = new Random();
    int r = rand.nextInt();
    // This tmp file serves as a rendezvous point.
    Path tmpPath = new Path("/tmp/" + r);
    if (fs.exists(tmpPath)) {
        fs.delete(tmpPath, true);
    }
    Job job = new Job(conf, DocumentForwardIndexHttpServer.class.getSimpleName());
    job.setJarByClass(DocumentForwardIndexHttpServer.class);
    job.getConfiguration().set("mapred.child.java.opts", "-Xmx1024m");
    job.getConfiguration().set(INDEX_KEY, indexFile);
    job.getConfiguration().set(DOCNO_MAPPING_KEY, mappingFile);
    job.getConfiguration().set(TMP_KEY, tmpPath.toString());
    job.setNumReduceTasks(0);
    job.setInputFormatClass(NullInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.submit();
    LOG.info("Waiting for server to start up...");
    // Poll until the server-side mapper creates the rendezvous file.
    while (!fs.exists(tmpPath)) {
        Thread.sleep(50000);
        LOG.info("...");
    }
    FSDataInputStream in = fs.open(tmpPath);
    String host = in.readUTF();
    in.close();
    LOG.info("host: " + host);
    LOG.info("port: 8888");
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) Random(java.util.Random) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)
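
The run() method above blocks until the rendezvous file appears, then reads a single UTF string from it. A minimal sketch (an assumption about what the server-side mapper does, not the actual MyMapper) of the matching write:

import java.io.IOException;
import java.net.InetAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RendezvousWriteSketch {
    // Called once the embedded HTTP server is listening; creating the tmp
    // file unblocks the fs.exists() polling loop in run().
    public static void signalReady(Configuration conf, String tmpFile) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream out = fs.create(new Path(tmpFile), true);
        out.writeUTF(InetAddress.getLocalHost().getHostName()); // read back via in.readUTF()
        out.close();
    }
}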

Example 69 with HelpFormatter

Use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

The class HadoopAlign, method printUsage.

private static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(HadoopAlign.class.getCanonicalName(), options);
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter)
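
The two-argument printHelp used here accepts all defaults. For reference, a sketch of the richer overload from commons-cli, which adds a header and footer and an auto-generated usage line (the syntax string and footer text are illustrative, not from the project):

import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;

public class UsageSketch {
    static void printUsage(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(100); // wrap help text at 100 columns instead of the default 74
        // printHelp(cmdLineSyntax, header, options, footer, autoUsage)
        formatter.printHelp("hadoop jar cloud9.jar HadoopAlign", "Options:", options,
            "See the project documentation for details.", true);
    }
}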

Example 70 with HelpFormatter

Use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

The class FileMerger, method run.

// TODO: add in hadoop configuration
@Override
public int run(String[] args) throws IOException {
    Options options = new Options();
    options.addOption(HELP_OPTION, false, "print the help message");
    options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg().withDescription("input file or directory").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg().withDescription("output file").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName(INTEGER_INDICATOR).hasArg().withDescription("number of mappers (default to 0 and hence local merge mode, set to positive value to enable cluster merge mode)").create(MAPPER_OPTION));
    options.addOption(OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator().withDescription("assign value for given property").create("D"));
    options.addOption(TEXT_FILE_INPUT_FORMAT, false, "input file in text format (default: sequence format)");
    options.addOption(DELETE_SOURCE_OPTION, false, "delete sources after merging");
    int mapperTasks = 0;
    boolean deleteSource = DELETE_SOURCE;
    boolean textFileFormat = TEXT_FILE_INPUT;
    String inputPath = "";
    String outputPath = "";
    GenericOptionsParser genericOptionsParser = new GenericOptionsParser(args);
    Configuration configuration = genericOptionsParser.getConfiguration();
    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
        CommandLine line = parser.parse(options, args);
        if (line.hasOption(HELP_OPTION)) {
            formatter.printHelp(FileMerger.class.getName(), options);
            System.exit(0);
        }
        if (line.hasOption(INPUT_OPTION)) {
            inputPath = line.getOptionValue(INPUT_OPTION);
        } else {
            throw new ParseException("Parsing failed due to " + INPUT_OPTION + " not initialized...");
        }
        if (line.hasOption(OUTPUT_OPTION)) {
            outputPath = line.getOptionValue(OUTPUT_OPTION);
        } else {
            throw new ParseException("Parsing failed due to " + OUTPUT_OPTION + " not initialized...");
        }
        if (line.hasOption(MAPPER_OPTION)) {
            mapperTasks = Integer.parseInt(line.getOptionValue(MAPPER_OPTION));
            if (mapperTasks <= 0) {
                sLogger.info("Warning: " + MAPPER_OPTION + " is not positive, merge in local model...");
                mapperTasks = 0;
            }
        }
        if (line.hasOption(DELETE_SOURCE_OPTION)) {
            deleteSource = true;
        }
        if (line.hasOption(TEXT_FILE_INPUT_FORMAT)) {
            textFileFormat = true;
        }
    } catch (ParseException pe) {
        System.err.println(pe.getMessage());
        formatter.printHelp(FileMerger.class.getName(), options);
        System.exit(0);
    } catch (NumberFormatException nfe) {
        System.err.println(nfe.getMessage());
        System.exit(0);
    }
    try {
        merge(configuration, inputPath, outputPath, mapperTasks, textFileFormat, deleteSource);
    } catch (InstantiationException ie) {
        ie.printStackTrace();
    } catch (IllegalAccessException iae) {
        iae.printStackTrace();
    }
    return 0;
}
Also used : Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
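
OptionBuilder and GnuParser, used throughout these examples, are deprecated as of commons-cli 1.3; the Aggregations list below shows their replacements (DefaultParser, Option) already appearing elsewhere. A hedged sketch of the same input/output option setup in the newer style:

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class ModernCliSketch {
    public static void main(String[] args) {
        Options options = new Options();
        // Option.builder replaces the static, non-thread-safe OptionBuilder.
        options.addOption(Option.builder("input").argName("path").hasArg()
                .desc("input file or directory").build());
        options.addOption(Option.builder("output").argName("path").hasArg()
                .desc("output file").build());
        try {
            CommandLine line = new DefaultParser().parse(options, args);
            System.out.println("input: " + line.getOptionValue("input"));
        } catch (ParseException pe) {
            new HelpFormatter().printHelp(ModernCliSketch.class.getName(), options);
        }
    }
}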

Aggregations

HelpFormatter (org.apache.commons.cli.HelpFormatter): 273 uses
Options (org.apache.commons.cli.Options): 136 uses
CommandLine (org.apache.commons.cli.CommandLine): 126 uses
CommandLineParser (org.apache.commons.cli.CommandLineParser): 110 uses
ParseException (org.apache.commons.cli.ParseException): 103 uses
GnuParser (org.apache.commons.cli.GnuParser): 92 uses
Path (org.apache.hadoop.fs.Path): 42 uses
PrintWriter (java.io.PrintWriter): 35 uses
Option (org.apache.commons.cli.Option): 29 uses
Job (org.apache.hadoop.mapreduce.Job): 27 uses
Configuration (org.apache.hadoop.conf.Configuration): 21 uses
File (java.io.File): 17 uses
IOException (java.io.IOException): 14 uses
DefaultParser (org.apache.commons.cli.DefaultParser): 13 uses
PosixParser (org.apache.commons.cli.PosixParser): 12 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 12 uses
BasicParser (org.apache.commons.cli.BasicParser): 11 uses
ArrayList (java.util.ArrayList): 8 uses
URI (java.net.URI): 6 uses
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6 uses