Search in sources :

Example 61 with CommandLineParser

use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

the class ExtractWikipediaLinkGraph method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output for edges").create(EDGES_OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output for adjacency list").create(ADJ_OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of partitions").create(NUM_PARTITIONS_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(EDGES_OUTPUT_OPTION) || !cmdline.hasOption(ADJ_OUTPUT_OPTION) || !cmdline.hasOption(NUM_PARTITIONS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    int numPartitions = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS_OPTION));
    task1(cmdline.getOptionValue(INPUT_OPTION), cmdline.getOptionValue(EDGES_OUTPUT_OPTION), numPartitions);
    task2(cmdline.getOptionValue(EDGES_OUTPUT_OPTION), cmdline.getOptionValue(ADJ_OUTPUT_OPTION), numPartitions);
    return 0;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException)

Example 62 with CommandLineParser

use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

the class EncodeBfsGraph method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("nodeid").hasArg().withDescription("source node").create(SRC_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(SRC_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int src = Integer.parseInt(cmdline.getOptionValue(SRC_OPTION));
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - src: " + src);
    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("EncodeBfsGraph[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, SRC_OPTION, src));
    job.setJarByClass(EncodeBfsGraph.class);
    job.setNumReduceTasks(0);
    job.getConfiguration().setInt(SRC_OPTION, src);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);
    job.setMapperClass(MyMapper.class);
    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);
    job.waitForCompletion(true);
    return 0;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 63 with CommandLineParser

use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

the class FindReachableNodes method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("FindReachableNodes:[%s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath));
    job.setJarByClass(FindReachableNodes.class);
    job.setNumReduceTasks(0);
    job.getConfiguration().setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);
    job.setMapperClass(MyMapper.class);
    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);
    job.waitForCompletion(true);
    return 0;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 64 with CommandLineParser

use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

the class IterateBfs method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of partitions").create(NUM_PARTITIONS_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(NUM_PARTITIONS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS_OPTION));
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numPartitions: " + n);
    getConf().set("mapred.child.java.opts", "-Xmx2048m");
    Job job = Job.getInstance(getConf());
    job.setJobName(String.format("IterateBfs[%s: %s, %s: %s, %s: %d]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, NUM_PARTITIONS_OPTION, n));
    job.setJarByClass(EncodeBfsGraph.class);
    job.setNumReduceTasks(n);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BfsNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(BfsNode.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);
    // Delete the output directory if it exists already.
    FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true);
    job.waitForCompletion(true);
    return 0;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 65 with CommandLineParser

use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

the class AnalyzeBigramCount method main.

@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(AnalyzeBigramCount.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    System.out.println("input path: " + inputPath);
    List<PairOfWritables<Text, IntWritable>> bigrams = SequenceFileUtils.readDirectory(new Path(inputPath));
    Collections.sort(bigrams, new Comparator<PairOfWritables<Text, IntWritable>>() {

        public int compare(PairOfWritables<Text, IntWritable> e1, PairOfWritables<Text, IntWritable> e2) {
            if (e2.getRightElement().compareTo(e1.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });
    int singletons = 0;
    int sum = 0;
    for (PairOfWritables<Text, IntWritable> bigram : bigrams) {
        sum += bigram.getRightElement().get();
        if (bigram.getRightElement().get() == 1) {
            singletons++;
        }
    }
    System.out.println("total number of unique bigrams: " + bigrams.size());
    System.out.println("total number of bigrams: " + sum);
    System.out.println("number of bigrams that appear only once: " + singletons);
    System.out.println("\nten most frequent bigrams: ");
    Iterator<PairOfWritables<Text, IntWritable>> iter = Iterators.limit(bigrams.iterator(), 10);
    while (iter.hasNext()) {
        PairOfWritables<Text, IntWritable> bigram = iter.next();
        System.out.println(bigram.getLeftElement() + "\t" + bigram.getRightElement());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) PairOfWritables(tl.lin.data.pair.PairOfWritables) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) IntWritable(org.apache.hadoop.io.IntWritable)

Aggregations

CommandLineParser (org.apache.commons.cli.CommandLineParser)265 CommandLine (org.apache.commons.cli.CommandLine)246 Options (org.apache.commons.cli.Options)206 ParseException (org.apache.commons.cli.ParseException)186 GnuParser (org.apache.commons.cli.GnuParser)158 HelpFormatter (org.apache.commons.cli.HelpFormatter)111 PosixParser (org.apache.commons.cli.PosixParser)61 Option (org.apache.commons.cli.Option)52 IOException (java.io.IOException)48 Path (org.apache.hadoop.fs.Path)42 File (java.io.File)41 DefaultParser (org.apache.commons.cli.DefaultParser)29 Job (org.apache.hadoop.mapreduce.Job)27 Configuration (org.apache.hadoop.conf.Configuration)19 FileInputStream (java.io.FileInputStream)16 Properties (java.util.Properties)15 ArrayList (java.util.ArrayList)14 BasicParser (org.apache.commons.cli.BasicParser)14 FileSystem (org.apache.hadoop.fs.FileSystem)12 URI (java.net.URI)10