Search in sources:

Example 61 with HelpFormatter

use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

In the class SequentialPersonalizedPageRank, method main.

/**
 * Computes personalized PageRank sequentially over a graph read from a local file and prints
 * every vertex with its score.
 *
 * <p>Each input line is split on tabs: the first field is a node and every following field is
 * the destination of an edge leaving that node. The INPUT and SOURCE options are required; when
 * JUMP is absent the random jump factor defaults to 0.15.
 *
 * @param args command-line arguments
 * @throws IOException if the input file cannot be opened or read
 */
@SuppressWarnings({ "static-access" })
public static void main(String[] args) throws IOException {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("val").hasArg().withDescription("random jump factor").create(JUMP));
    options.addOption(OptionBuilder.withArgName("node").hasArg().withDescription("source node (i.e., destination of the random jump)").create(SOURCE));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(SOURCE)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(SequentialPersonalizedPageRank.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String infile = cmdline.getOptionValue(INPUT);
    final String source = cmdline.getOptionValue(SOURCE);
    float alpha = cmdline.hasOption(JUMP) ? Float.parseFloat(cmdline.getOptionValue(JUMP)) : 0.15f;
    int edgeCnt = 0;
    DirectedSparseGraph<String, Integer> graph = new DirectedSparseGraph<String, Integer>();
    BufferedReader data = new BufferedReader(new InputStreamReader(new FileInputStream(infile)));
    try {
        String line;
        while ((line = data.readLine()) != null) {
            // Strings are immutable: the trimmed value must be assigned back to take effect.
            line = line.trim();
            String[] arr = line.split("\\t");
            for (int i = 1; i < arr.length; i++) {
                // The running edge count doubles as the unique edge identifier.
                graph.addEdge(Integer.valueOf(edgeCnt++), arr[0], arr[i]);
            }
        }
    } finally {
        // Release the file handle even when reading fails part-way through.
        data.close();
    }
    if (!graph.containsVertex(source)) {
        System.err.println("Error: source node not found in the graph!");
        System.exit(-1);
    }
    WeakComponentClusterer<String, Integer> clusterer = new WeakComponentClusterer<String, Integer>();
    Set<Set<String>> components = clusterer.transform(graph);
    int numComponents = components.size();
    System.out.println("Number of components: " + numComponents);
    System.out.println("Number of edges: " + graph.getEdgeCount());
    System.out.println("Number of nodes: " + graph.getVertexCount());
    System.out.println("Random jump factor: " + alpha);
    // Compute personalized PageRank: the prior puts all of its mass on the source vertex.
    PageRankWithPriors<String, Integer> ranker = new PageRankWithPriors<String, Integer>(graph, new Transformer<String, Double>() {

        @Override
        public Double transform(String vertex) {
            return vertex.equals(source) ? 1.0 : 0;
        }
    }, alpha);
    ranker.evaluate();
    // Use priority queue to sort vertices by PageRank values.
    PriorityQueue<Ranking<String>> q = new PriorityQueue<Ranking<String>>();
    int i = 0;
    for (String pmid : graph.getVertices()) {
        q.add(new Ranking<String>(i++, ranker.getVertexScore(pmid), pmid));
    }
    // Print PageRank values.
    System.out.println("\nPageRank of nodes, in descending order:");
    Ranking<String> r = null;
    while ((r = q.poll()) != null) {
        System.out.println(r.rankScore + "\t" + r.getRanked());
    }
}
Also used : Options(org.apache.commons.cli.Options) Set(java.util.Set) GnuParser(org.apache.commons.cli.GnuParser) PageRankWithPriors(edu.uci.ics.jung.algorithms.scoring.PageRankWithPriors) HelpFormatter(org.apache.commons.cli.HelpFormatter) Ranking(edu.uci.ics.jung.algorithms.importance.Ranking) CommandLineParser(org.apache.commons.cli.CommandLineParser) DirectedSparseGraph(edu.uci.ics.jung.graph.DirectedSparseGraph) InputStreamReader(java.io.InputStreamReader) PriorityQueue(java.util.PriorityQueue) FileInputStream(java.io.FileInputStream) WeakComponentClusterer(edu.uci.ics.jung.algorithms.cluster.WeakComponentClusterer) CommandLine(org.apache.commons.cli.CommandLine) BufferedReader(java.io.BufferedReader) ParseException(org.apache.commons.cli.ParseException)

Example 62 with HelpFormatter

use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

In the class ComputeCooccurrenceMatrixStripes, method run.

/**
 * Runs this tool: parses the command line, configures the job and waits for it to finish.
 *
 * <p>The INPUT and OUTPUT options are required. WINDOW defaults to 2 and NUM_REDUCERS to 1 when
 * absent. Any existing output directory is deleted before the job is submitted.
 *
 * @param args command-line arguments
 * @return 0 if the job completed successfully; -1 on a command-line error or job failure
 * @throws Exception if job setup or execution fails
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;
    LOG.info("Tool: " + ComputeCooccurrenceMatrixStripes.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);
    Job job = Job.getInstance(getConf());
    job.setJobName(ComputeCooccurrenceMatrixStripes.class.getSimpleName());
    job.setJarByClass(ComputeCooccurrenceMatrixStripes.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);
    job.getConfiguration().setInt("window", window);
    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(HMapStIW.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(HMapStIW.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    long startTime = System.currentTimeMillis();
    // Keep the completion status so a failed job is reported through the exit code.
    boolean succeeded = job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return succeeded ? 0 : -1;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 63 with HelpFormatter

use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

In the class WikipediaPagesBz2InputStream, method main.

/**
 * Reads pages from a Wikipedia XML dump and prints each page's title and docid, separated by a
 * tab, to standard output.
 *
 * <p>INPUT_OPTION is required. LANGUAGE_OPTION is optional and defaults to "en".
 *
 * @param args command-line arguments
 * @throws Exception if the dump cannot be opened or read
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    // NOTE(review): the description says "gzipped" although the class name suggests bz2 input —
    // confirm against the stream implementation before changing the help text.
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("gzipped XML dump file").create(INPUT_OPTION));
    // The value of this option is passed on as the language of the page to create.
    options.addOption(OptionBuilder.withArgName("lang").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(WikipediaPagesBz2InputStream.class.getCanonicalName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String path = cmdline.getOptionValue(INPUT_OPTION);
    String lang = cmdline.hasOption(LANGUAGE_OPTION) ? cmdline.getOptionValue(LANGUAGE_OPTION) : "en";
    WikipediaPage p = WikipediaPageFactory.createWikipediaPage(lang);
    WikipediaPagesBz2InputStream stream = new WikipediaPagesBz2InputStream(path);
    // The same page object is reused for every record; readNext refills it until input ends.
    while (stream.readNext(p)) {
        System.out.println(p.getTitle() + "\t" + p.getDocid());
    }
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException)

Example 64 with HelpFormatter

use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

In the class DumpWikipediaToPlainText, method run.

/**
 * Runs this tool: parses the command line, then configures and runs a map-only job over the
 * given Wikipedia XML dump.
 *
 * <p>INPUT_OPTION and OUTPUT_OPTION are required. LANGUAGE_OPTION is optional, must be exactly
 * two characters long, and defaults to "en". Any existing output directory is deleted before
 * the job is submitted.
 *
 * @param args command-line arguments
 * @return 0 if the job completed successfully; -1 on a command-line error or job failure
 * @throws Exception if job setup or execution fails
 */
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    // Assume "en" by default.
    String language = "en";
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - language: " + language);
    Job job = Job.getInstance(getConf());
    job.setJarByClass(DumpWikipediaToPlainText.class);
    job.setJobName(String.format("DumpWikipediaToPlainText[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, LANGUAGE_OPTION, language));
    job.setNumReduceTasks(0);
    Path outputDir = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, outputDir);
    // The language is never null here: it is either the "en" default or a validated option value.
    job.getConfiguration().set("wiki.language", language);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(outputDir, true);
    // Report a failed job through the exit code instead of always returning success.
    return job.waitForCompletion(true) ? 0 : -1;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 65 with HelpFormatter

use of org.apache.commons.cli.HelpFormatter in project Cloud9 by lintool.

In the class WikipediaDocnoMappingBuilder, method run.

/**
 * Runs this tool: parses the command line, runs a single-reducer job that writes to a temporary
 * directory, then converts that output into the docno mapping file.
 *
 * <p>INPUT_OPTION and OUTPUT_FILE_OPTION are required. LANGUAGE_OPTION is optional and, when
 * given, must be exactly two characters long. KEEP_ALL_OPTION is a flag that selects which page
 * counter (TOTAL instead of ARTICLE) is passed on as the number of entries. The temporary
 * directory is removed whether or not the job succeeds.
 *
 * @param args command-line arguments
 * @return 0 on success; -1 on a command-line error or job failure
 * @throws Exception if job setup, execution or writing the mapping data fails
 */
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file").create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);
    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);
    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikipediaDocnoMappingBuilder.class);
    job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));
    job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    // Only set the language when one was given on the command line.
    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }
    job.setNumReduceTasks(1);
    Path tmpDir = new Path(tmpPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, tmpDir);
    FileOutputFormat.setCompressOutput(job, false);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    FileSystem fs = FileSystem.get(getConf());
    // Delete the temporary output directory if it exists already.
    fs.delete(tmpDir, true);
    try {
        // Stop before reading the part file when the job did not produce usable output.
        if (!job.waitForCompletion(true)) {
            System.err.println("Error: job failed!");
            return -1;
        }
        long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();
        // A single reducer is configured above, so the job output is read from one part file.
        WikipediaDocnoMapping.writeDocnoMappingData(fs, tmpPath + "/part-r-00000", (int) cnt, outputFile);
    } finally {
        // Always remove the temporary directory, including on failure.
        fs.delete(tmpDir, true);
    }
    return 0;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Aggregations

HelpFormatter (org.apache.commons.cli.HelpFormatter)273 Options (org.apache.commons.cli.Options)136 CommandLine (org.apache.commons.cli.CommandLine)126 CommandLineParser (org.apache.commons.cli.CommandLineParser)110 ParseException (org.apache.commons.cli.ParseException)103 GnuParser (org.apache.commons.cli.GnuParser)92 Path (org.apache.hadoop.fs.Path)42 PrintWriter (java.io.PrintWriter)35 Option (org.apache.commons.cli.Option)29 Job (org.apache.hadoop.mapreduce.Job)27 Configuration (org.apache.hadoop.conf.Configuration)21 File (java.io.File)17 IOException (java.io.IOException)14 DefaultParser (org.apache.commons.cli.DefaultParser)13 PosixParser (org.apache.commons.cli.PosixParser)12 FileSystem (org.apache.hadoop.fs.FileSystem)12 BasicParser (org.apache.commons.cli.BasicParser)11 ArrayList (java.util.ArrayList)8 URI (java.net.URI)6 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)6