Search in sources :

Example 61 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class AnalyzeBigramCount method main.

@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(AnalyzeBigramCount.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    System.out.println("input path: " + inputPath);
    List<PairOfWritables<Text, IntWritable>> bigrams = SequenceFileUtils.readDirectory(new Path(inputPath));
    Collections.sort(bigrams, new Comparator<PairOfWritables<Text, IntWritable>>() {

        public int compare(PairOfWritables<Text, IntWritable> e1, PairOfWritables<Text, IntWritable> e2) {
            if (e2.getRightElement().compareTo(e1.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });
    int singletons = 0;
    int sum = 0;
    for (PairOfWritables<Text, IntWritable> bigram : bigrams) {
        sum += bigram.getRightElement().get();
        if (bigram.getRightElement().get() == 1) {
            singletons++;
        }
    }
    System.out.println("total number of unique bigrams: " + bigrams.size());
    System.out.println("total number of bigrams: " + sum);
    System.out.println("number of bigrams that appear only once: " + singletons);
    System.out.println("\nten most frequent bigrams: ");
    Iterator<PairOfWritables<Text, IntWritable>> iter = Iterators.limit(bigrams.iterator(), 10);
    while (iter.hasNext()) {
        PairOfWritables<Text, IntWritable> bigram = iter.next();
        System.out.println(bigram.getLeftElement() + "\t" + bigram.getRightElement());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) PairOfWritables(tl.lin.data.pair.PairOfWritables) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) IntWritable(org.apache.hadoop.io.IntWritable)

Example 62 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class AnalyzeBigramRelativeFrequencyJson method main.

@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(AnalyzeBigramRelativeFrequencyJson.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    System.out.println("input path: " + inputPath);
    List<PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable>> pairs = SequenceFileUtils.readDirectory(new Path(inputPath));
    List<PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable>> list1 = Lists.newArrayList();
    List<PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable>> list2 = Lists.newArrayList();
    for (PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable> p : pairs) {
        BigramRelativeFrequencyJson.MyTuple bigram = p.getLeftElement();
        if (bigram.getJsonObject().get("Left").getAsString().equals("light")) {
            list1.add(p);
        }
        if (bigram.getJsonObject().get("Left").getAsString().equals("contain")) {
            list2.add(p);
        }
    }
    Collections.sort(list1, new Comparator<PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable>>() {

        public int compare(PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable> e1, PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });
    Iterator<PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable>> iter1 = Iterators.limit(list1.iterator(), 10);
    while (iter1.hasNext()) {
        PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable> p = iter1.next();
        BigramRelativeFrequencyJson.MyTuple bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
    }
    Collections.sort(list2, new Comparator<PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable>>() {

        public int compare(PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable> e1, PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });
    Iterator<PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable>> iter2 = Iterators.limit(list2.iterator(), 10);
    while (iter2.hasNext()) {
        PairOfWritables<BigramRelativeFrequencyJson.MyTuple, FloatWritable> p = iter2.next();
        BigramRelativeFrequencyJson.MyTuple bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FloatWritable(org.apache.hadoop.io.FloatWritable) PairOfWritables(tl.lin.data.pair.PairOfWritables) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException)

Example 63 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class SequentialPersonalizedPageRank method main.

@SuppressWarnings({ "static-access" })
public static void main(String[] args) throws IOException {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("val").hasArg().withDescription("random jump factor").create(JUMP));
    options.addOption(OptionBuilder.withArgName("node").hasArg().withDescription("source node (i.e., destination of the random jump)").create(SOURCE));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(SOURCE)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(SequentialPersonalizedPageRank.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String infile = cmdline.getOptionValue(INPUT);
    final String source = cmdline.getOptionValue(SOURCE);
    float alpha = cmdline.hasOption(JUMP) ? Float.parseFloat(cmdline.getOptionValue(JUMP)) : 0.15f;
    int edgeCnt = 0;
    DirectedSparseGraph<String, Integer> graph = new DirectedSparseGraph<String, Integer>();
    BufferedReader data = new BufferedReader(new InputStreamReader(new FileInputStream(infile)));
    String line;
    while ((line = data.readLine()) != null) {
        line.trim();
        String[] arr = line.split("\\t");
        for (int i = 1; i < arr.length; i++) {
            graph.addEdge(new Integer(edgeCnt++), arr[0], arr[i]);
        }
    }
    data.close();
    if (!graph.containsVertex(source)) {
        System.err.println("Error: source node not found in the graph!");
        System.exit(-1);
    }
    WeakComponentClusterer<String, Integer> clusterer = new WeakComponentClusterer<String, Integer>();
    Set<Set<String>> components = clusterer.transform(graph);
    int numComponents = components.size();
    System.out.println("Number of components: " + numComponents);
    System.out.println("Number of edges: " + graph.getEdgeCount());
    System.out.println("Number of nodes: " + graph.getVertexCount());
    System.out.println("Random jump factor: " + alpha);
    // Compute personalized PageRank.
    PageRankWithPriors<String, Integer> ranker = new PageRankWithPriors<String, Integer>(graph, new Transformer<String, Double>() {

        @Override
        public Double transform(String vertex) {
            return vertex.equals(source) ? 1.0 : 0;
        }
    }, alpha);
    ranker.evaluate();
    // Use priority queue to sort vertices by PageRank values.
    PriorityQueue<Ranking<String>> q = new PriorityQueue<Ranking<String>>();
    int i = 0;
    for (String pmid : graph.getVertices()) {
        q.add(new Ranking<String>(i++, ranker.getVertexScore(pmid), pmid));
    }
    // Print PageRank values.
    System.out.println("\nPageRank of nodes, in descending order:");
    Ranking<String> r = null;
    while ((r = q.poll()) != null) {
        System.out.println(r.rankScore + "\t" + r.getRanked());
    }
}
Also used : Options(org.apache.commons.cli.Options) Set(java.util.Set) GnuParser(org.apache.commons.cli.GnuParser) PageRankWithPriors(edu.uci.ics.jung.algorithms.scoring.PageRankWithPriors) HelpFormatter(org.apache.commons.cli.HelpFormatter) Ranking(edu.uci.ics.jung.algorithms.importance.Ranking) CommandLineParser(org.apache.commons.cli.CommandLineParser) DirectedSparseGraph(edu.uci.ics.jung.graph.DirectedSparseGraph) InputStreamReader(java.io.InputStreamReader) PriorityQueue(java.util.PriorityQueue) FileInputStream(java.io.FileInputStream) WeakComponentClusterer(edu.uci.ics.jung.algorithms.cluster.WeakComponentClusterer) CommandLine(org.apache.commons.cli.CommandLine) BufferedReader(java.io.BufferedReader) ParseException(org.apache.commons.cli.ParseException)

Example 64 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class ComputeCooccurrenceMatrixStripes method run.

/**
   * Runs this tool.
   */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;
    LOG.info("Tool: " + ComputeCooccurrenceMatrixStripes.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);
    Job job = Job.getInstance(getConf());
    job.setJobName(ComputeCooccurrenceMatrixStripes.class.getSimpleName());
    job.setJarByClass(ComputeCooccurrenceMatrixStripes.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);
    job.getConfiguration().setInt("window", window);
    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(HMapStIW.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(HMapStIW.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 65 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class WikipediaPagesBz2InputStream method main.

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("gzipped XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("lang").hasArg().withDescription("output location").create(LANGUAGE_OPTION));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(WikipediaPagesBz2InputStream.class.getCanonicalName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String path = cmdline.getOptionValue(INPUT_OPTION);
    String lang = cmdline.hasOption(LANGUAGE_OPTION) ? cmdline.getOptionValue(LANGUAGE_OPTION) : "en";
    WikipediaPage p = WikipediaPageFactory.createWikipediaPage(lang);
    WikipediaPagesBz2InputStream stream = new WikipediaPagesBz2InputStream(path);
    while (stream.readNext(p)) {
        System.out.println(p.getTitle() + "\t" + p.getDocid());
    }
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException)

Aggregations

GnuParser (org.apache.commons.cli.GnuParser)208 CommandLine (org.apache.commons.cli.CommandLine)187 Options (org.apache.commons.cli.Options)165 CommandLineParser (org.apache.commons.cli.CommandLineParser)158 ParseException (org.apache.commons.cli.ParseException)139 HelpFormatter (org.apache.commons.cli.HelpFormatter)92 Path (org.apache.hadoop.fs.Path)40 Option (org.apache.commons.cli.Option)39 IOException (java.io.IOException)32 Job (org.apache.hadoop.mapreduce.Job)27 File (java.io.File)24 Configuration (org.apache.hadoop.conf.Configuration)19 FileInputStream (java.io.FileInputStream)14 ArrayList (java.util.ArrayList)14 Properties (java.util.Properties)13 FileSystem (org.apache.hadoop.fs.FileSystem)11 MissingArgumentException (org.apache.commons.cli.MissingArgumentException)9 FileNotFoundException (java.io.FileNotFoundException)7 URI (java.net.URI)7 URISyntaxException (java.net.URISyntaxException)6