Search in sources :

Example 51 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class LocalClusteringDriver method main.

//  private static final String input="points_input";
/**
 * Samples points from a randomly generated univariate Gaussian mixture, writes the
 * points and the initial cluster parameters under the output path, and then fits a
 * mixture model to the points with EM.
 *
 * <p>The OUTPUT option is required. POINTS defaults to 100000 and COMPONENTS to 3 when
 * not given. With KMEANS set, the initial model is seeded from k-means cluster means.
 * Usage is printed and the process exits with -1 when the command line cannot be
 * parsed, when HELP is requested, or when OUTPUT is missing.
 *
 * @param args command-line arguments
 */
@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
    Options options = new Options();
    options.addOption(new Option(KMEANS, "initialize with k-means"));
    options.addOption(new Option(HELP, "display help options"));
    // POINTS and COMPONENTS are parsed as integers below, so describe them as counts.
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of points").create(POINTS));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of components").create(COMPONENTS));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("result path").create(OUTPUT));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        // Without a parsed command line nothing below can run; continuing would
        // dereference a null cmdline.
        System.exit(-1);
    }
    // A help request and a missing output path are handled identically.
    if (cmdline.hasOption(HELP) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(LocalClusteringDriver.class.getName(), options);
        System.exit(-1);
    }
    int numComponents = cmdline.hasOption(COMPONENTS) ? Integer.parseInt(cmdline.getOptionValue(COMPONENTS)) : 3;
    int numPoints = cmdline.hasOption(POINTS) ? Integer.parseInt(cmdline.getOptionValue(POINTS)) : 100000;
    String output = cmdline.getOptionValue(OUTPUT);
    System.out.println(output);
    System.out.println("Number of points: " + numPoints);
    System.out.println("Number of components in mixture: " + numComponents);
    // Build the source mixture with random parameters and weights, then normalize the weights.
    UnivariateGaussianMixtureModel sourceModel = new UnivariateGaussianMixtureModel(numComponents);
    for (int i = 0; i < numComponents; i++) {
        PVector param = new PVector(2);
        param.array[0] = RANDOM.nextInt(100);
        param.array[1] = RANDOM.nextFloat() * 3;
        sourceModel.param[i] = param;
        sourceModel.weight[i] = RANDOM.nextInt(10) + 1;
    }
    sourceModel.normalizeWeights();
    System.out.println("Initial mixture model:\n" + sourceModel + "\n");
    // Draw points from initial mixture model and compute the n clusters
    Point[] points = sourceModel.drawRandomPoints(numPoints);
    UnivariateGaussianMixtureModel learnedModel = null;
    if (cmdline.hasOption(KMEANS)) {
        System.out.println("Running k-means to initialize clusters...");
        List<Point>[] clusters = KMeans.run(points, numComponents);
        double[] means = new double[numComponents];
        int cnt = 0;
        for (List<Point> cluster : clusters) {
            double tmp = 0.0;
            for (Point p : cluster) {
                tmp += p.value;
            }
            means[cnt] = tmp / cluster.size();
            cnt++;
        }
        System.out.println("Cluster means: " + Arrays.toString(means) + "\n");
        learnedModel = ExpectationMaximization.initialize(points, means);
    } else {
        learnedModel = ExpectationMaximization.initialize(points, numComponents);
    }
    Path outputPoi = new Path(output);
    try {
        FileSystem fs = FileSystem.get(new Configuration());
        // Remove any previous output before writing the new files.
        fs.delete(outputPoi, true);
        // One point value per line.
        FSDataOutputStream pointfile = fs.create(new Path(output + "/points"));
        try {
            for (int i = 0; i < numPoints; i++) {
                pointfile.write((Double.toString(points[i].value) + "\n").getBytes());
            }
            pointfile.flush();
        } finally {
            // Close even when a write fails so the stream is not leaked.
            pointfile.close();
        }
        // One line per component: index, weight, and the two parameter values.
        FSDataOutputStream clusterfile = fs.create(new Path(output + "/cluster0"));
        try {
            for (int i = 0; i < numComponents; i++) {
                clusterfile.write((i + " " + Double.toString(learnedModel.weight[i]) + " " + learnedModel.param[i].array[0] + " " + learnedModel.param[i].array[1] + "\n").getBytes());
            }
            clusterfile.flush();
        } finally {
            clusterfile.close();
        }
    } catch (IOException exp) {
        // Writing the files is best-effort: report the failure and still run EM.
        exp.printStackTrace();
    }
    System.out.println("** Ready to run EM **\n");
    System.out.println("Initial mixture model:\n" + learnedModel + "\n");
    learnedModel = ExpectationMaximization.run(points, learnedModel);
    System.out.println("Mixure model estimated using EM: \n" + learnedModel + "\n");
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) IOException(java.io.IOException) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) Option(org.apache.commons.cli.Option) List(java.util.List) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)

Example 52 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class BooleanRetrieval method run.

/**
 * Runs this tool: loads the index and collection named on the command line and
 * evaluates a fixed set of sample queries, printing each query before running it.
 *
 * <p>Both the INDEX and COLLECTION options are required. The process exits with -1
 * when the command line cannot be parsed, when a required option is missing, or when
 * the collection path ends in {@code .gz}.
 *
 * @param args command-line arguments
 * @return 0 once all queries have been run
 * @throws Exception if initialization or query evaluation fails
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("index path").create(INDEX));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("collection path").create(COLLECTION));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(BooleanRetrieval.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String indexPath = cmdline.getOptionValue(INDEX);
    String collectionPath = cmdline.getOptionValue(COLLECTION);
    if (collectionPath.endsWith(".gz")) {
        // A gzipped collection is rejected, so point the user at the uncompressed one.
        System.out.println("gzipped collection is not seekable: use uncompressed version!");
        System.exit(-1);
    }
    FileSystem fs = FileSystem.get(new Configuration());
    initialize(indexPath, collectionPath, fs);
    String[] queries = { "outrageous fortune AND", "white rose AND", "means deceit AND", "white red OR rose AND pluck AND", "unhappy outrageous OR good your AND OR fortune AND" };
    for (String q : queries) {
        System.out.println("Query: " + q);
        runQuery(q);
        System.out.println("");
    }
    // Zero signals success, following the usual exit-code convention.
    return 0;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException)

Example 53 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class BuildInvertedIndex method run.

/**
 * Runs this tool: configures and submits the inverted-index job, then waits for it
 * to finish.
 *
 * <p>The INPUT and OUTPUT options are required; NUM_REDUCERS defaults to 1. Any
 * existing output directory is deleted before the job is submitted.
 *
 * @param args command-line arguments
 * @return 0 if the job succeeded; -1 if the command line is invalid or the job failed
 * @throws Exception if job setup or execution throws
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    LOG.info("Tool name: " + BuildInvertedIndex.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);
    Job job = Job.getInstance(getConf());
    job.setJobName(BuildInvertedIndex.class.getSimpleName());
    job.setJarByClass(BuildInvertedIndex.class);
    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfInts.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfWritables.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    boolean succeeded = job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    // Report a failed job to the caller instead of always claiming success.
    return succeeded ? 0 : -1;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 54 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class BuildPageRankRecords method run.

/**
 * Runs this tool: configures and submits the map-only job that builds PageRank
 * records from the input, then waits for it to finish.
 *
 * <p>The INPUT, OUTPUT and NUM_NODES options are all required. The node count is
 * stored in the job configuration under NODE_CNT_FIELD. Any existing output
 * directory is deleted before the job is submitted.
 *
 * @param args command-line arguments
 * @return 0 if the job succeeded; -1 if the command line is invalid or the job failed
 * @throws Exception if job setup or execution throws
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    LOG.info("Tool name: " + BuildPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    // Minimum split size of 1 GiB.
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    Job job = Job.getInstance(conf);
    job.setJobName(BuildPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPageRankRecords.class);
    // Map-only job: no reducers.
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);
    job.setMapperClass(MyMapper.class);
    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);
    // Report a failed job to the caller instead of always claiming success.
    return job.waitForCompletion(true) ? 0 : -1;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Example 55 with GnuParser

use of org.apache.commons.cli.GnuParser in project Cloud9 by lintool.

the class DumpPageRankRecordsToPlainText method run.

/**
 * Runs this tool: configures and submits the map-only job that reads PageRank
 * records from sequence files and writes them out as plain text, then waits for it
 * to finish.
 *
 * <p>The INPUT and OUTPUT options are required. Any existing output directory is
 * deleted before the job is submitted.
 *
 * @param args command-line arguments
 * @return 0 if the job succeeded; -1 if the command line is invalid or the job failed
 * @throws Exception if job setup or execution throws
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    LOG.info("Tool name: " + DumpPageRankRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);
    // NOTE(review): a fresh Configuration is built here, so settings held by the
    // surrounding tool (if any) are not applied to the job — confirm this is intended.
    Configuration conf = new Configuration();
    // Minimum split size of 1 GiB.
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    Job job = Job.getInstance(conf);
    job.setJobName(DumpPageRankRecordsToPlainText.class.getSimpleName());
    job.setJarByClass(DumpPageRankRecordsToPlainText.class);
    // Map-only job: no reducers.
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);
    // Report a failed job to the caller instead of always claiming success.
    return job.waitForCompletion(true) ? 0 : -1;
}
Also used : HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)

Aggregations

GnuParser (org.apache.commons.cli.GnuParser)208 CommandLine (org.apache.commons.cli.CommandLine)187 Options (org.apache.commons.cli.Options)165 CommandLineParser (org.apache.commons.cli.CommandLineParser)158 ParseException (org.apache.commons.cli.ParseException)139 HelpFormatter (org.apache.commons.cli.HelpFormatter)92 Path (org.apache.hadoop.fs.Path)40 Option (org.apache.commons.cli.Option)39 IOException (java.io.IOException)32 Job (org.apache.hadoop.mapreduce.Job)27 File (java.io.File)24 Configuration (org.apache.hadoop.conf.Configuration)19 FileInputStream (java.io.FileInputStream)14 ArrayList (java.util.ArrayList)14 Properties (java.util.Properties)13 FileSystem (org.apache.hadoop.fs.FileSystem)11 MissingArgumentException (org.apache.commons.cli.MissingArgumentException)9 FileNotFoundException (java.io.FileNotFoundException)7 URI (java.net.URI)7 URISyntaxException (java.net.URISyntaxException)6