
Example 51 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Source: class BigramCount, method run().

/**
   * Runs this tool.
   */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    LOG.info("Tool name: " + BigramCount.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);
    Job job = Job.getInstance(getConf());
    job.setJobName(BigramCount.class.getSimpleName());
    job.setJarByClass(BigramCount.class);
    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
Also used: HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)
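
Note that GnuParser and OptionBuilder, used throughout these examples, have been deprecated since Commons CLI 1.3. Below is a minimal sketch of the equivalent option setup on a current Commons CLI release, using DefaultParser and Option.builder(); the INPUT, OUTPUT, and NUM_REDUCERS constants are stand-ins, since the actual values of the Cloud9 constants are not shown in this listing.

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class ModernCliSketch {
    // Stand-in option names; the Cloud9 constants' actual values are not shown above.
    static final String INPUT = "input";
    static final String OUTPUT = "output";
    static final String NUM_REDUCERS = "numReducers";

    public static void main(String[] args) {
        Options options = new Options();
        options.addOption(Option.builder(INPUT).argName("path").hasArg().desc("input path").build());
        options.addOption(Option.builder(OUTPUT).argName("path").hasArg().desc("output path").build());
        options.addOption(Option.builder(NUM_REDUCERS).argName("num").hasArg().desc("number of reducers").build());
        // DefaultParser replaces the deprecated GnuParser/PosixParser.
        CommandLineParser parser = new DefaultParser();
        try {
            CommandLine cmdline = parser.parse(options, args);
            System.out.println("input = " + cmdline.getOptionValue(INPUT, "(unset)"));
        } catch (ParseException exp) {
            System.err.println("Error parsing command line: " + exp.getMessage());
        }
    }
}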

Example 52 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Source: class BigramRelativeFrequencyJson, method run().

/**
   * Runs this tool.
   */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    LOG.info("Tool name: " + BigramRelativeFrequencyJson.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);
    Job job = Job.getInstance(getConf());
    job.setJobName(BigramRelativeFrequencyJson.class.getSimpleName());
    job.setJarByClass(BigramRelativeFrequencyJson.class);
    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setMapOutputKeyClass(MyTuple.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(MyTuple.class);
    job.setOutputValueClass(FloatWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
Also used: HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)
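
The job above writes its output as a SequenceFile of MyTuple/FloatWritable pairs. Here is a minimal sketch, not part of Cloud9, for dumping such output: it recovers the key and value classes from the file header via ReflectionUtils, so it works for project-specific types like MyTuple as long as they are on the classpath. The part-file name in the comment is an assumption based on standard reducer output naming.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // e.g. <outputPath>/part-r-00000 (assumed standard reducer naming)
        Path part = new Path(args[0]);
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(part))) {
            // Instantiate whatever key/value classes the file declares.
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        }
    }
}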

Example 53 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Source: class BigramRelativeFrequencyTuple, method run().

/**
   * Runs this tool.
   */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    LOG.info("Tool name: " + BigramRelativeFrequencyTuple.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);
    Job job = Job.getInstance(getConf());
    job.setJobName(BigramRelativeFrequencyTuple.class.getSimpleName());
    job.setJarByClass(BigramRelativeFrequencyTuple.class);
    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setMapOutputKeyClass(BinSedesTuple.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(BinSedesTuple.class);
    job.setOutputValueClass(FloatWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);
    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
Also used: HelpFormatter(org.apache.commons.cli.HelpFormatter) Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) CommandLine(org.apache.commons.cli.CommandLine) GnuParser(org.apache.commons.cli.GnuParser) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Job(org.apache.hadoop.mapreduce.Job)
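
Both relative-frequency jobs register a custom MyPartitioner whose source is not part of this listing. In the usual order-inversion implementation of relative frequencies, the partitioner hashes only the left element of the bigram, so every (a, *) pair, including the marginal, reaches the same reducer. The sketch below is hypothetical: it assumes a plain Text key of the form "leftWord rightWord" rather than Cloud9's pair and tuple key types.

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class LeftWordPartitioner extends Partitioner<Text, FloatWritable> {
    @Override
    public int getPartition(Text key, FloatWritable value, int numReduceTasks) {
        // Partition on the left word only, so all bigrams sharing it
        // co-locate on one reducer; mask the sign bit before the modulo.
        String leftWord = key.toString().split("\\s+")[0];
        return (leftWord.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}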

Example 54 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Source: class IterateGMM, method run().

/**
   * Runs this tool.
   */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String inputPath0 = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    LOG.info("Tool: " + IterateGMM.class.getSimpleName());
    LOG.info(" - input path: " + inputPath0);
    String inputPath = inputPath0 + "/points";
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    int iterations = 0;
    Configuration conf = getConf();
    while (iterations == 0 || !FinishIteration(inputPath0, iterations, conf)) {
        LOG.info("** iterations: " + iterations);
        try {
            Job job = Job.getInstance(conf);
            job.setJobName(IterateGMM.class.getSimpleName());
            job.setJarByClass(IterateGMM.class);
            // set the path of the information of k clusters in this iteration
            job.getConfiguration().set("clusterpath", inputPath0 + "/cluster" + iterations);
            job.setNumReduceTasks(reduceTasks);
            FileInputFormat.setInputPaths(job, new Path(inputPath));
            FileOutputFormat.setOutputPath(job, new Path(outputPath));
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(PairOfStrings.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            job.setPartitionerClass(MyPartitioner.class);
            // Delete the output directory if it exists already.
            Path outputDir = new Path(outputPath);
            FileSystem.get(getConf()).delete(outputDir, true);
            long startTime = System.currentTimeMillis();
            job.waitForCompletion(true);
            LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
            reNameFile(inputPath0, outputPath, iterations + 1, conf, reduceTasks);
        } catch (Exception exp) {
            exp.printStackTrace();
        }
        iterations++;
    }
    return 0;
}
Also used: Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) IOException(java.io.IOException) ParseException(org.apache.commons.cli.ParseException) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) CommandLineParser(org.apache.commons.cli.CommandLineParser) Job(org.apache.hadoop.mapreduce.Job)
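
FinishIteration and reNameFile are project helpers whose bodies are not shown here. As a rough illustration only, a convergence test could compare the component parameters of successive iterations; the sketch below assumes the clusterN file layout written by LocalClusteringDriver in Example 55 (one line per component: id, weight, mean, variance).

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ConvergenceCheck {
    // Returns true when no parameter moved more than epsilon between
    // iteration (iter - 1) and iter. Hypothetical; not the Cloud9 code.
    static boolean converged(String base, int iter, Configuration conf, double epsilon)
            throws Exception {
        FileSystem fs = FileSystem.get(conf);
        Path prev = new Path(base + "/cluster" + (iter - 1));
        Path curr = new Path(base + "/cluster" + iter);
        if (!fs.exists(curr)) {
            return false; // current iteration has not produced clusters yet
        }
        try (BufferedReader r1 = new BufferedReader(new InputStreamReader(fs.open(prev)));
             BufferedReader r2 = new BufferedReader(new InputStreamReader(fs.open(curr)))) {
            String a, b;
            while ((a = r1.readLine()) != null && (b = r2.readLine()) != null) {
                String[] p = a.trim().split("\\s+");
                String[] q = b.trim().split("\\s+");
                // Start at 1 to skip the id column.
                for (int i = 1; i < Math.min(p.length, q.length); i++) {
                    if (Math.abs(Double.parseDouble(p[i]) - Double.parseDouble(q[i])) > epsilon) {
                        return false;
                    }
                }
            }
        }
        return true;
    }
}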

Example 55 with CommandLineParser

Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool.

Source: class LocalClusteringDriver, method main().

//  private static final String input="points_input";
@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
    Options options = new Options();
    options.addOption(new Option(KMEANS, "initialize with k-means"));
    options.addOption(new Option(HELP, "display help options"));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("input path").create(POINTS));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("output path").create(COMPONENTS));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("result path").create(OUTPUT));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        // Bail out here; continuing would dereference a null cmdline below.
        System.exit(-1);
    }
    if (cmdline.hasOption(HELP) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(LocalClusteringDriver.class.getName(), options);
        System.exit(-1);
    }
    int numComponents = cmdline.hasOption(COMPONENTS) ? Integer.parseInt(cmdline.getOptionValue(COMPONENTS)) : 3;
    int numPoints = cmdline.hasOption(POINTS) ? Integer.parseInt(cmdline.getOptionValue(POINTS)) : 100000;
    String output = cmdline.getOptionValue(OUTPUT);
    System.out.println(output);
    System.out.println("Number of points: " + numPoints);
    System.out.println("Number of components in mixture: " + numComponents);
    UnivariateGaussianMixtureModel sourceModel = new UnivariateGaussianMixtureModel(numComponents);
    for (int i = 0; i < numComponents; i++) {
        PVector param = new PVector(2);
        param.array[0] = RANDOM.nextInt(100);
        param.array[1] = RANDOM.nextFloat() * 3;
        sourceModel.param[i] = param;
        sourceModel.weight[i] = RANDOM.nextInt(10) + 1;
    }
    sourceModel.normalizeWeights();
    System.out.println("Initial mixture model:\n" + sourceModel + "\n");
    // Draw points from initial mixture model and compute the n clusters
    Point[] points = sourceModel.drawRandomPoints(numPoints);
    UnivariateGaussianMixtureModel learnedModel = null;
    if (cmdline.hasOption(KMEANS)) {
        System.out.println("Running k-means to initialize clusters...");
        List<Point>[] clusters = KMeans.run(points, numComponents);
        double[] means = new double[numComponents];
        int cnt = 0;
        for (List<Point> cluster : clusters) {
            double tmp = 0.0;
            for (Point p : cluster) {
                tmp += p.value;
            }
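            // Assumes k-means returned no empty clusters; an empty
            // cluster would make this mean NaN (division by zero).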
            means[cnt] = tmp / cluster.size();
            cnt++;
        }
        System.out.println("Cluster means: " + Arrays.toString(means) + "\n");
        learnedModel = ExpectationMaximization.initialize(points, means);
    } else {
        learnedModel = ExpectationMaximization.initialize(points, numComponents);
    }
    Path outputPoi = new Path(output);
    try {
        FileSystem fs = FileSystem.get(new Configuration());
        fs.delete(outputPoi, true);
        FSDataOutputStream pointfile = fs.create(new Path(output + "/points"));
        for (int i = 0; i < numPoints; i++) {
            pointfile.write((Double.toString(points[i].value) + "\n").getBytes());
        }
        pointfile.flush();
        pointfile.close();
        FSDataOutputStream clusterfile = fs.create(new Path(output + "/cluster0"));
        for (int i = 0; i < numComponents; i++) {
            clusterfile.write((i + " " + Double.toString(learnedModel.weight[i]) + " " + learnedModel.param[i].array[0] + " " + learnedModel.param[i].array[1] + "\n").getBytes());
        }
        clusterfile.flush();
        clusterfile.close();
    } catch (IOException exp) {
        exp.printStackTrace();
    }
    System.out.println("** Ready to run EM **\n");
    System.out.println("Initial mixture model:\n" + learnedModel + "\n");
    learnedModel = ExpectationMaximization.run(points, learnedModel);
    System.out.println("Mixure model estimated using EM: \n" + learnedModel + "\n");
}
Also used: Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) GnuParser(org.apache.commons.cli.GnuParser) IOException(java.io.IOException) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) Option(org.apache.commons.cli.Option) List(java.util.List) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)
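
drawRandomPoints and ExpectationMaximization come from the mixture-model library bundled with Cloud9, and their implementations are not shown above. For intuition, here is an illustrative sketch of what sampling a univariate mixture involves: pick a component with probability proportional to its (normalized) weight, then draw from that component's Gaussian. Names and signatures here are hypothetical.

import java.util.Random;

public class MixtureSampler {
    // weights are assumed normalized (cf. normalizeWeights() above).
    public static double[] draw(double[] weights, double[] means,
                                double[] variances, int n, long seed) {
        Random rng = new Random(seed);
        double[] samples = new double[n];
        for (int s = 0; s < n; s++) {
            // Invert the cumulative weight distribution to pick a component.
            double u = rng.nextDouble();
            double cum = 0.0;
            int k = weights.length - 1; // fallback guards against floating-point shortfall
            for (int i = 0; i < weights.length; i++) {
                cum += weights[i];
                if (u <= cum) {
                    k = i;
                    break;
                }
            }
            // Draw from N(mean_k, variance_k).
            samples[s] = means[k] + Math.sqrt(variances[k]) * rng.nextGaussian();
        }
        return samples;
    }
}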

Aggregations

CommandLineParser (org.apache.commons.cli.CommandLineParser): 265 usages
CommandLine (org.apache.commons.cli.CommandLine): 246 usages
Options (org.apache.commons.cli.Options): 206 usages
ParseException (org.apache.commons.cli.ParseException): 186 usages
GnuParser (org.apache.commons.cli.GnuParser): 158 usages
HelpFormatter (org.apache.commons.cli.HelpFormatter): 111 usages
PosixParser (org.apache.commons.cli.PosixParser): 61 usages
Option (org.apache.commons.cli.Option): 52 usages
IOException (java.io.IOException): 48 usages
Path (org.apache.hadoop.fs.Path): 42 usages
File (java.io.File): 41 usages
DefaultParser (org.apache.commons.cli.DefaultParser): 29 usages
Job (org.apache.hadoop.mapreduce.Job): 27 usages
Configuration (org.apache.hadoop.conf.Configuration): 19 usages
FileInputStream (java.io.FileInputStream): 16 usages
Properties (java.util.Properties): 15 usages
ArrayList (java.util.ArrayList): 14 usages
BasicParser (org.apache.commons.cli.BasicParser): 14 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 12 usages
URI (java.net.URI): 10 usages