Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool: the run method of the BigramCount class.
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
  options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String inputPath = cmdline.getOptionValue(INPUT);
  String outputPath = cmdline.getOptionValue(OUTPUT);
  int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

  LOG.info("Tool name: " + BigramCount.class.getSimpleName());
  LOG.info(" - input path: " + inputPath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - num reducers: " + reduceTasks);

  Job job = Job.getInstance(getConf());
  job.setJobName(BigramCount.class.getSimpleName());
  job.setJarByClass(BigramCount.class);
  job.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setCombinerClass(MyReducer.class);
  job.setReducerClass(MyReducer.class);

  // Delete the output directory if it exists already.
  Path outputDir = new Path(outputPath);
  FileSystem.get(getConf()).delete(outputDir, true);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
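BigramCount's run(String[]) signature and its call to ToolRunner.printGenericCommandUsage suggest it follows Hadoop's Tool pattern, so it would normally be launched through ToolRunner. The sketch below is a hypothetical launcher, not code from the project: the option flag strings ("input", "output", "numReducers") and the paths are assumptions standing in for the INPUT, OUTPUT, and NUM_REDUCERS constants, whose values are not shown above.

// Hypothetical launcher sketch; flag names and paths are assumptions.
import org.apache.hadoop.util.ToolRunner;

public class BigramCountLauncher {
  public static void main(String[] args) throws Exception {
    String[] toolArgs = {
        "-input", "data/bigram-input",   // assumed flag name and sample path
        "-output", "bigram-counts",      // assumed flag name and sample path
        "-numReducers", "4"              // assumed flag name
    };
    // ToolRunner strips the generic Hadoop options before calling run(),
    // assuming BigramCount implements the Tool interface.
    int exitCode = ToolRunner.run(new BigramCount(), toolArgs);
    System.exit(exitCode);
  }
}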
Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool: the run method of the BigramRelativeFrequencyJson class.
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
  options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String inputPath = cmdline.getOptionValue(INPUT);
  String outputPath = cmdline.getOptionValue(OUTPUT);
  int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

  LOG.info("Tool name: " + BigramRelativeFrequencyJson.class.getSimpleName());
  LOG.info(" - input path: " + inputPath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - num reducers: " + reduceTasks);

  Job job = Job.getInstance(getConf());
  job.setJobName(BigramRelativeFrequencyJson.class.getSimpleName());
  job.setJarByClass(BigramRelativeFrequencyJson.class);
  job.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setMapOutputKeyClass(MyTuple.class);
  job.setMapOutputValueClass(FloatWritable.class);
  job.setOutputKeyClass(MyTuple.class);
  job.setOutputValueClass(FloatWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setCombinerClass(MyCombiner.class);
  job.setReducerClass(MyReducer.class);
  job.setPartitionerClass(MyPartitioner.class);

  // Delete the output directory if it exists already.
  Path outputDir = new Path(outputPath);
  FileSystem.get(getConf()).delete(outputDir, true);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
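The relative-frequency computation needs a custom partitioner so that every bigram sharing the same left word, including the marginal key, reaches the same reducer. The project's MyPartitioner operates on MyTuple keys and is not shown here; the sketch below is an illustration of that idea on a generic string-pair key, and is an assumption rather than the project's code.

// Illustrative sketch of left-element partitioning (not the project's
// MyPartitioner, which keys on MyTuple).
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.mapreduce.Partitioner;

import edu.umd.cloud9.io.pair.PairOfStrings;

public class LeftWordPartitioner extends Partitioner<PairOfStrings, FloatWritable> {
  @Override
  public int getPartition(PairOfStrings key, FloatWritable value, int numReduceTasks) {
    // Hash only the left word so (w, *) and every (w, u) land on the same reducer.
    return (key.getLeftElement().hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}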
Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool: the run method of the BigramRelativeFrequencyTuple class.
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
  options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String inputPath = cmdline.getOptionValue(INPUT);
  String outputPath = cmdline.getOptionValue(OUTPUT);
  int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

  LOG.info("Tool name: " + BigramRelativeFrequencyTuple.class.getSimpleName());
  LOG.info(" - input path: " + inputPath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - num reducers: " + reduceTasks);

  Job job = Job.getInstance(getConf());
  job.setJobName(BigramRelativeFrequencyTuple.class.getSimpleName());
  job.setJarByClass(BigramRelativeFrequencyTuple.class);
  job.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(job, new Path(inputPath));
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setMapOutputKeyClass(BinSedesTuple.class);
  job.setMapOutputValueClass(FloatWritable.class);
  job.setOutputKeyClass(BinSedesTuple.class);
  job.setOutputValueClass(FloatWritable.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  job.setMapperClass(MyMapper.class);
  job.setCombinerClass(MyCombiner.class);
  job.setReducerClass(MyReducer.class);
  job.setPartitionerClass(MyPartitioner.class);

  // Delete the output directory if it exists already.
  Path outputDir = new Path(outputPath);
  FileSystem.get(getConf()).delete(outputDir, true);

  long startTime = System.currentTimeMillis();
  job.waitForCompletion(true);
  System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  return 0;
}
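This variant uses Pig's BinSedesTuple as the key type. The project's MyMapper is not shown; as a rough sketch (an assumption, not project code), a two-field bigram key can be built through Pig's TupleFactory, which in recent Pig versions hands back BinSedesTuple instances by default:

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class BigramTupleSketch {
  // Builds a (left word, right word) tuple key; a "*" in the right slot is
  // the usual convention for the marginal count in this pattern.
  public static Tuple bigramKey(String left, String right) throws ExecException {
    Tuple key = TupleFactory.getInstance().newTuple(2);
    key.set(0, left);
    key.set(1, right);
    return key;
  }
}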
Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool: the run method of the IterateGMM class.
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
  options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers").create(NUM_REDUCERS));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String inputPath0 = cmdline.getOptionValue(INPUT);
  String outputPath = cmdline.getOptionValue(OUTPUT);
  int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

  LOG.info("Tool: " + IterateGMM.class.getSimpleName());
  LOG.info(" - input path: " + inputPath0);
  String inputPath = inputPath0 + "/points";
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - number of reducers: " + reduceTasks);

  int iterations = 0;
  Configuration conf = getConf();
  while (iterations == 0 || !FinishIteration(inputPath0, iterations, conf)) {
    LOG.info("** iterations: " + iterations);
    try {
      Job job = Job.getInstance(conf);
      job.setJobName(IterateGMM.class.getSimpleName());
      job.setJarByClass(IterateGMM.class);
      // set the path of the information of k clusters in this iteration
      job.getConfiguration().set("clusterpath", inputPath0 + "/cluster" + iterations);
      job.setNumReduceTasks(reduceTasks);

      FileInputFormat.setInputPaths(job, new Path(inputPath));
      FileOutputFormat.setOutputPath(job, new Path(outputPath));

      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(PairOfStrings.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(Text.class);

      job.setMapperClass(MyMapper.class);
      job.setReducerClass(MyReducer.class);
      job.setPartitionerClass(MyPartitioner.class);

      // Delete the output directory if it exists already.
      Path outputDir = new Path(outputPath);
      FileSystem.get(getConf()).delete(outputDir, true);

      long startTime = System.currentTimeMillis();
      job.waitForCompletion(true);
      LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

      reNameFile(inputPath0, outputPath, iterations + 1, conf, reduceTasks);
    } catch (Exception exp) {
      exp.printStackTrace();
    }
    iterations++;
  }
  return 0;
}
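Each iteration hands the current cluster parameters to the mappers through the "clusterpath" configuration property. The project's MyMapper, FinishIteration, and reNameFile are not shown here; the sketch below is an assumption about how a mapper might load that side data in setup(), using the "id weight mean variance" line format that LocalClusteringDriver writes further down. The real mapper emits PairOfStrings values, which is simplified to Text in this skeleton.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper skeleton (not the project's MyMapper).
public class ClusterLoadingMapperSketch extends Mapper<LongWritable, Text, Text, Text> {
  // cluster id -> { weight, mean, variance }
  private final Map<Integer, double[]> clusters = new HashMap<Integer, double[]>();

  @Override
  protected void setup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    Path clusterPath = new Path(conf.get("clusterpath"));
    FileSystem fs = FileSystem.get(conf);
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(clusterPath)));
    String line;
    while ((line = reader.readLine()) != null) {
      String[] fields = line.trim().split("\\s+");
      clusters.put(Integer.parseInt(fields[0]), new double[] {
          Double.parseDouble(fields[1]),     // weight
          Double.parseDouble(fields[2]),     // mean
          Double.parseDouble(fields[3]) });  // variance
    }
    reader.close();
  }
}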
Use of org.apache.commons.cli.CommandLineParser in project Cloud9 by lintool: the main method of the LocalClusteringDriver class.
// private static final String input = "points_input";
@SuppressWarnings({ "static-access" })
public static void main(String[] args) {
  Options options = new Options();
  options.addOption(new Option(KMEANS, "initialize with k-means"));
  options.addOption(new Option(HELP, "display help options"));
  options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of points").create(POINTS));
  options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of components in the mixture").create(COMPONENTS));
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("result path").create(OUTPUT));

  CommandLine cmdline = null;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    // Exit here so cmdline is never dereferenced while still null.
    System.exit(-1);
  }

  if (!cmdline.hasOption(OUTPUT)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(LocalClusteringDriver.class.getName(), options);
    System.exit(-1);
  }

  if (cmdline.hasOption(HELP)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    formatter.printHelp(LocalClusteringDriver.class.getName(), options);
    System.exit(-1);
  }

  int numComponents = cmdline.hasOption(COMPONENTS) ? Integer.parseInt(cmdline.getOptionValue(COMPONENTS)) : 3;
  int numPoints = cmdline.hasOption(POINTS) ? Integer.parseInt(cmdline.getOptionValue(POINTS)) : 100000;
  String output = cmdline.getOptionValue(OUTPUT);
  System.out.println(output);
  System.out.println("Number of points: " + numPoints);
  System.out.println("Number of components in mixture: " + numComponents);

  UnivariateGaussianMixtureModel sourceModel = new UnivariateGaussianMixtureModel(numComponents);
  for (int i = 0; i < numComponents; i++) {
    PVector param = new PVector(2);
    param.array[0] = RANDOM.nextInt(100);
    param.array[1] = RANDOM.nextFloat() * 3;
    sourceModel.param[i] = param;
    sourceModel.weight[i] = RANDOM.nextInt(10) + 1;
  }
  sourceModel.normalizeWeights();
  System.out.println("Initial mixture model:\n" + sourceModel + "\n");

  // Draw points from initial mixture model and compute the n clusters
  Point[] points = sourceModel.drawRandomPoints(numPoints);
  UnivariateGaussianMixtureModel learnedModel = null;
  if (cmdline.hasOption(KMEANS)) {
    System.out.println("Running k-means to initialize clusters...");
    List<Point>[] clusters = KMeans.run(points, numComponents);
    double[] means = new double[numComponents];
    int cnt = 0;
    for (List<Point> cluster : clusters) {
      double tmp = 0.0;
      for (Point p : cluster) {
        tmp += p.value;
      }
      means[cnt] = tmp / cluster.size();
      cnt++;
    }
    System.out.println("Cluster means: " + Arrays.toString(means) + "\n");
    learnedModel = ExpectationMaximization.initialize(points, means);
  } else {
    learnedModel = ExpectationMaximization.initialize(points, numComponents);
  }

  Path outputPoi = new Path(output);
  try {
    FileSystem fs = FileSystem.get(new Configuration());
    fs.delete(outputPoi, true);

    FSDataOutputStream pointfile = fs.create(new Path(output + "/points"));
    for (int i = 0; i < numPoints; i++) {
      pointfile.write((Double.toString(points[i].value) + "\n").getBytes());
    }
    pointfile.flush();
    pointfile.close();

    FSDataOutputStream clusterfile = fs.create(new Path(output + "/cluster0"));
    for (int i = 0; i < numComponents; i++) {
      clusterfile.write((i + " " + Double.toString(learnedModel.weight[i]) + " " + learnedModel.param[i].array[0] + " " + learnedModel.param[i].array[1] + "\n").getBytes());
    }
    clusterfile.flush();
    clusterfile.close();
  } catch (IOException exp) {
    exp.printStackTrace();
  }

  System.out.println("** Ready to run EM **\n");
  System.out.println("Initial mixture model:\n" + learnedModel + "\n");
  learnedModel = ExpectationMaximization.run(points, learnedModel);
  System.out.println("Mixture model estimated using EM: \n" + learnedModel + "\n");
}
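Taken together, the local driver seeds the points file and the initial cluster0 file, and IterateGMM then refines the model on the cluster. A hypothetical end-to-end launcher might look like the sketch below; it is an assumption, not project code: the flag strings stand in for the OUTPUT, POINTS, COMPONENTS, KMEANS, INPUT constants whose values are not shown, the paths are examples, and IterateGMM is assumed to implement Hadoop's Tool interface.

import org.apache.hadoop.util.ToolRunner;

public class GmmPipelineSketch {
  public static void main(String[] args) throws Exception {
    // 1. Generate points locally and write <dir>/points and <dir>/cluster0.
    //    Flag names are assumed placeholders for the driver's constants.
    LocalClusteringDriver.main(new String[] {
        "-output", "gmm-data", "-points", "50000", "-components", "3", "-kmeans" });

    // 2. Run the distributed EM iterations over that directory.
    int exitCode = ToolRunner.run(new IterateGMM(),
        new String[] { "-input", "gmm-data", "-output", "gmm-tmp" });
    System.exit(exitCode);
  }
}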