Use of org.apache.hadoop.mapreduce.Job in project mavuno (by metzlerd).
Class CombineSplits, method run().
/**
 * Configures and runs a MapReduce job that merges per-split example files
 * (and optional example statistics) into a single block-compressed
 * SequenceFile output, sorted according to the configured split key.
 *
 * <p>Required configuration parameters: {@code Mavuno.CombineSplits.ExamplesPath},
 * {@code Mavuno.CombineSplits.ExampleStatsPath}, {@code Mavuno.CombineSplits.SplitKey},
 * {@code Mavuno.CombineSplits.OutputPath}; optional
 * {@code Mavuno.CombineSplits.TotalSplits} (default 1).
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws RuntimeException if the SplitKey is not one of
 *         "pattern", "context", or "pattern+context"
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf);
    String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf);
    String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase();
    int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf);

    sLogger.info("Tool name: CombineSplits");
    sLogger.info(" - Examples path: " + examplesPath);
    sLogger.info(" - Example stats path: " + exampleStatsPath);
    sLogger.info(" - Split key: " + splitKey);
    sLogger.info(" - Total splits: " + numSplits);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("CombineSplits");
    job.setJarByClass(CombineSplits.class);

    // Each split lives in its own numbered subdirectory of the examples path.
    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split));
    }

    // Example statistics are optional; include them only if present.
    if (MavunoUtils.pathExists(conf, exampleStatsPath)) {
        FileInputFormat.addInputPath(job, new Path(exampleStatsPath));
    }

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);

    // Choose the sort order based on the split key.
    // NOTE(review): "pattern" and "pattern+context" use the same comparator;
    // presumably Comparator sorts by pattern first — confirm against
    // ContextPatternWritable before changing.
    if ("pattern".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else if ("context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("pattern+context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {
        throw new RuntimeException("Invalid SplitKey in CombineSplits! -- " + splitKey);
    }

    job.setMapOutputValueClass(ContextPatternStatsWritable.class);
    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Propagate job success/failure as the tool's exit code instead of
    // unconditionally returning 0 (the original silently ignored failures).
    return job.waitForCompletion(true) ? 0 : 1;
}
Use of org.apache.hadoop.mapreduce.Job in project mavuno (by metzlerd).
Class ExtractGlobalStats, method run().
/**
 * Splits the input examples, runs one MapReduce extraction job per example
 * split against the corpus, then combines the per-split global statistics
 * into the final output and removes the temporary split directory.
 *
 * <p>Required configuration parameters: {@code Mavuno.ExtractGlobalStats.InputPath},
 * {@code CorpusPath}, {@code CorpusClass}, {@code ExtractorClass},
 * {@code ExtractorArgs}, {@code ExtractorTarget}, {@code OutputPath}
 * (all under the {@code Mavuno.ExtractGlobalStats.} prefix).
 *
 * @return 0 on success
 * @throws RuntimeException if an extraction job for a split fails
 */
@SuppressWarnings("unchecked")
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf).toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // Split the examples into per-key pieces in a sibling temp directory.
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", outputPath + "/../split");
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    new Split(conf).run();

    // Run one extraction job per ".examples" split file.
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split");
    int split = 0;
    for (FileStatus file : files) {
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }

        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        // Fixed log label: previously printed " - Extractor class: " twice.
        sLogger.info(" - Extractor args: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);

        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");
        job.setJarByClass(ExtractGlobalStats.class);

        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split));

        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);
        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // Fail fast: a failed split job would otherwise silently produce
        // incomplete combined statistics downstream.
        if (!job.waitForCompletion(true)) {
            throw new RuntimeException("ExtractGlobalStats job failed for split " + split
                + " (examples: " + file.getPath() + ")");
        }
        split++;
    }

    // Combine the per-split statistics into the final output, then clean up.
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    new CombineGlobalStats(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/../split");

    return 0;
}
Use of org.apache.hadoop.mapreduce.Job in project mavuno (by metzlerd).
Class HarvestUDAPInstances, method run().
/**
 * Configures and runs a MapReduce job that harvests UDAP instances from the
 * corpus, writing text output keyed by instance with a double score.
 *
 * <p>Required configuration parameters:
 * {@code Mavuno.HarvestUDAPInstances.CorpusPath},
 * {@code Mavuno.HarvestUDAPInstances.CorpusClass},
 * {@code Mavuno.HarvestUDAPInstances.OutputPath}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.OutputPath", conf);

    sLogger.info("Tool name: HarvestUDAPInstances");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestUDAPInstances");
    // Ship the job jar to the cluster; every sibling driver sets this, and
    // without it the mapper/reducer classes may not resolve on task nodes.
    job.setJarByClass(HarvestUDAPInstances.class);

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Propagate job success/failure instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
Use of org.apache.hadoop.mapreduce.Job in project mavuno (by metzlerd).
Class GetTopResults, method run().
/**
 * Configures and runs a MapReduce job that selects the top-scoring results
 * per id from a text input, writing either text or SequenceFile output.
 *
 * <p>Required configuration parameters: {@code Mavuno.GetTopResults.InputPath},
 * {@code Mavuno.GetTopResults.OutputPath}, {@code Mavuno.GetTopResults.NumResults};
 * optional boolean {@code Mavuno.GetTopResults.SequenceFileOutputFormat}
 * (default false = text output).
 *
 * @return 0 if the job completed successfully, 1 otherwise
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.OutputPath", conf);
    int numResults = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.GetTopResults.NumResults", conf));
    boolean sequenceFileOutputFormat = conf.getBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);

    sLogger.info("Tool name: GetTopResults");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Number of results: " + numResults);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("GetTopResults");
    job.setJarByClass(GetTopResults.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);

    // Group all records for the same id into one reducer partition, sorted
    // by the full key so the reducer can emit the top results in order.
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

    // Output format is selectable: SequenceFile for downstream jobs,
    // plain text for human inspection.
    if (sequenceFileOutputFormat) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormatClass(TextOutputFormat.class);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Propagate job success/failure instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
Use of org.apache.hadoop.mapreduce.Job in project mavuno (by metzlerd).
Class ScoreContexts, method run().
/**
 * Configures and runs a MapReduce job that scores contexts using the
 * configured scorer class, writing block-compressed SequenceFile output of
 * (ContextPatternWritable, DoubleWritable) pairs.
 *
 * <p>Required configuration parameters: {@code Mavuno.ScoreContexts.InputPath},
 * {@code Mavuno.ScoreContexts.OutputPath}, {@code Mavuno.Scorer.Class},
 * {@code Mavuno.Scorer.Args}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
    String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScoreContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context scorer class: " + contextScorerClass);
    sLogger.info(" - Context scorer args: " + contextScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScoreContexts");
    job.setJarByClass(ScoreContexts.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    // Partition by id and sort by the full key so each reducer sees one
    // id's contexts in order.
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    // Mappers emit intermediate ScoreWritable values; the reducer collapses
    // them to a final DoubleWritable score per key.
    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);
    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Propagate job success/failure instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
Aggregations.