Search in sources :

Example 11 with Job

use of org.apache.hadoop.mapreduce.Job in project mavuno by metzlerd.

The run method of the CombineSplits class.

/**
 * Combines the per-split example files (and optional example-statistics file)
 * into a single block-compressed SequenceFile output, sorted according to the
 * configured split key.
 *
 * @return 0 if the Hadoop job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if a configured class cannot be resolved
 * @throws InterruptedException if the running job is interrupted
 * @throws IOException on filesystem or job-submission failures
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf);
    String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf);
    String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase();
    int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf);
    sLogger.info("Tool name: CombineSplits");
    sLogger.info(" - Examples path: " + examplesPath);
    sLogger.info(" - Example stats path: " + exampleStatsPath);
    sLogger.info(" - Split key: " + splitKey);
    sLogger.info(" - Total splits: " + numSplits);
    sLogger.info(" - Output path: " + outputPath);
    Job job = new Job(conf);
    job.setJobName("CombineSplits");
    job.setJarByClass(CombineSplits.class);
    // Each split was written to its own numbered subdirectory; add them all as input.
    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split));
    }
    // The stats file is optional; only add it when it actually exists.
    if (MavunoUtils.pathExists(conf, exampleStatsPath)) {
        FileInputFormat.addInputPath(job, new Path(exampleStatsPath));
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(ContextPatternWritable.class);
    // NOTE(review): "pattern" and "pattern+context" select the same comparator;
    // looks intentional (full Comparator sorts by pattern first) — confirm.
    if ("pattern".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else if ("context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("pattern+context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {
        throw new RuntimeException("Invalid SplitKey in CombineSplits! -- " + splitKey);
    }
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);
    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Fix: propagate job failure instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
Also used : Path(org.apache.hadoop.fs.Path) ContextPatternWritable(edu.isi.mavuno.util.ContextPatternWritable) Configuration(org.apache.hadoop.conf.Configuration) Job(org.apache.hadoop.mapreduce.Job)

Example 12 with Job

use of org.apache.hadoop.mapreduce.Job in project mavuno by metzlerd.

The run method of the ExtractGlobalStats class.

/**
 * Extracts global statistics for each example split: first splits the input
 * examples, then runs one extraction job per ".examples" split file, and
 * finally combines the per-split outputs before cleaning up the temp dir.
 *
 * @return 0 if every per-split job (and the combine step) completed successfully, 1 otherwise
 * @throws ClassNotFoundException if the configured corpus/extractor class cannot be loaded
 * @throws InterruptedException if a running job is interrupted
 * @throws IOException on filesystem or job-submission failures
 */
@SuppressWarnings("unchecked")
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf).toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);
    // split examples
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", outputPath + "/../split");
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    new Split(conf).run();
    // get splits
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split");
    int split = 0;
    boolean allSucceeded = true;
    for (FileStatus file : files) {
        // Only ".examples" files produced by the Split step are processed.
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }
        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());
        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        // Fix: this line previously logged the args under the "Extractor class" label.
        sLogger.info(" - Extractor args: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);
        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");
        job.setJarByClass(ExtractGlobalStats.class);
        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split));
        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);
        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Fix: track per-split job success instead of discarding it.
        allSucceeded &= job.waitForCompletion(true);
        split++;
    }
    // combine splits
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    new CombineGlobalStats(conf).run();
    MavunoUtils.removeDirectory(conf, outputPath + "/../split");
    return allSucceeded ? 0 : 1;
}
Also used : Path(org.apache.hadoop.fs.Path) ContextPatternWritable(edu.isi.mavuno.util.ContextPatternWritable) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) Split(edu.isi.mavuno.extract.Split) Job(org.apache.hadoop.mapreduce.Job) CombineGlobalStats(edu.isi.mavuno.extract.CombineGlobalStats)

Example 13 with Job

use of org.apache.hadoop.mapreduce.Job in project mavuno by metzlerd.

The run method of the HarvestUDAPInstances class.

/**
 * Harvests UDAP instances from the configured corpus, emitting text keys with
 * double-valued scores to a text-format output.
 *
 * @return 0 if the Hadoop job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException if the running job is interrupted
 * @throws IOException on filesystem or job-submission failures
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.OutputPath", conf);
    sLogger.info("Tool name: HarvestUDAPInstances");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);
    Job job = new Job(conf);
    job.setJobName("HarvestUDAPInstances");
    // Fix: set the job jar so the classes ship to the cluster, consistent
    // with the other Mavuno tools.
    job.setJarByClass(HarvestUDAPInstances.class);
    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Fix: propagate job failure instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Job(org.apache.hadoop.mapreduce.Job)

Example 14 with Job

use of org.apache.hadoop.mapreduce.Job in project mavuno by metzlerd.

The run method of the GetTopResults class.

/**
 * Selects the top-scoring results per id from the input, writing them either
 * as SequenceFile or text output depending on configuration.
 *
 * @return 0 if the Hadoop job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if a configured class cannot be resolved
 * @throws InterruptedException if the running job is interrupted
 * @throws IOException on filesystem or job-submission failures
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String inputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.OutputPath", conf);
    int numResults = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.GetTopResults.NumResults", conf));
    boolean sequenceFileOutputFormat = conf.getBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);
    sLogger.info("Tool name: GetTopResults");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Number of results: " + numResults);
    sLogger.info(" - Output path: " + outputPath);
    Job job = new Job(conf);
    job.setJobName("GetTopResults");
    job.setJarByClass(GetTopResults.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(TextInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    // Group/partition by id so each reducer sees all results for one id.
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    // Output format is configurable: binary SequenceFile or plain text.
    if (sequenceFileOutputFormat) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormatClass(TextOutputFormat.class);
    }
    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Fix: propagate job failure instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
Also used : Path(org.apache.hadoop.fs.Path) ContextPatternWritable(edu.isi.mavuno.util.ContextPatternWritable) Configuration(org.apache.hadoop.conf.Configuration) Job(org.apache.hadoop.mapreduce.Job)

Example 15 with Job

use of org.apache.hadoop.mapreduce.Job in project mavuno by metzlerd.

The run method of the ScoreContexts class.

/**
 * Scores contexts using the configured scorer class and writes the scored
 * results as a block-compressed SequenceFile.
 *
 * @return 0 if the Hadoop job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if a configured class cannot be resolved
 * @throws InterruptedException if the running job is interrupted
 * @throws IOException on filesystem or job-submission failures
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
    String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);
    sLogger.info("Tool name: ScoreContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context scorer class: " + contextScorerClass);
    sLogger.info(" - Context scorer args: " + contextScorerArgs);
    Job job = new Job(conf);
    job.setJobName("ScoreContexts");
    job.setJarByClass(ScoreContexts.class);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    // Partition by id so all scores for one context land on the same reducer.
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(ContextPatternWritable.class);
    // Map emits ScoreWritable; reducer collapses these to a final DoubleWritable.
    job.setMapOutputValueClass(ScoreWritable.class);
    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // Fix: propagate job failure instead of unconditionally returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
Also used : Path(org.apache.hadoop.fs.Path) ContextPatternWritable(edu.isi.mavuno.util.ContextPatternWritable) Configuration(org.apache.hadoop.conf.Configuration) Job(org.apache.hadoop.mapreduce.Job)

Aggregations

Job (org.apache.hadoop.mapreduce.Job)886 Path (org.apache.hadoop.fs.Path)498 Configuration (org.apache.hadoop.conf.Configuration)434 Test (org.junit.Test)259 IOException (java.io.IOException)135 FileSystem (org.apache.hadoop.fs.FileSystem)128 File (java.io.File)77 InputSplit (org.apache.hadoop.mapreduce.InputSplit)58 ArrayList (java.util.ArrayList)55 TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)55 Scan (org.apache.hadoop.hbase.client.Scan)45 FileStatus (org.apache.hadoop.fs.FileStatus)44 NutchJob (org.apache.nutch.util.NutchJob)43 JobConf (org.apache.hadoop.mapred.JobConf)42 Text (org.apache.hadoop.io.Text)39 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)36 HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration)35 JobContext (org.apache.hadoop.mapreduce.JobContext)35 GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser)35 CommandLine (org.apache.commons.cli.CommandLine)33