
Example 1 with Partitioner

Use of org.apache.hadoop.mapred.Partitioner in project hadoop by apache.

From the class Submitter, method run():

@Override
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");
    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat", "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner", "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "class");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false, "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
    Parser parser = cli.createParser();
    try {
        GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
        CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());
        JobConf job = new JobConf(getConf());
        if (results.hasOption("input")) {
            FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
        }
        if (results.hasOption("output")) {
            FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            setIsJavaRecordReader(job, true);
            job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(job, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(job, true);
            job.setMapperClass(getClass(results, "map", job, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(job, true);
            job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            setIsJavaRecordWriter(job, true);
            job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
            }
        }
        if (results.hasOption("program")) {
            setExecutable(job, results.getOptionValue("program"));
        }
        if (results.hasOption("jobconf")) {
            LOG.warn("-jobconf option is deprecated, please use -D instead.");
            String options = results.getOptionValue("jobconf");
            StringTokenizer tokenizer = new StringTokenizer(options, ",");
            while (tokenizer.hasMoreTokens()) {
                String keyVal = tokenizer.nextToken().trim();
                // Split on the first '=' only, so values may themselves contain '='.
                String[] keyValSplit = keyVal.split("=", 2);
                job.set(keyValSplit[0], keyValSplit[1]);
            }
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURI().toURL() };
            //FindBugs complains that creating a URLClassLoader should be
            //in a doPrivileged() block. 
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {

                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            job.setClassLoader(loader);
        }
        runJob(job);
        return 0;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) NullOutputFormat(org.apache.hadoop.mapred.lib.NullOutputFormat) OutputFormat(org.apache.hadoop.mapred.OutputFormat) LazyOutputFormat(org.apache.hadoop.mapred.lib.LazyOutputFormat) FileOutputFormat(org.apache.hadoop.mapred.FileOutputFormat) URL(java.net.URL) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser) BasicParser(org.apache.commons.cli.BasicParser) Parser(org.apache.commons.cli.Parser) Mapper(org.apache.hadoop.mapred.Mapper) CommandLine(org.apache.commons.cli.CommandLine) StringTokenizer(java.util.StringTokenizer) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) URLClassLoader(java.net.URLClassLoader) ParseException(org.apache.commons.cli.ParseException) Reducer(org.apache.hadoop.mapred.Reducer) JobConf(org.apache.hadoop.mapred.JobConf) HashPartitioner(org.apache.hadoop.mapred.lib.HashPartitioner) Partitioner(org.apache.hadoop.mapred.Partitioner)
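
For reference, whatever class is passed to Submitter via -partitioner must implement the old-API org.apache.hadoop.mapred.Partitioner interface, which is what the getClass(results, "partitioner", job, Partitioner.class) lookup above enforces. Below is a minimal sketch of such a class; the FirstCharPartitioner name and its keying scheme are illustrative, not taken from the Hadoop source.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Illustrative partitioner: routes each record by the first character of its key.
public class FirstCharPartitioner implements Partitioner<Text, IntWritable> {

    @Override
    public void configure(JobConf job) {
        // No per-job configuration is needed for this sketch.
    }

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.getLength() == 0) {
            return 0;
        }
        // Mask the sign bit so the modulo result is always non-negative.
        return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
    }
}

With the run() method above, such a class would be selected with -partitioner FirstCharPartitioner, alongside -jar so the class ends up on the job classpath.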

Example 2 with Partitioner

Use of org.apache.hadoop.mapred.Partitioner in project Cloud9 by lintool.

From the class HubsAndAuthoritiesSchimmy, method HACalc():

public int HACalc(String path, int iter, int jter, int nodeCount, boolean useCombiner, boolean useInmapCombiner, boolean useRange, int mapTasks, int reduceTasks) throws IOException {
    JobConf conf = new JobConf(HubsAndAuthoritiesSchimmy.class);
    String inputPath = path + "/iter" + sFormat.format(iter);
    String outputPath = path + "/iter" + sFormat.format(jter) + "t";
    FileSystem fs = FileSystem.get(conf);
    // int numPartitions = FileSystem.get(conf).listStatus(new Path(inputPath)).length - 1;
    // We need to actually count the part files to get the number of
    // partitions, because the directory may also contain _logs.
    int numPartitions = 0;
    for (FileStatus s : fs.listStatus(new Path(inputPath))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }
    conf.setInt("NodeCount", nodeCount);
    Partitioner p = null;
    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }
    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    HITSNode value = new HITSNode();
    FileStatus[] status = fs.listStatus(new Path(inputPath));
    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (f.getPath().getName().contains("_logs"))
            continue;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        reader.next(key, value);
        @SuppressWarnings("unchecked") int np = p.getPartition(key, value, numPartitions);
        reader.close();
        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }
    sLogger.info(sb.toString().trim());
    sLogger.info("Tool: HubsAndAuthorities");
    sLogger.info(" - iteration: " + iter);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);
    conf.setJobName("Iter" + iter + "HubsAndAuthorities");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    if (useInmapCombiner) {
        conf.setMapperClass(HAMapperIMC.class);
    } else {
        conf.setMapperClass(HAMapper.class);
    }
    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }
    conf.setReducerClass(HAReducer.class);
    conf.setInt("jobIter", iter);
    conf.setInt("NodeCount", nodeCount);
    conf.set("PartitionMapping", sb.toString().trim());
    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    fs.delete(outputDir, true);
    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Writable(org.apache.hadoop.io.Writable) ArrayListOfIntsWritable(tl.lin.data.array.ArrayListOfIntsWritable) IntWritable(org.apache.hadoop.io.IntWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) SequenceFile(org.apache.hadoop.io.SequenceFile) WritableComparable(org.apache.hadoop.io.WritableComparable) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) HashPartitioner(org.apache.hadoop.mapred.lib.HashPartitioner) Partitioner(org.apache.hadoop.mapred.Partitioner)
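
The "peek inside each partition" trick in HACalc works because getPartition is deterministic: reading a single key out of a part file and re-partitioning it recovers the logical partition number that file holds. A minimal standalone sketch of that idea using HashPartitioner follows; the sample keys and the PartitionProbe class are made up for illustration.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.lib.HashPartitioner;

// Illustrative only: recover the logical partition number for a few sample keys.
public class PartitionProbe {
    public static void main(String[] args) {
        HashPartitioner<IntWritable, NullWritable> p = new HashPartitioner<IntWritable, NullWritable>();
        int numPartitions = 4;
        for (int k : new int[] { 7, 42, 1001 }) {
            int np = p.getPartition(new IntWritable(k), NullWritable.get(), numPartitions);
            // A part file whose first key is k must hold logical partition np.
            System.out.println("key=" + k + " -> partition " + np);
        }
    }
}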

Aggregations

Path (org.apache.hadoop.fs.Path): 2
JobConf (org.apache.hadoop.mapred.JobConf): 2
Partitioner (org.apache.hadoop.mapred.Partitioner): 2
HashPartitioner (org.apache.hadoop.mapred.lib.HashPartitioner): 2
URL (java.net.URL): 1
URLClassLoader (java.net.URLClassLoader): 1
StringTokenizer (java.util.StringTokenizer): 1
BasicParser (org.apache.commons.cli.BasicParser): 1
CommandLine (org.apache.commons.cli.CommandLine): 1
ParseException (org.apache.commons.cli.ParseException): 1
Parser (org.apache.commons.cli.Parser): 1
FileStatus (org.apache.hadoop.fs.FileStatus): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
FloatWritable (org.apache.hadoop.io.FloatWritable): 1
IntWritable (org.apache.hadoop.io.IntWritable): 1
SequenceFile (org.apache.hadoop.io.SequenceFile): 1
Writable (org.apache.hadoop.io.Writable): 1
WritableComparable (org.apache.hadoop.io.WritableComparable): 1
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 1
FileOutputFormat (org.apache.hadoop.mapred.FileOutputFormat): 1