Search in sources :

Example 1 with IdentifierResolver

Use of org.apache.hadoop.streaming.io.IdentifierResolver in project hadoop by apache.

The method setJobConf of the class StreamJob.

/**
 * Builds {@code jobConf_} from the parsed streaming options held in this
 * object's fields.
 *
 * <p>The steps run in a fixed order, and several of them depend on values
 * written by earlier ones:
 * <ol>
 *   <li>create the {@code JobConf} and register the input paths;</li>
 *   <li>choose the input format;</li>
 *   <li>record the I/O identifiers and derive the input-writer classes;</li>
 *   <li>configure mapper, combiner and reducer (a Java class when the spec
 *       resolves to one, otherwise a piped command);</li>
 *   <li>derive the output-reader classes and key/value classes;</li>
 *   <li>configure the record reader, output path/format and partitioner;</li>
 *   <li>set debug scripts, the job jar and the distributed-cache URIs.</li>
 * </ol>
 *
 * <p>NOTE(review): {@code fail(...)} is defined outside this method; it
 * presumably aborts by throwing. Confirm before relying on the code that
 * follows each call.
 *
 * @throws IOException declared by the method; {@code URLEncoder.encode(String, String)}
 *         declares {@code UnsupportedEncodingException}, and other helpers
 *         called here may also throw it (not visible from this method)
 */
protected void setJobConf() throws IOException {
    // Legacy option: still honoured, but the user is told to switch to -conf.
    if (additionalConfSpec_ != null) {
        LOG.warn("-additionalconfspec option is deprecated, please use -conf instead.");
        config_.addResource(new Path(additionalConfSpec_));
    }
    // general MapRed job properties
    jobConf_ = new JobConf(config_, StreamJob.class);
    // (mapreduce.job.working.dir will be lazily initialized ONCE and depends on FS)
    for (int i = 0; i < inputSpecs_.size(); i++) {
        FileInputFormat.addInputPaths(jobConf_, (String) inputSpecs_.get(i));
    }
    // Package of the runtime class, passed to goodClassOrNull with every class lookup below.
    String defaultPackage = this.getClass().getPackage().getName();
    // Scratch variable reused for every class lookup in this method.
    Class c;
    // Scratch variable: holds the input format first, then (after being reset) the output format.
    Class fmt = null;
    // ---- Input format selection ----
    // Neither an input reader nor an input format was given: plain text input.
    if (inReaderSpec_ == null && inputFormatSpec_ == null) {
        fmt = TextInputFormat.class;
    } else if (inputFormatSpec_ != null) {
        // The well-known formats are matched by getName(), getCanonicalName() or getSimpleName().
        if (inputFormatSpec_.equals(TextInputFormat.class.getName()) || inputFormatSpec_.equals(TextInputFormat.class.getCanonicalName()) || inputFormatSpec_.equals(TextInputFormat.class.getSimpleName())) {
            fmt = TextInputFormat.class;
        } else if (inputFormatSpec_.equals(KeyValueTextInputFormat.class.getName()) || inputFormatSpec_.equals(KeyValueTextInputFormat.class.getCanonicalName()) || inputFormatSpec_.equals(KeyValueTextInputFormat.class.getSimpleName())) {
            // Only used when no input reader is given; otherwise fmt stays null
            // and the StreamInputFormat fallback below applies.
            if (inReaderSpec_ == null) {
                fmt = KeyValueTextInputFormat.class;
            }
        } else if (inputFormatSpec_.equals(SequenceFileInputFormat.class.getName()) || inputFormatSpec_.equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getCanonicalName()) || inputFormatSpec_.equals(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getSimpleName())) {
            // Same rule as KeyValueTextInputFormat: an input reader takes precedence.
            if (inReaderSpec_ == null) {
                fmt = SequenceFileInputFormat.class;
            }
        } else if (inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getName()) || inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getCanonicalName()) || inputFormatSpec_.equals(SequenceFileAsTextInputFormat.class.getSimpleName())) {
            fmt = SequenceFileAsTextInputFormat.class;
        } else {
            // Any other spec is looked up as a class through StreamUtil.goodClassOrNull.
            c = StreamUtil.goodClassOrNull(jobConf_, inputFormatSpec_, defaultPackage);
            if (c != null) {
                fmt = c;
            } else {
                fail("-inputformat : class not found : " + inputFormatSpec_);
            }
        }
    }
    // Fallback for every path above that left fmt unset.
    if (fmt == null) {
        fmt = StreamInputFormat.class;
    }
    jobConf_.setInputFormat(fmt);
    // ---- I/O identifiers ----
    // A single ioSpec_ value is applied to all four stream endpoints.
    if (ioSpec_ != null) {
        jobConf_.set("stream.map.input", ioSpec_);
        jobConf_.set("stream.map.output", ioSpec_);
        jobConf_.set("stream.reduce.input", ioSpec_);
        jobConf_.set("stream.reduce.output", ioSpec_);
    }
    Class<? extends IdentifierResolver> idResolverClass = jobConf_.getClass("stream.io.identifier.resolver.class", IdentifierResolver.class, IdentifierResolver.class);
    // One resolver instance is reused for all four endpoints: each resolve(...)
    // call is followed by reads of its getters, so the call order below matters.
    IdentifierResolver idResolver = ReflectionUtils.newInstance(idResolverClass, jobConf_);
    idResolver.resolve(jobConf_.get("stream.map.input", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.map.input.writer.class", idResolver.getInputWriterClass(), InputWriter.class);
    idResolver.resolve(jobConf_.get("stream.reduce.input", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.reduce.input.writer.class", idResolver.getInputWriterClass(), InputWriter.class);
    jobConf_.set("stream.addenvironment", addTaskEnvironment_);
    // ---- Mapper ----
    // Set to true only when mapCmd_ does not resolve to a class and is run as a command.
    boolean isMapperACommand = false;
    if (mapCmd_ != null) {
        c = StreamUtil.goodClassOrNull(jobConf_, mapCmd_, defaultPackage);
        if (c != null) {
            jobConf_.setMapperClass(c);
        } else {
            isMapperACommand = true;
            jobConf_.setMapperClass(PipeMapper.class);
            jobConf_.setMapRunnerClass(PipeMapRunner.class);
            // The command line is stored URL-encoded (UTF-8) in the configuration.
            jobConf_.set("stream.map.streamprocessor", URLEncoder.encode(mapCmd_, "UTF-8"));
        }
    }
    // ---- Combiner ----
    if (comCmd_ != null) {
        c = StreamUtil.goodClassOrNull(jobConf_, comCmd_, defaultPackage);
        if (c != null) {
            jobConf_.setCombinerClass(c);
        } else {
            jobConf_.setCombinerClass(PipeCombiner.class);
            jobConf_.set("stream.combine.streamprocessor", URLEncoder.encode(comCmd_, "UTF-8"));
        }
    }
    // ---- Reducer ----
    if (numReduceTasksSpec_ != null) {
        // Integer.parseInt throws NumberFormatException if the spec is not a
        // valid integer; that exception is not caught here.
        int numReduceTasks = Integer.parseInt(numReduceTasksSpec_);
        jobConf_.setNumReduceTasks(numReduceTasks);
    }
    // Set to true only when redCmd_ does not resolve to a class and is run as a command.
    boolean isReducerACommand = false;
    if (redCmd_ != null) {
        // REDUCE_NONE forces a map-only job, overriding any count set just above.
        if (redCmd_.equals(REDUCE_NONE)) {
            jobConf_.setNumReduceTasks(0);
        }
        // The reducer is configured only when the job has reduce tasks.
        if (jobConf_.getNumReduceTasks() != 0) {
            // "aggregate" (case-insensitive) selects the ValueAggregator reducer
            // and also replaces any combiner configured from comCmd_ above.
            if (redCmd_.compareToIgnoreCase("aggregate") == 0) {
                jobConf_.setReducerClass(ValueAggregatorReducer.class);
                jobConf_.setCombinerClass(ValueAggregatorCombiner.class);
            } else {
                c = StreamUtil.goodClassOrNull(jobConf_, redCmd_, defaultPackage);
                if (c != null) {
                    jobConf_.setReducerClass(c);
                } else {
                    isReducerACommand = true;
                    jobConf_.setReducerClass(PipeReducer.class);
                    jobConf_.set("stream.reduce.streamprocessor", URLEncoder.encode(redCmd_, "UTF-8"));
                }
            }
        }
    }
    // ---- Map output types ----
    idResolver.resolve(jobConf_.get("stream.map.output", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.map.output.reader.class", idResolver.getOutputReaderClass(), OutputReader.class);
    if (isMapperACommand || jobConf_.get("stream.map.output") != null) {
        // if mapper is a command, or "stream.map.output" is set in the
        // configuration, then map output key/value classes come from the
        // idResolver
        jobConf_.setMapOutputKeyClass(idResolver.getOutputKeyClass());
        jobConf_.setMapOutputValueClass(idResolver.getOutputValueClass());
        // With no reduce tasks, the same classes are also set as the job output classes.
        if (jobConf_.getNumReduceTasks() == 0) {
            jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
            jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
        }
    }
    // ---- Reduce output types ----
    idResolver.resolve(jobConf_.get("stream.reduce.output", IdentifierResolver.TEXT_ID));
    jobConf_.setClass("stream.reduce.output.reader.class", idResolver.getOutputReaderClass(), OutputReader.class);
    if (isReducerACommand || jobConf_.get("stream.reduce.output") != null) {
        // if reducer is a command, or "stream.reduce.output" is set in the
        // configuration, then output key/value classes come from the
        // idResolver
        jobConf_.setOutputKeyClass(idResolver.getOutputKeyClass());
        jobConf_.setOutputValueClass(idResolver.getOutputValueClass());
    }
    // ---- Record reader ----
    // inReaderSpec_ is split on commas: the first element is the reader class,
    // the remaining elements are name=value settings.
    if (inReaderSpec_ != null) {
        String[] args = inReaderSpec_.split(",");
        String readerClass = args[0];
        // this argument can only be a Java class
        c = StreamUtil.goodClassOrNull(jobConf_, readerClass, defaultPackage);
        if (c != null) {
            jobConf_.set("stream.recordreader.class", c.getName());
        } else {
            fail("-inputreader: class not found: " + readerClass);
        }
        for (int i = 1; i < args.length; i++) {
            // Split on the first '=' only, so the value may itself contain '='.
            String[] nv = args[i].split("=", 2);
            String k = "stream.recordreader." + nv[0];
            // A setting without '=' is stored with an empty value.
            String v = (nv.length > 1) ? nv[1] : "";
            jobConf_.set(k, v);
        }
    }
    // ---- Output path and format ----
    FileOutputFormat.setOutputPath(jobConf_, new Path(output_));
    // Reset the scratch variable; from here on it holds the output format.
    fmt = null;
    if (outputFormatSpec_ != null) {
        c = StreamUtil.goodClassOrNull(jobConf_, outputFormatSpec_, defaultPackage);
        if (c != null) {
            fmt = c;
        } else {
            fail("-outputformat : class not found : " + outputFormatSpec_);
        }
    }
    // Default output format when none was resolved above.
    if (fmt == null) {
        fmt = TextOutputFormat.class;
    }
    // With lazyOutput_ set, the format is registered through LazyOutputFormat
    // instead of being set directly on the JobConf.
    if (lazyOutput_) {
        LazyOutputFormat.setOutputFormatClass(jobConf_, fmt);
    } else {
        jobConf_.setOutputFormat(fmt);
    }
    // ---- Partitioner ----
    if (partitionerSpec_ != null) {
        c = StreamUtil.goodClassOrNull(jobConf_, partitionerSpec_, defaultPackage);
        if (c != null) {
            jobConf_.setPartitionerClass(c);
        } else {
            fail("-partitioner : class not found : " + partitionerSpec_);
        }
    }
    // ---- Debug scripts ----
    if (mapDebugSpec_ != null) {
        jobConf_.setMapDebugScript(mapDebugSpec_);
    }
    if (reduceDebugSpec_ != null) {
        jobConf_.setReduceDebugScript(reduceDebugSpec_);
    }
    // last, allow user to override anything
    // (although typically used with properties we didn't touch)
    // NOTE(review): no override step follows the comment above in this method;
    // it looks stale -- confirm and remove.
    // ---- Job jar ----
    jar_ = packageJobJar();
    if (jar_ != null) {
        jobConf_.setJar(jar_);
    }
    // ---- Distributed cache ----
    // NOTE(review): getURIs presumably fills fileURIs and archiveURIs from the
    // two specs -- its body is not visible here; confirm.
    if ((cacheArchives != null) || (cacheFiles != null)) {
        getURIs(cacheArchives, cacheFiles);
        boolean b = DistributedCache.checkURIs(fileURIs, archiveURIs);
        if (!b)
            fail(LINK_URI);
    }
    // set the jobconf for the caching parameters
    if (cacheArchives != null)
        DistributedCache.setCacheArchives(archiveURIs, jobConf_);
    if (cacheFiles != null)
        DistributedCache.setCacheFiles(fileURIs, jobConf_);
    // In verbose mode, list the job configuration properties.
    if (verbose_) {
        listJobConfProperties();
    }
    msg("submitting to jobconf: " + getJobTrackerHostPort());
}
Also used : Path(org.apache.hadoop.fs.Path) IdentifierResolver(org.apache.hadoop.streaming.io.IdentifierResolver) SequenceFileInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat) KeyValueTextInputFormat(org.apache.hadoop.mapred.KeyValueTextInputFormat) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) SequenceFileAsTextInputFormat(org.apache.hadoop.mapred.SequenceFileAsTextInputFormat) SequenceFileAsTextInputFormat(org.apache.hadoop.mapred.SequenceFileAsTextInputFormat) KeyValueTextInputFormat(org.apache.hadoop.mapred.KeyValueTextInputFormat) JobConf(org.apache.hadoop.mapred.JobConf)

Aggregations

Path (org.apache.hadoop.fs.Path): 1, JobConf (org.apache.hadoop.mapred.JobConf): 1, KeyValueTextInputFormat (org.apache.hadoop.mapred.KeyValueTextInputFormat): 1, SequenceFileAsTextInputFormat (org.apache.hadoop.mapred.SequenceFileAsTextInputFormat): 1, SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat): 1, TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 1, IdentifierResolver (org.apache.hadoop.streaming.io.IdentifierResolver): 1