Example 1 with StringType

Use of com.tdunning.plume.types.StringType in project Plume by tdunning.

From the class MapRedExecutor, method getMapRed:

/**
   * Builds a hadoop-executable {@link Job} out of a {@link MSCR} entity. It stores the class name
   * of the {@link PlumeWorkflow} argument and the MSCR id in the hadoop configuration so that
   * mappers and reducers can re-instantiate the workflow.
   * 
   * @param mscr The MSCR to convert 
   * @param workFlow The workflow whose class will be instantiated by hadoop mappers/reducers
   * @param workFlowOutputPath The temporary path where intermediate workflow results are materialized
   * @param outputPath The output path of the MapRed job
   * @return A hadoop-executable MapRed Job
   * 
   * @throws IOException
   */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath) throws IOException {
    Configuration conf = new Configuration();
    conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
    conf.setInt(MSCR_ID, mscr.getId());
    conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);
    // Job.getInstance replaces the Job(Configuration, String) constructor deprecated in Hadoop 2.x (assumed target).
    Job job = Job.getInstance(conf, "MSCR");
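    // Mappers emit PlumeObject wrappers for both keys and values, so heterogeneous channels can share one shuffle.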
    job.setMapOutputKeyClass(PlumeObject.class);
    job.setMapOutputValueClass(PlumeObject.class);
    job.setJarByClass(MapRedExecutor.class);
    // Define multiple inputs
    for (PCollection<?> input : mscr.getInputs()) {
        if (!(input instanceof LazyCollection)) {
            throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyTable");
        }
        LazyCollection<Text> l = (LazyCollection<Text>) input;
        if (!(l.isMaterialized() && l.getFile() != null)) {
            // Collections have plume ID only if they are intermediate results - TODO better naming for this
            if (l.getPlumeId().isEmpty()) {
                throw new IllegalArgumentException("Can't create MapRed from MSCR inputs that are not materialized to a file");
            }
        }
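        // Pick an input format from the collection's type: string-typed data can be read as plain text, anything else as SequenceFiles.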
        PCollectionType<?> rType = l.getType();
        Class<? extends InputFormat> format = SequenceFileInputFormat.class;
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
                format = KeyValueTextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        } else {
            if (rType.elementType() instanceof StringType) {
                format = TextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        }
    }
    // Define multiple outputs
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
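    // Each numbered MSCR channel becomes a named output; the format choice mirrors the input side.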
    for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
        PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output).getType();
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
        } else {
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (rType.elementType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, NullWritable.class, getHadoopType(rType.elementType()));
        }
    }
    // Define Reducer & Combiner
    job.setCombinerClass(MSCRCombiner.class);
    job.setReducerClass(MSCRReducer.class);
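    // A single reduce task gathers every output channel.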
    job.setNumReduceTasks(1);
    return job;
}
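
For context, here is a minimal sketch of how a caller might submit the Job this method returns. getMapRed has package-private access, so the caller would sit in the same package; the method name, paths, and error handling below are assumptions for illustration, not Plume code.

import java.io.IOException;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver in the same package as MapRedExecutor.
static void runMSCR(MSCR mscr, PlumeWorkflow workflow) throws IOException, InterruptedException, ClassNotFoundException {
    // The temp path receives materialized intermediates; the output path receives the named channel outputs.
    Job job = MapRedExecutor.getMapRed(mscr, workflow, "/tmp/plume/tmp", "/tmp/plume/out");
    // Submit and block until completion; 'true' streams progress to the client.
    if (!job.waitForCompletion(true)) {
        throw new IOException("MSCR " + mscr.getId() + " failed");
    }
}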
Also used :

Path (org.apache.hadoop.fs.Path)
SequenceFileOutputFormat (org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat)
Configuration (org.apache.hadoop.conf.Configuration)
StringType (com.tdunning.plume.types.StringType)
SequenceFileInputFormat (org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat)
PTableType (com.tdunning.plume.types.PTableType)
Text (org.apache.hadoop.io.Text)
PCollection (com.tdunning.plume.PCollection)
Job (org.apache.hadoop.mapreduce.Job)
Map (java.util.Map)

Aggregations

PCollection (com.tdunning.plume.PCollection): 1 usage
PTableType (com.tdunning.plume.types.PTableType): 1 usage
StringType (com.tdunning.plume.types.StringType): 1 usage
Map (java.util.Map): 1 usage
Configuration (org.apache.hadoop.conf.Configuration): 1 usage
Path (org.apache.hadoop.fs.Path): 1 usage
Text (org.apache.hadoop.io.Text): 1 usage
Job (org.apache.hadoop.mapreduce.Job): 1 usage
SequenceFileInputFormat (org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat): 1 usage
SequenceFileOutputFormat (org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat): 1 usage