Search in sources :

Example 1 with PTableType

use of com.tdunning.plume.types.PTableType in project Plume by tdunning.

the class MapRedExecutor method getMapRed.

/**
   * This method returns a Job instance out of a {@link MSCR} entity. It puts the Class of 
   * the {@link PlumeWorkflow} argument and the MSCR id in the hadoop configuration.
   * 
   * @param mscr The MSCR to convert 
   * @param workflow The workflow whose class will be instantiated by hadoop mappers/reducers
   * @param outputPath The output path of the MapRed job
   * @return A hadoop-executable MapRed Job
   * 
   * @throws IOException
   */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath) throws IOException {
    Configuration conf = new Configuration();
    conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
    conf.setInt(MSCR_ID, mscr.getId());
    conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);
    // TODO deprecation
    Job job = new Job(conf, "MSCR");
    job.setMapOutputKeyClass(PlumeObject.class);
    job.setMapOutputValueClass(PlumeObject.class);
    job.setJarByClass(MapRedExecutor.class);
    /**
     * Define multiple inputs
     */
    for (PCollection<?> input : mscr.getInputs()) {
        if (!(input instanceof LazyCollection)) {
            throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyTable");
        }
        LazyCollection<Text> l = (LazyCollection<Text>) input;
        if (!(l.isMaterialized() && l.getFile() != null)) {
            // Collections have plume ID only if they are intermediate results - TODO better naming for this
            if (l.getPlumeId().length() < 1) {
                throw new IllegalArgumentException("Can't create MapRed from MSCR inputs that are not materialized to a file");
            }
        }
        PCollectionType<?> rType = l.getType();
        Class<? extends InputFormat> format = SequenceFileInputFormat.class;
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
                format = KeyValueTextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        } else {
            if (rType.elementType() instanceof StringType) {
                format = TextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        }
    }
    /**
     * Define multiple outputs
     */
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
        PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output).getType();
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
        } else {
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (rType.elementType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, NullWritable.class, getHadoopType(rType.elementType()));
        }
    }
    /**
     * Define Reducer & Combiner
     */
    job.setCombinerClass(MSCRCombiner.class);
    job.setReducerClass(MSCRReducer.class);
    job.setNumReduceTasks(1);
    return job;
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFileOutputFormat(org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat) Configuration(org.apache.hadoop.conf.Configuration) StringType(com.tdunning.plume.types.StringType) SequenceFileInputFormat(org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat) PTableType(com.tdunning.plume.types.PTableType) Text(org.apache.hadoop.io.Text) PCollection(com.tdunning.plume.PCollection) Job(org.apache.hadoop.mapreduce.Job) Map(java.util.Map)

Example 2 with PTableType

use of com.tdunning.plume.types.PTableType in project Plume by tdunning.

the class MSCRMapper method map.

@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context<WritableComparable, WritableComparable, PlumeObject, PlumeObject> context) throws IOException, InterruptedException {
    LazyCollection<?> l = null;
    FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
    // Get LazyCollection for this input (according to FileSplit)
    for (PCollection<?> input : mscr.getInputs()) {
        LazyCollection<?> thisL = (LazyCollection<?>) input;
        if (thisL.getFile() == null) {
            // Convention for intermediate results
            thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
        }
        if (fS.getPath().toString().startsWith(thisL.getFile()) || fS.getPath().toString().startsWith("file:" + thisL.getFile())) {
            l = thisL;
            break;
        }
    }
    if (l == null) {
        throw new RuntimeException("Unable to match input split with any MSCR input");
    }
    // If this collection is a table -> process Pair, otherwise process value
    PCollectionType type = l.getType();
    Object toProcess = value;
    if (type instanceof PTableType) {
        toProcess = Pair.create(key, value);
    }
    for (DeferredOp op : l.getDownOps()) {
        if (op instanceof MultipleParallelDo) {
            MultipleParallelDo mPDo = ((MultipleParallelDo) op);
            for (Object entry : mPDo.getDests().entrySet()) {
                Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
                LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
                DeferredOp childOp = null;
                if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
                    childOp = lCol.getDownOps().get(0);
                }
                final Integer channel;
                if (childOp != null && childOp instanceof Flatten) {
                    channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
                } else if (childOp != null && childOp instanceof GroupByKey) {
                    channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
                } else {
                    // bypass channel?
                    channel = mscr.getNumberedChannels().get(en.getKey());
                }
                if (channel == null) {
                    // This is not for this MSCR - just skip it
                    return;
                }
                // Call parallelDo function
                en.getValue().process(toProcess, new EmitFn() {

                    @Override
                    public void emit(Object v) {
                        try {
                            if (v instanceof Pair) {
                                Pair p = (Pair) v;
                                context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
                            } else {
                                context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
                            }
                        } catch (Exception e) {
                            // TODO How to report this
                            e.printStackTrace();
                        }
                    }
                });
            }
        } else {
            if (op instanceof Flatten) {
                l = (LazyCollection) ((Flatten) op).getDest();
            }
            int channel = mscr.getNumberedChannels().get(l);
            if (toProcess instanceof Pair) {
                context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
            } else {
                context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
            }
        }
    }
}
Also used : MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) GroupByKey(com.tdunning.plume.local.lazy.op.GroupByKey) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) PTableType(com.tdunning.plume.types.PTableType) Flatten(com.tdunning.plume.local.lazy.op.Flatten) PCollectionType(com.tdunning.plume.types.PCollectionType) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) IOException(java.io.IOException) PCollection(com.tdunning.plume.PCollection) DoFn(com.tdunning.plume.DoFn) EmitFn(com.tdunning.plume.EmitFn) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) Map(java.util.Map) Pair(com.tdunning.plume.Pair)

Aggregations

PCollection (com.tdunning.plume.PCollection)2 PTableType (com.tdunning.plume.types.PTableType)2 Map (java.util.Map)2 DoFn (com.tdunning.plume.DoFn)1 EmitFn (com.tdunning.plume.EmitFn)1 Pair (com.tdunning.plume.Pair)1 PlumeObject (com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject)1 DeferredOp (com.tdunning.plume.local.lazy.op.DeferredOp)1 Flatten (com.tdunning.plume.local.lazy.op.Flatten)1 GroupByKey (com.tdunning.plume.local.lazy.op.GroupByKey)1 MultipleParallelDo (com.tdunning.plume.local.lazy.op.MultipleParallelDo)1 PCollectionType (com.tdunning.plume.types.PCollectionType)1 StringType (com.tdunning.plume.types.StringType)1 IOException (java.io.IOException)1 Configuration (org.apache.hadoop.conf.Configuration)1 Path (org.apache.hadoop.fs.Path)1 Text (org.apache.hadoop.io.Text)1 Job (org.apache.hadoop.mapreduce.Job)1 FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit)1 SequenceFileInputFormat (org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat)1