Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class MapRedExecutor, method getMapRed:
/**
 * This method returns a Job instance out of a {@link MSCR} entity. It puts the class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the Hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by Hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path of the workflow
 * @param outputPath The output path of the MapRed job
 * @return A Hadoop-executable MapRed Job
 *
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath) throws IOException {
  Configuration conf = new Configuration();
  conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
  conf.setInt(MSCR_ID, mscr.getId());
  conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);
  // TODO deprecation
  Job job = new Job(conf, "MSCR");
  job.setMapOutputKeyClass(PlumeObject.class);
  job.setMapOutputValueClass(PlumeObject.class);
  job.setJarByClass(MapRedExecutor.class);
  /**
   * Define multiple inputs
   */
  for (PCollection<?> input : mscr.getInputs()) {
    if (!(input instanceof LazyCollection)) {
      throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyCollection");
    }
    LazyCollection<Text> l = (LazyCollection<Text>) input;
    if (!(l.isMaterialized() && l.getFile() != null)) {
      // Collections have a plume ID only if they are intermediate results - TODO better naming for this
      if (l.getPlumeId().length() < 1) {
        throw new IllegalArgumentException("Can't create MapRed from MSCR inputs that are not materialized to a file");
      }
    }
    PCollectionType<?> rType = l.getType();
    Class<? extends InputFormat> format = SequenceFileInputFormat.class;
    if (rType instanceof PTableType) {
      PTableType<?, ?> tType = (PTableType<?, ?>) rType;
      if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
        format = KeyValueTextInputFormat.class;
      }
      MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
    } else {
      if (rType.elementType() instanceof StringType) {
        format = TextInputFormat.class;
      }
      MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
    }
  }
  /**
   * Define multiple outputs
   */
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
    PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output).getType();
    if (rType instanceof PTableType) {
      PTableType<?, ?> tType = (PTableType<?, ?>) rType;
      Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
      if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
        outputFormat = TextOutputFormat.class;
      }
      MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
    } else {
      Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
      if (rType.elementType() instanceof StringType) {
        outputFormat = TextOutputFormat.class;
      }
      MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, NullWritable.class, getHadoopType(rType.elementType()));
    }
  }
  /**
   * Define Reducer & Combiner
   */
  job.setCombinerClass(MSCRCombiner.class);
  job.setReducerClass(MSCRReducer.class);
  job.setNumReduceTasks(1);
  return job;
}
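
For context, a minimal sketch of how the returned Job could be submitted (assuming same-package access, since getMapRed is package-private; MyWorkflow, buildOptimizedMscr, and both path strings are hypothetical stand-ins, not part of the listing above):

import org.apache.hadoop.mapreduce.Job;

public class SubmitSketch {
  public static void main(String[] args) throws Exception {
    PlumeWorkflow workFlow = new MyWorkflow(); // hypothetical PlumeWorkflow subclass
    MSCR mscr = buildOptimizedMscr(workFlow); // hypothetical helper, see below
    Job job = MapRedExecutor.getMapRed(mscr, workFlow, "/tmp/plume-tmp", "/tmp/plume-out");
    // waitForCompletion(true) is the standard Hadoop submission call
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }

  // Placeholder: in a real run the MSCR would come out of the Optimizer
  static MSCR buildOptimizedMscr(PlumeWorkflow workFlow) {
    throw new UnsupportedOperationException("placeholder for illustration");
  }
}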
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class Optimizer, method removeUnnecessaryOps:
/**
 * Removes unnecessary operations that are not removed by the Optimizer. It goes top-down (receives an input).
 * Returns true if the passed node doesn't lead to an output.
 */
boolean removeUnnecessaryOps(PCollection arg, List<PCollection> outputs) {
  LazyCollection<?> input = (LazyCollection) arg;
  if (input.getDownOps() == null || input.getDownOps().size() == 0) {
    // Leaf node
    return !outputs.contains(input);
  }
  // create a new list of deferred ops that are 'useful'
  List<DeferredOp> finalDOps = new ArrayList<DeferredOp>();
  for (DeferredOp op : input.getDownOps()) {
    boolean remove = false;
    if (op instanceof OneToOneOp) {
      remove = removeUnnecessaryOps(((OneToOneOp<?, ?>) op).getDest(), outputs);
    } else if (op instanceof ParallelDo) {
      remove = removeUnnecessaryOps(((ParallelDo<?, ?>) op).getDest(), outputs);
    } else if (op instanceof Flatten) {
      remove = removeUnnecessaryOps(((Flatten<?>) op).getDest(), outputs);
    } else if (op instanceof MultipleParallelDo) {
      MultipleParallelDo<?> mPDo = (MultipleParallelDo<?>) op;
      // start with true because we AND the result across all children
      remove = true;
      for (Object entry : mPDo.getDests().entrySet()) {
        PCollection<?> pCol = (PCollection<?>) ((Map.Entry) entry).getKey();
        remove = remove & removeUnnecessaryOps(pCol, outputs);
      }
    }
    if (!remove) {
      finalDOps.add(op);
    }
  }
  input.downOps = finalDOps;
  // if true, this node can also be removed
  return finalDOps.size() == 0;
}
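
The invariant here - a node is removable only when every path below it fails to reach an output - is easy to see in isolation. A self-contained sketch of the same post-order AND-gate pruning over a plain tree (illustrative names, not Plume classes):

import java.util.*;

class PruneSketch {
  static class Node {
    List<Node> children = new ArrayList<Node>();
  }

  // Returns true if 'node' leads to no output and can therefore be pruned,
  // mirroring removeUnnecessaryOps above.
  static boolean prune(Node node, Set<Node> outputs) {
    if (node.children.isEmpty()) {
      return !outputs.contains(node); // leaf: removable unless it is an output
    }
    List<Node> kept = new ArrayList<Node>();
    for (Node child : node.children) {
      // Visit every child (no short-circuit), keeping those that reach an output
      if (!prune(child, outputs)) {
        kept.add(child);
      }
    }
    node.children = kept;
    return kept.isEmpty(); // removable only if ALL children were removable
  }
}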
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class Optimizer, method sinkFlattens:
/**
 * Sinks Flatten operations, pushing them down the tree to create opportunities for ParallelDo fusion.
 * @param arg The collection that may contain flatten operations that we need to sink.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void sinkFlattens(PCollection<T> arg) {
  LazyCollection<T> output = (LazyCollection<T>) arg;
  if (output.isMaterialized()) {
    // stop condition for the recursive algorithm
    return;
  }
  DeferredOp dOp = output.getDeferredOp();
  if (!(dOp instanceof Flatten)) {
    if (dOp instanceof OneToOneOp) {
      // Recursively apply this function to the parent
      sinkFlattens(((OneToOneOp) dOp).getOrigin());
    } else if (dOp instanceof ParallelDo) {
      // Recursively apply this function to the parent
      sinkFlattens(((ParallelDo) dOp).getOrigin());
    }
    // Nothing to sink here; returning also guards the Flatten casts below
    return;
  }
  if (output.getDownOps() == null || output.getDownOps().size() != 1) {
    // Recursively apply this function to the parents
    for (Object col : ((Flatten) dOp).getOrigins()) {
      sinkFlattens((PCollection) col);
    }
    return;
  }
  DeferredOp downOp = output.getDownOps().get(0);
  if (!(downOp instanceof ParallelDo)) {
    return;
  }
  // PDo below the current node
  ParallelDo<T, ?> op = (ParallelDo<T, ?>) downOp;
  // Flatten above the current node
  Flatten<T> flatten = (Flatten<T>) dOp;
  List<PCollection<?>> newOrigins = new ArrayList<PCollection<?>>();
  // Iterate over all of the Flatten's inputs
  for (PCollection<T> col : flatten.getOrigins()) {
    // Recursively apply this function to this flatten's origin
    LazyCollection<T> fInput = (LazyCollection<T>) col;
    sinkFlattens(fInput);
    // Sink
    LazyCollection<?> newInput = new LazyCollection();
    newInput.deferredOp = new ParallelDo(op.getFunction(), fInput, newInput);
    newInput.type = ((LazyCollection) flatten.getDest()).getType();
    fInput.downOps.remove(0);
    fInput.addDownOp(newInput.deferredOp);
    newOrigins.add(newInput);
  }
  Flatten<?> newFlatten = new Flatten(newOrigins, op.getDest());
  ((LazyCollection<?>) op.getDest()).deferredOp = newFlatten;
  for (PCollection<?> newOp : newOrigins) {
    ((LazyCollection<?>) newOp).addDownOp(newFlatten);
  }
}
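
The rewrite performed here is semantics-preserving: Flatten(a, b) followed by ParallelDo(f) yields the same elements as ParallelDo(f) applied to each input followed by a Flatten of the results. A self-contained sketch of that equivalence over plain lists (illustrative names, not Plume classes):

import java.util.*;
import java.util.function.Function;

class SinkSketch {
  // Shape before sinking: Flatten first, then the function
  static <T, U> List<U> flattenThenMap(List<List<T>> inputs, Function<T, U> f) {
    List<T> flat = new ArrayList<T>();
    for (List<T> in : inputs) flat.addAll(in); // Flatten
    List<U> out = new ArrayList<U>();
    for (T t : flat) out.add(f.apply(t)); // ParallelDo(f)
    return out;
  }

  // Shape after sinking: the function per input, then Flatten
  static <T, U> List<U> mapThenFlatten(List<List<T>> inputs, Function<T, U> f) {
    List<U> out = new ArrayList<U>();
    for (List<T> in : inputs) {
      for (T t : in) out.add(f.apply(t)); // ParallelDo(f) sunk into each branch
    }
    return out; // the concatenation plays the role of the final Flatten
  }
}

Both shapes produce the same list, which is what lets the optimizer sink the Flatten below the ParallelDo and open up fusion opportunities.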
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class Optimizer, method optimize:
/**
* Optimizes an execution tree
*
* @param inputs A list of the inputs.
* @param outputs A list of the outputs.
* @return An optimized dataflow that consists of MSCR operations decorated with functional
* compositions.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public ExecutionStep optimize(List<PCollection> inputs, List<PCollection> outputs) {
  if (outputs == null || outputs.size() == 0) {
    throw new IllegalArgumentException("Empty output list");
  }
  if (inputs == null || inputs.size() == 0) {
    throw new IllegalArgumentException("Empty input list");
  }
  for (PCollection output : outputs) {
    sinkFlattens(output);
  }
  for (PCollection output : outputs) {
    fuseParallelDos(output);
  }
  for (PCollection output : outputs) {
    fuseSiblingParallelDos(output);
  }
  // Clean the optimized tree
  for (PCollection input : inputs) {
    removeUnnecessaryOps(input, outputs);
  }
  Set<MSCR> mscrs = OptimizerTools.getMSCRBlocks(outputs);
  // Build a map of output -> MSCR step
  Map<PCollection<?>, MSCR> outputMap = new HashMap<PCollection<?>, MSCR>();
  for (MSCR mscr : mscrs) {
    for (Map.Entry<PCollection<?>, MSCR.OutputChannel<?, ?, ?>> entry : mscr.getOutputChannels().entrySet()) {
      MSCR.OutputChannel<?, ?, ?> oC = entry.getValue();
      outputMap.put(oC.output, mscr);
    }
  }
  // Calculate dependencies between MSCRs
  Map<MSCR, Set<MSCR>> dependencyMap = new HashMap<MSCR, Set<MSCR>>();
  Set<MSCR> beginningMscrs = new HashSet<MSCR>();
  for (MSCR mscr : mscrs) {
    for (PCollection<?> input : mscr.getInputs()) {
      if (inputs.contains(input)) {
        beginningMscrs.add(mscr);
      }
      MSCR dependsOn = outputMap.get(input);
      if (dependsOn == null) {
        continue;
      }
      Set<MSCR> dependencies = dependencyMap.get(mscr);
      if (dependencies == null) {
        dependencies = new HashSet<MSCR>();
      }
      dependencies.add(dependsOn);
      dependencyMap.put(mscr, dependencies);
    }
  }
  ExecutionStep firstStep = new ExecutionStep();
  for (MSCR step : beginningMscrs) {
    if (dependencyMap.get(step) == null) {
      firstStep.mscrSteps.add(step);
    }
  }
  // Calculate the execution plan
  Set<MSCR> solvedSteps = new HashSet<MSCR>();
  solvedSteps.addAll(firstStep.mscrSteps);
  ExecutionStep previousStep = firstStep;
  while (!solvedSteps.containsAll(mscrs)) {
    ExecutionStep nextStep = new ExecutionStep();
    for (MSCR mscr : mscrs) {
      if (solvedSteps.contains(mscr)) {
        continue;
      }
      Set<MSCR> dependencies = dependencyMap.get(mscr);
      if (solvedSteps.containsAll(dependencies)) {
        nextStep.mscrSteps.add(mscr);
        solvedSteps.add(mscr);
      }
    }
    previousStep.nextStep = nextStep;
    previousStep = nextStep; // advance the chain so later waves don't overwrite this link
  }
  return firstStep;
}
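
The plan-building loop at the end is a level-by-level topological sort: each ExecutionStep holds every MSCR whose dependencies were solved by earlier steps. The same scheduling idea as a standalone sketch, with plain strings in place of MSCR objects (illustrative names):

import java.util.*;

class LevelScheduleSketch {
  // Groups tasks into waves; each wave depends only on earlier waves.
  static List<Set<String>> schedule(Set<String> tasks, Map<String, Set<String>> deps) {
    List<Set<String>> levels = new ArrayList<Set<String>>();
    Set<String> solved = new HashSet<String>();
    while (!solved.containsAll(tasks)) {
      Set<String> level = new HashSet<String>();
      for (String t : tasks) {
        if (solved.contains(t)) continue;
        Set<String> d = deps.containsKey(t) ? deps.get(t) : Collections.<String>emptySet();
        if (solved.containsAll(d)) level.add(t);
      }
      if (level.isEmpty()) {
        throw new IllegalStateException("cyclic or unsatisfiable dependencies");
      }
      solved.addAll(level); // commit the wave before scanning again
      levels.add(level);
    }
    return levels;
  }
}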
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class OptimizerTools, method getMSCRBlocks:
/**
 * This utility returns all the different MSCR blocks that can be created from this plan.
 *
 * (pere) As of Oct/2010, I think this code can be simplified to be more like addRemainingTrivialMSCRs(), so a possible TODO would be
 * to refactor it and make it more understandable. An open question is whether there is an easy way of finding all possible
 * MSCRs (including trivial ones, not related to GroupByKey operations) in a single and elegant loop.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
static Set<MSCR> getMSCRBlocks(List<PCollection> outputs) {
  // Get all GroupByKeys from the tree
  List<DeferredOp> groupBys = OptimizerTools.getAll(outputs, GroupByKey.class);
  int mscrId = 1;
  Set<MSCR> mscrs = new HashSet<MSCR>();
  // For all found GroupByKey blocks
  for (DeferredOp gBK : groupBys) {
    GroupByKey groupBy = (GroupByKey<?, ?>) gBK;
    // Gather all the information needed for the MSCR from this GBK
    Set<PCollection<?>> inputs = new HashSet<PCollection<?>>();
    Set<GroupByKey<?, ?>> outputChannels = new HashSet<GroupByKey<?, ?>>();
    Set<Flatten<?>> unGroupedOutputChannels = new HashSet<Flatten<?>>();
    Set<PCollection<?>> bypassChannels = new HashSet<PCollection<?>>();
    Stack<LazyCollection<?>> toVisit = new Stack<LazyCollection<?>>();
    Set<LazyCollection<?>> visited = new HashSet<LazyCollection<?>>();
    LazyCollection<?> origin = (LazyCollection<?>) groupBy.getOrigin();
    toVisit.push(origin);
    outputChannels.add(groupBy);
    while (!toVisit.isEmpty()) {
      LazyCollection<?> current = toVisit.pop();
      visited.add(current);
      if (current.isMaterialized()) {
        // first condition for being a materialized input. This may change.
        inputs.add(current);
        continue;
      }
      DeferredOp op = current.getDeferredOp();
      if (op instanceof MultipleParallelDo) {
        // second condition for being an input
        MultipleParallelDo<?> mPDo = (MultipleParallelDo) current.getDeferredOp();
        if (((LazyCollection<?>) mPDo.getOrigin()).isMaterialized()) {
          // will be done in the Mapper
          inputs.add(mPDo.getOrigin());
        } else if (op instanceof ParallelDo) {
          // will be done in the Reducer
          inputs.add(current);
        } else {
          // will be done in the Mapper
          inputs.add(mPDo.getOrigin());
        }
        // Check for bypass channels & output channels with no group-by
        for (Map.Entry entry : mPDo.getDests().entrySet()) {
          LazyCollection coll = (LazyCollection) entry.getKey();
          if (coll.getDownOps() == null || coll.getDownOps().size() == 0) {
            // leaf node
            bypassChannels.add(coll);
          } else if (coll.getDownOps().get(0) instanceof MultipleParallelDo) {
            bypassChannels.add(coll);
          } else if (coll.getDownOps().get(0) instanceof Flatten) {
            /*
             * Case of an output channel that Flattens with no GroupBy
             */
            Flatten<?> thisFlatten = (Flatten<?>) coll.getDownOps().get(0);
            LazyCollection ldest = (LazyCollection) thisFlatten.getDest();
            if (ldest.getDownOps() == null || ldest.getDownOps().size() == 0 || ldest.getDownOps().get(0) instanceof MultipleParallelDo) {
              unGroupedOutputChannels.add(thisFlatten);
              // Add the rest of this flatten's origins to the stack in order to possibly discover more output channels
              for (PCollection<?> col : thisFlatten.getOrigins()) {
                if (!visited.contains(col)) {
                  toVisit.push((LazyCollection<?>) col);
                }
              }
            }
          }
        }
        continue;
      }
      if (op instanceof GroupByKey) {
        // third condition for being an input - the rare case when one GBK follows another
        inputs.add(current);
        continue;
      }
      if (op instanceof Flatten) {
        Flatten<?> flatten = (Flatten<?>) op;
        for (PCollection<?> input : flatten.getOrigins()) {
          LazyCollection<?> in = (LazyCollection<?>) input;
          if (!visited.contains(in)) {
            toVisit.push(in);
          }
        }
        continue;
      }
      if (op instanceof OneToOneOp) {
        LazyCollection<?> input = (LazyCollection<?>) ((OneToOneOp<?, ?>) op).getOrigin();
        if (!visited.contains(input)) {
          toVisit.push(input);
        }
        continue;
      }
    }
    MSCR mscrToAdd = null;
    // Check if there is already an MSCR with at least one of these inputs
    for (MSCR mscr : mscrs) {
      for (PCollection<?> input : inputs) {
        if (mscr.hasInput(input)) {
          mscrToAdd = mscr;
          break;
        }
      }
    }
    if (mscrToAdd == null) {
      // otherwise create a new MSCR
      mscrToAdd = new MSCR(mscrId);
      mscrId++;
    }
    // Add all missing input channels to the current MSCR
    for (PCollection<?> input : inputs) {
      if (!mscrToAdd.hasInput(input)) {
        mscrToAdd.addInput(input);
      }
    }
    // Add all missing bypass outputs to the current MSCR
    for (PCollection<?> col : bypassChannels) {
      if (!mscrToAdd.hasOutputChannel(col)) {
        // Create a new bypass channel
        MSCR.OutputChannel oC = new MSCR.OutputChannel(col);
        mscrToAdd.addOutputChannel(oC);
      }
    }
    // Add all missing flatten-with-no-groupby outputs to the current MSCR
    for (Flatten flatten : unGroupedOutputChannels) {
      if (!mscrToAdd.hasOutputChannel(flatten.getDest())) {
        // Create a new channel with the flatten and nothing else
        MSCR.OutputChannel oC = new MSCR.OutputChannel(flatten.getDest());
        oC.output = flatten.getDest();
        oC.flatten = flatten;
        mscrToAdd.addOutputChannel(oC);
      }
    }
    // Add all missing output channels to the current MSCR
    for (GroupByKey groupByKey : outputChannels) {
      if (!mscrToAdd.hasOutputChannel(groupByKey.getOrigin())) {
        // Create a new channel with the group by key. It might have a combiner and a reducer as well.
        MSCR.OutputChannel oC = new MSCR.OutputChannel(groupByKey);
        oC.output = groupByKey.getDest();
        if (groupByKey.getOrigin().getDeferredOp() instanceof Flatten) {
          oC.flatten = (Flatten) groupByKey.getOrigin().getDeferredOp();
        }
        if (groupByKey.getDest().getDownOps() != null && groupByKey.getDest().getDownOps().size() == 1) {
          DeferredOp op = (DeferredOp) groupByKey.getDest().getDownOps().get(0);
          if (op instanceof CombineValues) {
            oC.combiner = (CombineValues) op;
            oC.output = oC.combiner.getDest();
            LazyCollection dest = (LazyCollection) oC.combiner.getDest();
            if (dest.getDownOps() != null && dest.getDownOps().size() == 1) {
              op = (DeferredOp) dest.getDownOps().get(0);
            }
          }
          if (op instanceof ParallelDo) {
            oC.reducer = (ParallelDo) op;
            oC.output = oC.reducer.getDest();
          }
        }
        mscrToAdd.addOutputChannel(oC);
      }
    }
    // Add if needed
    mscrs.add(mscrToAdd);
  }
  return addRemainingTrivialMSCRs(outputs, mscrId, mscrs);
}
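
Downstream, each MSCR returned here becomes one Hadoop job via MapRedExecutor.getMapRed above. A small sketch of inspecting the result, using only accessors that appear in these listings (visibility permitting; the printing is illustrative):

static void printMscrSummary(List<PCollection> outputs) {
  Set<MSCR> mscrs = OptimizerTools.getMSCRBlocks(outputs);
  for (MSCR mscr : mscrs) {
    System.out.println("MSCR " + mscr.getId()
        + ": " + mscr.getInputs().size() + " input(s), "
        + mscr.getOutputChannels().size() + " output channel(s)");
  }
}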