Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class ResultMergeRemoteSpark, method executeMerge.
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, long rlen, long clen, int brlen, int bclen) {
String jobname = "ParFor-RMSP";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
SparkExecutionContext sec = (SparkExecutionContext) _ec;
boolean withCompare = (compare != null);
RDDObject ret = null;
// determine degree of parallelism
int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
// sanity check for empty src files
if (inputs == null || inputs.length == 0)
throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
try {
// note: the initial implementation via a union over all result rdds was discarded due to
// stack overflow errors with many parfor tasks, and thus many rdds
// Step 1: construct input rdd from all result files of parfor workers
// a) construct job conf with all files
InputInfo ii = InputInfo.BinaryBlockInputInfo;
JobConf job = new JobConf(ResultMergeRemoteMR.class);
job.setJobName(jobname);
job.setInputFormat(ii.inputFormatClass);
Path[] paths = new Path[inputs.length];
for (int i = 0; i < paths.length; i++) {
// ensure input exists on hdfs (e.g., if in-memory or RDD)
inputs[i].exportData();
paths[i] = new Path(inputs[i].getFileName());
// update rdd handle to allow lazy evaluation by guarding
// against cleanup of temporary result files
setRDDHandleForMerge(inputs[i], sec);
}
FileInputFormat.setInputPaths(job, paths);
// b) create rdd from input files w/ deep copy of keys and blocks
JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext().hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass).mapPartitionsToPair(new CopyBlockPairFunction(true), true);
// Step 2a: merge with compare
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
if (withCompare) {
JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
// merge values which differ from compare values
ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(_isAccum);
// group all result blocks per key, join with the compare blocks, and merge the result blocks against the compare block
out = rdd.groupByKey(numRed).join(compareRdd).mapToPair(cfun);
} else {
// Step 2b: merge without compare
// direct merge in any order (disjointness guaranteed)
out = _isAccum ? RDDAggregateUtils.sumByKeyStable(rdd, false) : RDDAggregateUtils.mergeByKey(rdd, false);
}
// Step 3: create output rdd handle w/ lineage
ret = new RDDObject(out);
for (int i = 0; i < paths.length; i++) ret.addLineageChild(inputs[i].getRDDHandle());
if (withCompare)
ret.addLineageChild(compare.getRDDHandle());
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
// maintain statistics
Statistics.incrementNoOfCompiledSPInst();
Statistics.incrementNoOfExecutedSPInst();
if (DMLScript.STATISTICS) {
Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
}
return ret;
}
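
The group-join-merge pattern above can be illustrated with a small standalone sketch. Everything below (class name, toy keys and values, and the merge lambda that keeps the first value differing from the compare value) is illustrative and not part of SystemML; it only mirrors the shape of groupByKey(numRed).join(compareRdd).mapToPair(cfun).

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class MergeWithCompareSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("MergeWithCompareSketch").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // worker results: possibly several values per key (one per parfor worker)
      JavaPairRDD<Integer, Double> results = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 2.0), new Tuple2<>(1, 3.0), new Tuple2<>(2, 5.0)));
      // compare values: the original value per key before the parfor updates
      JavaPairRDD<Integer, Double> compare = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 0.0), new Tuple2<>(2, 5.0)));
      // group all result values per key, join with the compare value,
      // and keep the first result value that differs from the compare value
      JavaPairRDD<Integer, Double> merged = results.groupByKey(2)
        .join(compare)
        .mapToPair(t -> {
          double cmp = t._2()._2();
          double out = cmp;
          for (double v : t._2()._1())
            if (v != cmp) { out = v; break; }
          return new Tuple2<Integer, Double>(t._1(), out);
        });
      merged.foreach(kv -> System.out.println(kv._1() + " -> " + kv._2()));
    }
  }
}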
Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class CheckpointSPInstruction, method processInstruction.
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// early abort on non-existing inputs or boolean placeholders (the latter is valid if the relevant branches are never entered)
if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
// add a dummy entry to the input, which will be immediately overwritten by the null output.
sec.setVariable(input1.getName(), new BooleanObject(false));
sec.setVariable(output.getName(), new BooleanObject(false));
return;
}
// Step 1: early abort if the input is already cached in memory
// -------
// (for csv input files with unknown dimensions, we might have generated a checkpoint after
// csvreblock although not necessary because the csvreblock was subject to in-memory reblock)
CacheableData<?> obj = sec.getCacheableData(input1.getName());
if (obj.isCached(true)) {
// available in memory
sec.setVariable(output.getName(), obj);
return;
}
// get input rdd handle (for matrix or frame)
JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
// Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
// -------
// Note that persist is a transformation that is only triggered on demand by subsequent rdd operations.
// This prevents unnecessary overhead if the dataset is only consumed by cp operations.
JavaPairRDD<?, ?> out = null;
if (!in.getStorageLevel().equals(_level)) {
// (trigger coalesce if the intended number of partitions is exceeded by more than 20%
// and the rdd is not hash partitioned, to avoid losing the existing partitioner)
int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
// checkpoint pre-processing rdd operations
if (coalesce) {
// merge partitions without shuffle if too many partitions
out = in.coalesce(numPartitions);
} else {
// apply a narrow shallow copy to allow for short-circuit collects
if (input1.getDataType() == DataType.MATRIX)
out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
else if (input1.getDataType() == DataType.FRAME)
out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
}
// convert mcsr into memory-efficient csr if potentially sparse
if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
}
// actual checkpoint into given storage level
out = out.persist(_level);
// trigger nnz computation if required; otherwise the nnz would never be evaluated due to lazy evaluation in spark
if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
}
} else {
// pass-through
out = in;
}
// Step 3: In-place update of input matrix/frame rdd handle and set as output
// -------
// We use this in-place approach for two reasons. First, it is correct because our checkpoint
// injection rewrites guarantee that after checkpoint instructions there are no consumers on the
// given input. Second, it is beneficial because otherwise we would need to pass in-memory objects and
// filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
// caching and subsequent collects. Note that in-place update requires us to explicitly handle
// lineage information in order to prevent cycles on cleanup.
CacheableData<?> cd = sec.getCacheableData(input1.getName());
if (out != in) {
// prevent unnecessary lineage info
// guaranteed to exist (see above)
RDDObject inro = cd.getRDDHandle();
// create new rdd object
RDDObject outro = new RDDObject(out);
// mark as checkpointed
outro.setCheckpointRDD(true);
// keep lineage to prevent cycles on cleanup
outro.addLineageChild(inro);
cd.setRDDHandle(outro);
}
sec.setVariable(output.getName(), cd);
}
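
A simplified, standalone sketch of the coalesce-then-persist decision above. The 20% threshold mirrors the snippet, but the storage level, the preferred partition count, and all names are assumptions for illustration only.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;

public class CheckpointSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("CheckpointSketch").setMaster("local[4]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // deliberately over-partitioned input (16 partitions for 2 records)
      JavaPairRDD<Long, String> in = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1L, "block-1"), new Tuple2<>(2L, "block-2")), 16);
      StorageLevel level = StorageLevel.MEMORY_AND_DISK();
      int preferredPartitions = 4; // stand-in for SparkUtils.getNumPreferredPartitions
      JavaPairRDD<Long, String> out;
      if (!in.getStorageLevel().equals(level)) {
        // coalesce (no shuffle) only if the current number of partitions exceeds
        // the preferred number by more than 20%
        out = (1.2 * preferredPartitions < in.getNumPartitions())
          ? in.coalesce(preferredPartitions) : in;
        // persist is lazy: it only takes effect with the next action on 'out'
        out = out.persist(level);
      } else {
        // pass-through: already at the target storage level
        out = in;
      }
      System.out.println("partitions: " + out.getNumPartitions() + ", count: " + out.count());
    }
  }
}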
Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class QuantilePickSPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// get input rdds
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
// (in contrast to cp instructions, the case w/o weights does not materialize weights of 1)
switch(_type) {
case VALUEPICK:
{
ScalarObject quantile = ec.getScalarInput(input2);
double[] wt = getWeightedQuantileSummary(in, mc, quantile.getDoubleValue());
ec.setScalarOutput(output.getName(), new DoubleObject(wt[3]));
break;
}
case MEDIAN:
{
double[] wt = getWeightedQuantileSummary(in, mc, 0.5);
ec.setScalarOutput(output.getName(), new DoubleObject(wt[3]));
break;
}
case IQM:
{
double[] wt = getWeightedQuantileSummary(in, mc, 0.25, 0.75);
long key25 = (long) Math.ceil(wt[1]);
long key75 = (long) Math.ceil(wt[2]);
JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.filter(new FilterFunction(key25 + 1, key75, mc.getRowsPerBlock())).mapToPair(new ExtractAndSumFunction(key25 + 1, key75, mc.getRowsPerBlock()));
double sum = RDDAggregateUtils.sumStable(out).getValue(0, 0);
double val = MatrixBlock.computeIQMCorrection(sum, wt[0], wt[3], wt[5], wt[4], wt[6]);
ec.setScalarOutput(output.getName(), new DoubleObject(val));
break;
}
default:
throw new DMLRuntimeException("Unsupported qpick operation type: " + _type);
}
}
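
As a hedged illustration of the IQM branch's row-range filter-and-sum idea, the sketch below uses plain (rowIndex, value) pairs instead of MatrixIndexes/MatrixBlock; the range bounds stand in for key25 + 1 and key75 and are made up.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class RangeSumSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("RangeSumSketch").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // (rowIndex, value) pairs standing in for the sorted quantile input
      JavaPairRDD<Long, Double> in = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1L, 1.0), new Tuple2<>(2L, 2.0),
        new Tuple2<>(3L, 3.0), new Tuple2<>(4L, 4.0)));
      long lo = 2, hi = 3; // stand-ins for key25 + 1 and key75
      // keep only the rows in [lo, hi] and sum their values
      double sum = in.filter(t -> t._1() >= lo && t._1() <= hi)
        .values()
        .reduce(Double::sum);
      System.out.println("partial sum over [" + lo + "," + hi + "]: " + sum);
    }
  }
}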
Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class ReblockSPInstruction, method processFrameReblockInstruction.
@SuppressWarnings("unchecked")
protected void processFrameReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) {
FrameObject fo = sec.getFrameObject(input1.getName());
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
if (iinfo == InputInfo.TextCellInputInfo) {
// get the input textcell rdd
JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
// convert textcell to binary block
JavaPairRDD<Long, FrameBlock> out = FrameRDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, fo.getSchema());
// put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
} else if (iinfo == InputInfo.CSVInputInfo) {
// HACK ALERT: until we introduce the rewrite to insert csvrblock for non-persistent reads
// throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
CSVReblockSPInstruction csvInstruction = null;
boolean hasHeader = false;
String delim = ",";
boolean fill = false;
double fillValue = 0;
if (fo.getFileFormatProperties() instanceof CSVFileFormatProperties) {
CSVFileFormatProperties props = (CSVFileFormatProperties) fo.getFileFormatProperties();
hasHeader = props.hasHeader();
delim = props.getDelim();
fill = props.isFill();
fillValue = props.getFillValue();
}
csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
csvInstruction.processInstruction(sec);
} else {
throw new DMLRuntimeException("The given InputInfo is not implemented " + "for ReblockSPInstruction: " + InputInfo.inputInfoToString(iinfo));
}
}
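
For the textcell branch, the conversion itself is done by FrameRDDConverterUtils.textCellToBinaryBlock. The standalone sketch below only illustrates the first conceptual step (parsing "i j v" lines and keying cells by row index); assembling actual FrameBlocks per block-row is omitted, and all names are hypothetical.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class TextCellParseSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("TextCellParseSketch").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // "i j v" text-cell lines (1-based row index, column index, value)
      JavaRDD<String> lines = sc.parallelize(Arrays.asList("1 1 7.0", "1 2 3.0", "2 1 4.0"));
      // key each cell by its row index; the real converter additionally assembles
      // FrameBlocks per block-row, which is omitted here
      JavaPairRDD<Long, Tuple2<Long, Double>> cells = lines.mapToPair(s -> {
        String[] p = s.trim().split(" ");
        return new Tuple2<Long, Tuple2<Long, Double>>(Long.parseLong(p[0]),
          new Tuple2<Long, Double>(Long.parseLong(p[1]), Double.parseDouble(p[2])));
      });
      System.out.println("cells: " + cells.count());
    }
  }
}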
Use of org.apache.spark.api.java.JavaPairRDD in project incubator-systemml by apache.
The class SpoofSPInstruction, method createJoinedInputRDD.
private static JavaPairRDD<MatrixIndexes, MatrixBlock[]> createJoinedInputRDD(SparkExecutionContext sec, CPOperand[] inputs, boolean[] bcVect, boolean outer) {
// get input rdd for main input
int main = getMainInputIndex(inputs, bcVect);
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(inputs[main].getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(inputs[main].getName());
JavaPairRDD<MatrixIndexes, MatrixBlock[]> ret = in.mapValues(new MapInputSignature());
for (int i = 0; i < inputs.length; i++) if (i != main && inputs[i].getDataType().isMatrix() && !bcVect[i]) {
// create side input rdd
String varname = inputs[i].getName();
JavaPairRDD<MatrixIndexes, MatrixBlock> tmp = sec.getBinaryBlockRDDHandleForVariable(varname);
MatrixCharacteristics mcTmp = sec.getMatrixCharacteristics(varname);
// replicate blocks if mismatch with main input
if (outer && i == 2)
tmp = tmp.flatMapToPair(new ReplicateRightFactorFunction(mcIn.getRows(), mcIn.getRowsPerBlock()));
else if (mcIn.getNumRowBlocks() > mcTmp.getNumRowBlocks())
tmp = tmp.flatMapToPair(new ReplicateBlockFunction(mcIn.getRows(), mcIn.getRowsPerBlock(), false));
else if (mcIn.getNumColBlocks() > mcTmp.getNumColBlocks())
tmp = tmp.flatMapToPair(new ReplicateBlockFunction(mcIn.getCols(), mcIn.getColsPerBlock(), true));
// join main and side inputs and consolidate signature
ret = ret.join(tmp).mapValues(new MapJoinSignature());
}
return ret;
}
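
A toy sketch of the replicate-then-join alignment applied to side inputs. The replication lambda below is hypothetical and only mimics the idea of ReplicateBlockFunction for a single-block side input; it assumes the Spark 2.x Java API, where flatMapToPair returns an Iterator.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class ReplicateJoinSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("ReplicateJoinSketch").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      int numRowBlocks = 3;
      // main input: one entry per row-block index
      JavaPairRDD<Integer, String> mainRdd = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, "main-1"), new Tuple2<>(2, "main-2"), new Tuple2<>(3, "main-3")));
      // side input: a single block that has to be aligned with every row block
      JavaPairRDD<Integer, String> side = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, "side")));
      // replicate the side block once per row block so the subsequent join finds a match per key
      JavaPairRDD<Integer, String> sideRep = side.flatMapToPair(t -> {
        List<Tuple2<Integer, String>> out = new ArrayList<>();
        for (int i = 1; i <= numRowBlocks; i++)
          out.add(new Tuple2<>(i, t._2()));
        return out.iterator();
      });
      mainRdd.join(sideRep).foreach(kv ->
        System.out.println(kv._1() + ": " + kv._2()._1() + " + " + kv._2()._2()));
    }
  }
}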