
Example 66 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.

From class BinarySPInstruction, method processMatrixBVectorBinaryInstruction.

protected void processMatrixBVectorBinaryInstruction(ExecutionContext ec, VectorType vtype) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // sanity check dimensions
    checkMatrixMatrixBinaryCharacteristics(sec);
    // get input RDDs
    String rddVar = input1.getName();
    String bcastVar = input2.getName();
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable(bcastVar);
    MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(rddVar);
    MatrixCharacteristics mc2 = sec.getMatrixCharacteristics(bcastVar);
    BinaryOperator bop = (BinaryOperator) _optr;
    boolean isOuter = (mc1.getRows() > 1 && mc1.getCols() == 1 && mc2.getRows() == 1 && mc2.getCols() > 1);
    // execute map binary operation
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    if (isOuter) {
        out = in1.flatMapToPair(new OuterVectorBinaryOpFunction(bop, in2));
    } else {
        // default case: use mapPartitionsToPair in order to preserve the partitioning
        // information, which is safe because the keys of binary matrix-vector operations
        // are guaranteed not to change; mapValues is not applicable here because the
        // function needs the block keys for broadcast lookups.
        // alternative: out = in1.mapToPair(new MatrixVectorBinaryOpFunction(bop, in2, vtype));
        out = in1.mapPartitionsToPair(new MatrixVectorBinaryOpPartitionFunction(bop, in2, vtype), true);
    }
    // set output RDD
    updateBinaryOutputMatrixCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), MatrixVectorBinaryOpPartitionFunction (org.apache.sysml.runtime.instructions.spark.functions.MatrixVectorBinaryOpPartitionFunction), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), BinaryOperator (org.apache.sysml.runtime.matrix.operators.BinaryOperator), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), OuterVectorBinaryOpFunction (org.apache.sysml.runtime.instructions.spark.functions.OuterVectorBinaryOpFunction)
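
The important detail above is the second argument of mapPartitionsToPair: passing preservesPartitioning=true tells Spark that the function leaves the keys untouched, so an existing partitioner survives the transformation. Below is a minimal, self-contained sketch of that effect against the plain Spark 2.x Java API (illustrative code, not SystemML code; the keys and values are arbitrary placeholders):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class PreservePartitioningSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sketch").setMaster("local[2]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaPairRDD<Long, Double> in = jsc
                .parallelizePairs(Arrays.asList(new Tuple2<>(1L, 1.0), new Tuple2<>(2L, 2.0)))
                .partitionBy(new HashPartitioner(2));
            // mapToPair could change the keys, so Spark drops the partitioner
            JavaPairRDD<Long, Double> viaMap =
                in.mapToPair(t -> new Tuple2<>(t._1(), t._2() + 1));
            // mapPartitionsToPair with preservesPartitioning=true keeps it,
            // which is only safe because the keys are not modified
            JavaPairRDD<Long, Double> viaMapPartitions = in.mapPartitionsToPair(it -> {
                List<Tuple2<Long, Double>> buf = new ArrayList<>();
                while (it.hasNext()) {
                    Tuple2<Long, Double> t = it.next();
                    buf.add(new Tuple2<>(t._1(), t._2() + 1));
                }
                return buf.iterator();
            }, true);
            System.out.println(viaMap.partitioner().isPresent());           // false
            System.out.println(viaMapPartitions.partitioner().isPresent()); // true
        }
    }
}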

Example 67 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.

From class BuiltinNarySPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    boolean cbind = getOpcode().equals("cbind");
    // compute output characteristics
    MatrixCharacteristics mcOut = computeOutputMatrixCharacteristics(sec, inputs, cbind);
    // get consolidated input via union over shifted and padded inputs
    MatrixCharacteristics off = new MatrixCharacteristics(0, 0, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), 0);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    for (CPOperand input : inputs) {
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input.getName());
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec
            .getBinaryBlockRDDHandleForVariable(input.getName())
            .flatMapToPair(new ShiftMatrix(off, mcIn, cbind))
            .mapToPair(new PadBlocksFunction(mcOut)); // just padding
        out = (out != null) ? out.union(in) : in;
        updateMatrixCharacteristics(mcIn, off, cbind);
    }
    // aggregate partially overlapping blocks w/ single shuffle
    int numPartOut = SparkUtils.getNumPreferredPartitions(mcOut);
    out = RDDAggregateUtils.mergeByKey(out, numPartOut, false);
    // set output RDD and add lineage
    sec.getMatrixCharacteristics(output.getName()).set(mcOut);
    sec.setRDDHandleForVariable(output.getName(), out);
    for (CPOperand input : inputs) sec.addLineageRDD(output.getName(), input.getName());
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), ShiftMatrix (org.apache.sysml.runtime.instructions.spark.AppendGSPInstruction.ShiftMatrix), CPOperand (org.apache.sysml.runtime.instructions.cp.CPOperand), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)
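
The off characteristics accumulate the row/column offset of all inputs consumed so far, and ShiftMatrix rewrites each block's MatrixIndexes by that offset. A hedged sketch of the underlying index arithmetic for the cbind case (shiftForCbind is a hypothetical helper, not the actual ShiftMatrix code; when the offset is not a multiple of the block size, a shifted block straddles two target blocks, which is exactly why the instruction uses flatMapToPair rather than mapToPair):

// hypothetical helper illustrating the aligned cbind shift, assuming the
// accumulated column offset is a multiple of the block size
static MatrixIndexes shiftForCbind(MatrixIndexes ix, long offsetCols, int blockSize) {
    long blockShift = offsetCols / blockSize; // whole block columns to shift right
    return new MatrixIndexes(ix.getRowIndex(), ix.getColumnIndex() + blockShift);
}
// e.g., with blockSize=1000 and a first input of 2000 columns, the second
// input's block (1,1) lands at block (1,3) of the consolidated output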

Example 68 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.

From class CheckpointSPInstruction, method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // Step 1: early abort on non-existing or in-memory (cached) inputs
    // -------
    // (a missing or boolean-typed input is valid if the relevant branches are never entered)
    if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
        // add a dummy entry to the input, which will be immediately overwritten by the null output.
        sec.setVariable(input1.getName(), new BooleanObject(false));
        sec.setVariable(output.getName(), new BooleanObject(false));
        return;
    }
    // -------
    // early abort if the input is already available in memory
    // (for csv input files with unknown dimensions, we might have generated a checkpoint
    // after csvreblock although not necessary, because the csvreblock was subject to
    // in-memory reblock)
    CacheableData<?> obj = sec.getCacheableData(input1.getName());
    if (obj.isCached(true)) {
        // available in memory
        sec.setVariable(output.getName(), obj);
        return;
    }
    // get input rdd handle (for matrix or frame)
    JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
    // Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
    // -------
    // Note that persist is lazy: caching is only triggered on demand by subsequent
    // actions on the RDD. This prevents unnecessary overhead if the dataset is only
    // consumed by CP (control program) operations.
    JavaPairRDD<?, ?> out = null;
    if (!in.getStorageLevel().equals(_level)) {
        // (trigger coalesce if the intended number of partitions is exceeded by more than
        // 20% and the RDD is not hash-partitioned, to avoid losing the existing partitioner)
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
        // checkpoint pre-processing rdd operations
        if (coalesce) {
            // merge partitions without shuffle if too many partitions
            out = in.coalesce(numPartitions);
        } else {
            // apply a narrow shallow copy to allow for short-circuit collects
            if (input1.getDataType() == DataType.MATRIX)
                out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
            else if (input1.getDataType() == DataType.FRAME)
                out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
        }
        // convert mcsr into memory-efficient csr if potentially sparse
        if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
            out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
        }
        // actual checkpoint into given storage level
        out = out.persist(_level);
        // if dimensions are known but the number of non-zeros is not, compute the nnz
        // explicitly; otherwise it would never be evaluated due to lazy evaluation in Spark
        if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
            mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
        }
    } else {
        // pass-through
        out = in;
    }
    // Step 3: In-place update of input matrix/frame rdd handle and set as output
    // -------
    // We use this in-place approach for two reasons. First, it is correct because our checkpoint
    // injection rewrites guarantee that after checkpoint instructions there are no consumers on the
    // given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
    // filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
    // caching and subsequent collects. Note that in-place update requires us to explicitly handle
    // lineage information in order to prevent cycles on cleanup.
    CacheableData<?> cd = sec.getCacheableData(input1.getName());
    if (out != in) {
        // only create a new lineage node for a new rdd, which prevents unnecessary
        // lineage info in the pass-through case
        // input rdd handle, guaranteed to exist (see early abort above)
        RDDObject inro = cd.getRDDHandle();
        // create new rdd object
        RDDObject outro = new RDDObject(out);
        // mark as checkpointed
        outro.setCheckpointRDD(true);
        // keep lineage to prevent cycles on cleanup
        outro.addLineageChild(inro);
        cd.setRDDHandle(outro);
    }
    sec.setVariable(output.getName(), cd);
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), Checkpoint (org.apache.sysml.lops.Checkpoint), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), CreateSparseBlockFunction (org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), CopyFrameBlockFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockFunction), BooleanObject (org.apache.sysml.runtime.instructions.cp.BooleanObject)
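
Two details in this instruction are easy to miss: persist only marks the RDD (nothing is materialized until the first action), and coalesce merges partitions without a shuffle. A standalone sketch of both behaviors against the plain Spark Java API (the 1.2 slack factor mirrors the 20% check above; numPreferred stands in for SparkUtils.getNumPreferredPartitions):

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

public class CheckpointSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sketch").setMaster("local[4]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            List<Integer> data = new ArrayList<>();
            for (int i = 0; i < 1000; i++) data.add(i);
            JavaRDD<Integer> in = jsc.parallelize(data, 40);
            int numPreferred = 8; // stand-in for SparkUtils.getNumPreferredPartitions
            // merge partitions without a shuffle if the actual partition count
            // exceeds the preferred count by more than 20%
            JavaRDD<Integer> out = (1.2 * numPreferred < in.getNumPartitions())
                ? in.coalesce(numPreferred) : in;
            StorageLevel level = StorageLevel.MEMORY_AND_DISK();
            // avoid redundant caching if already persisted at the target level
            if (!out.getStorageLevel().equals(level))
                out = out.persist(level); // lazy: only marks the RDD for caching
            out.count(); // the first action actually materializes the cache
            System.out.println(out.getNumPartitions()); // 8
        }
    }
}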

Example 69 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.

From class CompressionSPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // get input rdd handle
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    // execute compression
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new CompressionFunction());
    // set outputs
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(input1.getName(), output.getName());
}
Also used: CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock), MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)
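
The body of CompressionFunction is not shown on this page. A plausible sketch, assuming incubator-systemml's CompressedMatrixBlock(MatrixBlock) constructor and its compress() method (CompressedMatrixBlock extends MatrixBlock, so the compressed result can flow through the same JavaPairRDD); since compression is purely block-local and never touches the MatrixIndexes keys, mapValues is the natural operator and preserves any existing partitioner:

import org.apache.spark.api.java.function.Function;
import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;

public static class CompressionFunction implements Function<MatrixBlock, MatrixBlock> {
    private static final long serialVersionUID = 1L;

    @Override
    public MatrixBlock call(MatrixBlock block) throws Exception {
        // block-local compression into column groups; the block is returned
        // as a (compressed) MatrixBlock so downstream operators are unaffected
        CompressedMatrixBlock cmb = new CompressedMatrixBlock(block);
        cmb.compress();
        return cmb;
    }
}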

Example 70 with SparkExecutionContext

Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.

From class ConvolutionSPInstruction, method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    if (instOpcode.equalsIgnoreCase("conv2d") || instOpcode.equalsIgnoreCase("conv2d_bias_add") || instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
        String rddVar = input1.getName();
        int numRowsPerBlock = 1;
        JavaPairRDD<MatrixIndexes, MatrixBlock> inputRDD = reblockAsRectangularMatrices(sec, rddVar, numRowsPerBlock);
        MatrixCharacteristics mcRdd = sec.getMatrixCharacteristics(rddVar);
        // ------------------------------------
        // TODO: Handle large filters > 2G
        Broadcast<MatrixBlock> filterBroadcast = null;
        Broadcast<MatrixBlock> biasBroadcast = null;
        if (instOpcode.equalsIgnoreCase("conv2d")) {
            filterBroadcast = getBroadcast(sec, _in2.getName());
        } else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
            filterBroadcast = getBroadcast(sec, _in3.getName());
            biasBroadcast = getBroadcast(sec, _in2.getName());
        }
        // ------------------------------------
        int pad_h = getScalarInput(ec, _padding, 0);
        int pad_w = getScalarInput(ec, _padding, 1);
        int stride_h = getScalarInput(ec, _stride, 0);
        int stride_w = getScalarInput(ec, _stride, 1);
        // int N = getScalarInput(ec, _input_shape, 0);
        int C = getScalarInput(ec, _input_shape, 1);
        int H = getScalarInput(ec, _input_shape, 2);
        int W = getScalarInput(ec, _input_shape, 3);
        int K = getScalarInput(ec, _filter_shape, 0);
        int R = getScalarInput(ec, _filter_shape, 2);
        int S = getScalarInput(ec, _filter_shape, 3);
        int P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
        int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
        ConvolutionParameters params = new ConvolutionParameters(numRowsPerBlock, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, 1);
        boolean enableNativeBLAS = NativeHelper.isNativeLibraryLoaded();
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows(), enableNativeBLAS), true);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddVar);
        // TODO: Handle nnz
        long nnz = -1;
        long numCols = ((long) K) * ((long) P) * ((long) Q);
        if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
            numCols = ((long) C) * ((long) P) * ((long) Q);
        }
        if (numCols > Integer.MAX_VALUE) {
            throw new DMLRuntimeException("The current operator doesnot support large outputs.");
        }
        sec.setMetaData(output.getName(), new MetaDataFormat(new MatrixCharacteristics(mcRdd.getRows(), numCols, numRowsPerBlock, (int) numCols, nnz), OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo));
    } else {
        throw new DMLRuntimeException("Not implemented: " + instOpcode);
    }
}
Also used: MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat), MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), ConvolutionParameters (org.apache.sysml.runtime.matrix.data.ConvolutionParameters), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)
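
P and Q, obtained via ConvolutionUtils.getP and getQ, follow the standard convolution output-size formula, and numCols reflects conv2d's output layout of one flattened K x P x Q (or C x P x Q for pooling) tensor per input row. An illustrative helper for the arithmetic only (the real utility methods also validate the parameters):

// standard output-size arithmetic for convolution and pooling:
//   out = (in + 2*pad - filter) / stride + 1
static long convOutputDim(long in, long filter, long stride, long pad) {
    return (in + 2 * pad - filter) / stride + 1;
}
// e.g., H=28, R=5, stride_h=1, pad_h=2 -> P = (28 + 4 - 5)/1 + 1 = 28 ("same" padding)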

Aggregations

SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 112
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 92
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 92
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 71
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 39
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 22
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 14
DoubleObject (org.apache.sysml.runtime.instructions.cp.DoubleObject): 12
ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject): 9
PartitionedBroadcast (org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast): 8
FilterNonEmptyBlocksFunction (org.apache.sysml.runtime.instructions.spark.functions.FilterNonEmptyBlocksFunction): 7
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 7
ArrayList (java.util.ArrayList): 6
CPOperand (org.apache.sysml.runtime.instructions.cp.CPOperand): 6
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 6
AggregateDropCorrectionFunction (org.apache.sysml.runtime.instructions.spark.functions.AggregateDropCorrectionFunction): 6
AggregateOperator (org.apache.sysml.runtime.matrix.operators.AggregateOperator): 6
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 5
FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject): 5
ValueType (org.apache.sysml.parser.Expression.ValueType): 4