Example 36 with SparkExecutionContext

use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.

the class Tsmm2SPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    // execute tsmm2 instruction
    // step 1: first pass of X, filter-collect-broadcast excess blocks
    JavaPairRDD<MatrixIndexes, MatrixBlock> tmp1 = in.filter(new IsBlockInRange(_type.isLeft() ? 1 : mc.getRowsPerBlock() + 1, mc.getRows(), _type.isLeft() ? mc.getColsPerBlock() + 1 : 1, mc.getCols(), mc)).mapToPair(new ShiftTSMMIndexesFunction(_type));
    PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(tmp1, (int) (_type.isLeft() ? mc.getRows() : mc.getRows() - mc.getRowsPerBlock()), (int) (_type.isLeft() ? mc.getCols() - mc.getColsPerBlock() : mc.getCols()), mc.getRowsPerBlock(), mc.getColsPerBlock(), -1L);
    Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
    // step 2: second pass of X, compute tsmm/mapmm and aggregate result blocks
    int outputDim = (int) (_type.isLeft() ? mc.getCols() : mc.getRows());
    if (OptimizerUtils.estimateSize(outputDim, outputDim) <= 32 * 1024 * 1024) {
        // default: <=32MB
        // output large blocks and reduceAll to avoid skew on combineByKey
        JavaRDD<MatrixBlock> tmp2 = in.map(new RDDTSMM2ExtFunction(bpmb, _type, outputDim, (int) mc.getRowsPerBlock()));
        MatrixBlock out = RDDAggregateUtils.sumStable(tmp2);
        // put output block into symbol table (no lineage because single block)
        // this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), out, getExtendedOpcode());
    } else {
        // output individual output blocks and aggregate by key (no action)
        JavaPairRDD<MatrixIndexes, MatrixBlock> tmp2 = in.flatMapToPair(new RDDTSMM2Function(bpmb, _type));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.sumByKeyStable(tmp2, false);
        // put output RDD handle into symbol table
        sec.getMatrixCharacteristics(output.getName()).set(outputDim, outputDim, mc.getRowsPerBlock(), mc.getColsPerBlock());
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) IsBlockInRange(org.apache.sysml.runtime.instructions.spark.functions.IsBlockInRange) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)
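
The broadcast-based tsmm above relies on the fact that t(X) %*% X decomposes additively over row-blocks of X. As a minimal, self-contained sketch of that core idea (not SystemML's implementation; the class name and the double[][] blocks are illustrative stand-ins for MatrixBlock):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TsmmSketch {
    // local t(B) %*% B for one n x m row-block, yielding an m x m partial result
    static double[][] tsmm(double[][] b) {
        int n = b.length, m = b[0].length;
        double[][] c = new double[m][m];
        for (int k = 0; k < n; k++)
            for (int i = 0; i < m; i++)
                for (int j = 0; j < m; j++)
                    c[i][j] += b[k][i] * b[k][j];
        return c;
    }

    // element-wise sum of two partial results (associative and commutative)
    static double[][] add(double[][] a, double[][] b) {
        double[][] c = new double[a.length][a[0].length];
        for (int i = 0; i < a.length; i++)
            for (int j = 0; j < a[i].length; j++)
                c[i][j] = a[i][j] + b[i][j];
        return c;
    }

    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("tsmm-sketch").setMaster("local[*]"))) {
            // two row-blocks of a 4 x 2 matrix X
            JavaRDD<double[][]> blocks = sc.parallelize(Arrays.asList(
                new double[][] { { 1, 2 }, { 3, 4 } },
                new double[][] { { 5, 6 }, { 7, 8 } }));
            // per-block t(Xi) %*% Xi, then sum the 2 x 2 partial results
            double[][] xtx = blocks.map(TsmmSketch::tsmm).reduce(TsmmSketch::add);
            // prints [[84.0, 100.0], [100.0, 120.0]]
            System.out.println(Arrays.deepToString(xtx));
        }
    }
}

This mirrors the small-output branch of the instruction, where sumStable aggregates the per-block partial products into a single result block on the driver.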

Example 37 with SparkExecutionContext

use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.

the class UnaryMatrixSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    // execute unary builtin operation
    UnaryOperator uop = (UnaryOperator) _optr;
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new RDDMatrixBuiltinUnaryOp(uop));
    // set output RDD
    updateUnaryOutputMatrixCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) UnaryOperator(org.apache.sysml.runtime.matrix.operators.UnaryOperator)
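
Since mapValues never touches the keys, Spark preserves the existing partitioner, which is why element-wise unary operations on block-partitioned matrices avoid any shuffle. A minimal sketch of the same pattern, with Integer keys and double[] values as hypothetical stand-ins for MatrixIndexes and MatrixBlock:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class UnaryOpSketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("unary-sketch").setMaster("local[*]"))) {
            // (blockIndex, blockValues) pairs standing in for (MatrixIndexes, MatrixBlock)
            JavaPairRDD<Integer, double[]> in = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>(1, new double[] { 1, 4 }),
                new Tuple2<>(2, new double[] { 9, 16 })));
            // apply the unary builtin (here: sqrt) value-wise; keys stay intact
            JavaPairRDD<Integer, double[]> out = in.mapValues(v ->
                Arrays.stream(v).map(Math::sqrt).toArray());
            out.collect().forEach(t ->
                System.out.println(t._1() + " -> " + Arrays.toString(t._2())));
        }
    }
}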

Example 38 with SparkExecutionContext

use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.

the class CentralMomentSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // parse 'order' input argument
    CPOperand scalarInput = (input3 == null ? input2 : input3);
    ScalarObject order = ec.getScalarInput(scalarInput.getName(), scalarInput.getValueType(), scalarInput.isLiteral());
    CMOperator cop = ((CMOperator) _optr);
    if (cop.getAggOpType() == AggregateOperationTypes.INVALID) {
        cop.setCMAggOp((int) order.getLongValue());
    }
    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    // process central moment instruction
    CM_COV_Object cmobj = null;
    if (input3 == null) {
        // w/o weights
        cmobj = in1.values().map(new RDDCMFunction(cop)).fold(new CM_COV_Object(), new RDDCMReduceFunction(cop));
    } else {
        // with weights
        JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
        cmobj = in1.join(in2).values().map(new RDDCMWeightsFunction(cop)).fold(new CM_COV_Object(), new RDDCMReduceFunction(cop));
    }
    // create scalar output (no lineage information required)
    double val = cmobj.getRequiredResult(_optr);
    ec.setScalarOutput(output.getName(), new DoubleObject(val));
}
Also used : CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) CPOperand(org.apache.sysml.runtime.instructions.cp.CPOperand) ScalarObject(org.apache.sysml.runtime.instructions.cp.ScalarObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator)
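
The fold above combines a neutral CM_COV_Object with the per-block statistics and merges partial aggregates across partitions. A minimal sketch of that map-then-fold pattern with a plain (count, sum) accumulator, assuming nothing from SystemML:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FoldSketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("fold-sketch").setMaster("local[*]"))) {
            JavaRDD<Double> vals = sc.parallelize(Arrays.asList(2.0, 4.0, 6.0, 8.0));
            // map each value to a (count, sum) accumulator, then fold with a
            // neutral element; the merge must be associative and commutative
            double[] acc = vals
                .map(v -> new double[] { 1, v })
                .fold(new double[] { 0, 0 },
                      (a, b) -> new double[] { a[0] + b[0], a[1] + b[1] });
            // first moment (mean) = sum / count = 5.0
            System.out.println("mean = " + acc[1] / acc[0]);
        }
    }
}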

Example 39 with SparkExecutionContext

use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.

the class CheckpointSPInstruction method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // early abort if the input variable does not exist or holds a boolean placeholder
    // (this is valid if relevant branches are never entered)
    if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
        // add a dummy entry to the input, which will be immediately overwritten by the null output.
        sec.setVariable(input1.getName(), new BooleanObject(false));
        sec.setVariable(output.getName(), new BooleanObject(false));
        return;
    }
    // Step 1: early abort if the input is already cached in memory
    // -------
    // (for csv input files with unknown dimensions, we might have generated a checkpoint after
    // csvreblock although not necessary because the csvreblock was subject to in-memory reblock)
    CacheableData<?> obj = sec.getCacheableData(input1.getName());
    if (obj.isCached(true)) {
        // available in memory
        sec.setVariable(output.getName(), obj);
        return;
    }
    // get input rdd handle (for matrix or frame)
    JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
    // Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
    // -------
    // Note that persist is a transformation which will be triggered on-demand with the next rdd operations
    // This prevents unnecessary overhead if the dataset is only consumed by cp operations.
    JavaPairRDD<?, ?> out = null;
    if (!in.getStorageLevel().equals(_level)) {
        // (trigger coalesce if intended number of partitions exceeded by 20%
        // and not hash partitioned to avoid losing the existing partitioner)
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
        // checkpoint pre-processing rdd operations
        if (coalesce) {
            // merge partitions without shuffle if too many partitions
            out = in.coalesce(numPartitions);
        } else {
            // apply a narrow shallow copy to allow for short-circuit collects
            if (input1.getDataType() == DataType.MATRIX)
                out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
            else if (input1.getDataType() == DataType.FRAME)
                out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
        }
        // convert mcsr into memory-efficient csr if potentially sparse
        if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
            out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
        }
        // actual checkpoint into given storage level
        out = out.persist(_level);
    // compute the nnz of the checkpointed rdd eagerly if still unknown;
    // otherwise the nnz would never be evaluated due to lazy evaluation in spark
        if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
            mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
        }
    } else {
        // pass-through
        out = in;
    }
    // Step 3: In-place update of input matrix/frame rdd handle and set as output
    // -------
    // We use this in-place approach for two reasons. First, it is correct because our checkpoint
    // injection rewrites guarantee that after checkpoint instructions there are no consumers on the
    // given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
    // filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
    // caching and subsequent collects. Note that in-place update requires us to explicitly handle
    // lineage information in order to prevent cycles on cleanup.
    CacheableData<?> cd = sec.getCacheableData(input1.getName());
    if (out != in) {
        // prevent unnecessary lineage info
        // guaranteed to exist (see above)
        RDDObject inro = cd.getRDDHandle();
        // create new rdd object
        RDDObject outro = new RDDObject(out);
        // mark as checkpointed
        outro.setCheckpointRDD(true);
        // keep lineage to prevent cycles on cleanup
        outro.addLineageChild(inro);
        cd.setRDDHandle(outro);
    }
    sec.setVariable(output.getName(), cd);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Checkpoint(org.apache.sysml.lops.Checkpoint) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) CreateSparseBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) CopyFrameBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockFunction) BooleanObject(org.apache.sysml.runtime.instructions.cp.BooleanObject)
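
The essence of the checkpoint logic is: persist only if the RDD is not already at the target storage level, and coalesce first (without a shuffle) when the input is clearly over-partitioned. A minimal standalone sketch under those assumptions, with a hypothetical preferred partition count in place of SparkUtils.getNumPreferredPartitions:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

public class CheckpointSketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("checkpoint-sketch").setMaster("local[*]"))) {
            StorageLevel level = StorageLevel.MEMORY_AND_DISK();
            JavaRDD<Integer> in = sc.parallelize(Arrays.asList(1, 2, 3, 4), 16);
            int preferred = 4; // hypothetical, e.g. derived from data size
            JavaRDD<Integer> out = in;
            if (!in.getStorageLevel().equals(level)) {
                // merge partitions without a shuffle if over-partitioned by >20%
                if (1.2 * preferred < in.getNumPartitions())
                    out = in.coalesce(preferred);
                // persist is lazy: materialized by the first action on 'out'
                out = out.persist(level);
            }
            System.out.println(out.count() + " items in "
                + out.getNumPartitions() + " partitions");
        }
    }
}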

Example 40 with SparkExecutionContext

use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.

the class CtableSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // get input rdd handle
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = null;
    JavaPairRDD<MatrixIndexes, MatrixBlock> in3 = null;
    double scalar_input2 = -1, scalar_input3 = -1;
    Ctable.OperationTypes ctableOp = Ctable.findCtableOperationByInputDataTypes(input1.getDataType(), input2.getDataType(), input3.getDataType());
    ctableOp = _isExpand ? Ctable.OperationTypes.CTABLE_EXPAND_SCALAR_WEIGHT : ctableOp;
    MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    // First get the block sizes and then set them as -1 to allow for binary cell reblock
    int brlen = mc1.getRowsPerBlock();
    int bclen = mc1.getColsPerBlock();
    JavaPairRDD<MatrixIndexes, ArrayList<MatrixBlock>> inputMBs = null;
    JavaPairRDD<MatrixIndexes, CTableMap> ctables = null;
    JavaPairRDD<MatrixIndexes, Double> bincellsNoFilter = null;
    boolean setLineage2 = false;
    boolean setLineage3 = false;
    switch(ctableOp) {
        case CTABLE_TRANSFORM: // (VECTOR)
            // F=ctable(A,B,W)
            in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
            in3 = sec.getBinaryBlockRDDHandleForVariable(input3.getName());
            setLineage2 = true;
            setLineage3 = true;
            inputMBs = in1.cogroup(in2).cogroup(in3).mapToPair(new MapThreeMBIterableIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        case CTABLE_EXPAND_SCALAR_WEIGHT: // (VECTOR)
            // F = ctable(seq,A) or F = ctable(seq,B,1)
            scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
            if (scalar_input3 == 1) {
                in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
                setLineage2 = true;
                bincellsNoFilter = in2.flatMapToPair(new ExpandScalarCtableOperation(brlen));
                break;
            }
        // fall through to the scalar-weight transform if the weight is not 1
        case CTABLE_TRANSFORM_SCALAR_WEIGHT: // (VECTOR/MATRIX)
            // F = ctable(A,B) or F = ctable(A,B,1)
            in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
            setLineage2 = true;
            scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
            inputMBs = in1.cogroup(in2).mapToPair(new MapTwoMBIterableIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        case CTABLE_TRANSFORM_HISTOGRAM: // (VECTOR)
            // F=ctable(A,1) or F = ctable(A,1,1)
            scalar_input2 = sec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getDoubleValue();
            scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
            inputMBs = in1.mapToPair(new MapMBIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        case CTABLE_TRANSFORM_WEIGHTED_HISTOGRAM: // (VECTOR)
            // F=ctable(A,1,W)
            in3 = sec.getBinaryBlockRDDHandleForVariable(input3.getName());
            setLineage3 = true;
            scalar_input2 = sec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getDoubleValue();
            inputMBs = in1.cogroup(in3).mapToPair(new MapTwoMBIterableIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        default:
            throw new DMLRuntimeException("Encountered an invalid ctable operation (" + ctableOp + ") while executing instruction: " + this.toString());
    }
    // Now perform aggregation on ctables to get binaryCells
    if (bincellsNoFilter == null && ctables != null) {
        bincellsNoFilter = ctables.values().flatMapToPair(new ExtractBinaryCellsFromCTable());
        bincellsNoFilter = RDDAggregateUtils.sumCellsByKeyStable(bincellsNoFilter);
    } else if (!(bincellsNoFilter != null && ctables == null)) {
        throw new DMLRuntimeException("Incorrect ctable operation");
    }
    // handle known/unknown dimensions
    long outputDim1 = (_dim1Literal ? (long) Double.parseDouble(_outDim1) : (sec.getScalarInput(_outDim1, ValueType.DOUBLE, false)).getLongValue());
    long outputDim2 = (_dim2Literal ? (long) Double.parseDouble(_outDim2) : (sec.getScalarInput(_outDim2, ValueType.DOUBLE, false)).getLongValue());
    MatrixCharacteristics mcBinaryCells = null;
    boolean findDimensions = (outputDim1 == -1 && outputDim2 == -1);
    if (!findDimensions) {
        if ((outputDim1 == -1 && outputDim2 != -1) || (outputDim1 != -1 && outputDim2 == -1))
            throw new DMLRuntimeException("Incorrect output dimensions passed to TernarySPInstruction:" + outputDim1 + " " + outputDim2);
        else
            mcBinaryCells = new MatrixCharacteristics(outputDim1, outputDim2, brlen, bclen);
        // filtering according to given dimensions
        bincellsNoFilter = bincellsNoFilter.filter(new FilterCells(mcBinaryCells.getRows(), mcBinaryCells.getCols()));
    }
    // convert double values to matrix cell
    JavaPairRDD<MatrixIndexes, MatrixCell> binaryCells = bincellsNoFilter.mapToPair(new ConvertToBinaryCell());
    // find dimensions if necessary (w/ cache for reblock)
    if (findDimensions) {
        binaryCells = SparkUtils.cacheBinaryCellRDD(binaryCells);
        mcBinaryCells = SparkUtils.computeMatrixCharacteristics(binaryCells);
    }
    // store output rdd handle
    sec.setRDDHandleForVariable(output.getName(), binaryCells);
    mcOut.set(mcBinaryCells);
    // Since we are outputting binary cells, we set block sizes = -1
    mcOut.setRowsPerBlock(-1);
    mcOut.setColsPerBlock(-1);
    sec.addLineageRDD(output.getName(), input1.getName());
    if (setLineage2)
        sec.addLineageRDD(output.getName(), input2.getName());
    if (setLineage3)
        sec.addLineageRDD(output.getName(), input3.getName());
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) ArrayList(java.util.ArrayList) Ctable(org.apache.sysml.lops.Ctable) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CTableMap(org.apache.sysml.runtime.matrix.data.CTableMap)
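
Stripped of block handling and the various weight cases, ctable builds a contingency table: cell F[a,b] accumulates the weight of every aligned input pair (a_i, b_i). A minimal sketch of that core semantics, with reduceByKey standing in for RDDAggregateUtils.sumCellsByKeyStable:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CtableSketch {
    public static void main(String[] args) {
        try (JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("ctable-sketch").setMaster("local[*]"))) {
            // aligned (a_i, b_i) category pairs with unit weights, as if
            // zipped from two input vectors A and B
            JavaPairRDD<Tuple2<Integer, Integer>, Double> cells = sc.parallelizePairs(
                Arrays.asList(
                    new Tuple2<>(new Tuple2<>(1, 2), 1.0),
                    new Tuple2<>(new Tuple2<>(1, 2), 1.0),
                    new Tuple2<>(new Tuple2<>(2, 1), 1.0)));
            // F[a,b] = sum of weights per (a,b) cell
            JavaPairRDD<Tuple2<Integer, Integer>, Double> table =
                cells.reduceByKey(Double::sum);
            // prints F[1,2] = 2.0 and F[2,1] = 1.0 (in some order)
            table.collect().forEach(t -> System.out.println(
                "F[" + t._1()._1() + "," + t._1()._2() + "] = " + t._2()));
        }
    }
}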

Aggregations

SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 112 uses
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 92 uses
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 92 uses
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 71 uses
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 39 uses
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 22 uses
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 14 uses
DoubleObject (org.apache.sysml.runtime.instructions.cp.DoubleObject): 12 uses
ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject): 9 uses
PartitionedBroadcast (org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast): 8 uses
FilterNonEmptyBlocksFunction (org.apache.sysml.runtime.instructions.spark.functions.FilterNonEmptyBlocksFunction): 7 uses
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 7 uses
ArrayList (java.util.ArrayList): 6 uses
CPOperand (org.apache.sysml.runtime.instructions.cp.CPOperand): 6 uses
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 6 uses
AggregateDropCorrectionFunction (org.apache.sysml.runtime.instructions.spark.functions.AggregateDropCorrectionFunction): 6 uses
AggregateOperator (org.apache.sysml.runtime.matrix.operators.AggregateOperator): 6 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 5 uses
FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject): 5 uses
ValueType (org.apache.sysml.parser.Expression.ValueType): 4 uses