use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class Tsmm2SPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// get input
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
// execute tsmm2 instruction
// step 1: first pass of X, filter-collect-broadcast excess blocks
JavaPairRDD<MatrixIndexes, MatrixBlock> tmp1 = in.filter(new IsBlockInRange(_type.isLeft() ? 1 : mc.getRowsPerBlock() + 1, mc.getRows(), _type.isLeft() ? mc.getColsPerBlock() + 1 : 1, mc.getCols(), mc)).mapToPair(new ShiftTSMMIndexesFunction(_type));
PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(tmp1, (int) (_type.isLeft() ? mc.getRows() : mc.getRows() - mc.getRowsPerBlock()), (int) (_type.isLeft() ? mc.getCols() - mc.getColsPerBlock() : mc.getCols()), mc.getRowsPerBlock(), mc.getColsPerBlock(), -1L);
Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
// step 2: second pass of X, compute tsmm/mapmm and aggregate result blocks
int outputDim = (int) (_type.isLeft() ? mc.getCols() : mc.getRows());
if (OptimizerUtils.estimateSize(outputDim, outputDim) <= 32 * 1024 * 1024) { // default: <= 32MB
// output large blocks and reduceAll to avoid skew on combineByKey
JavaRDD<MatrixBlock> tmp2 = in.map(new RDDTSMM2ExtFunction(bpmb, _type, outputDim, (int) mc.getRowsPerBlock()));
MatrixBlock out = RDDAggregateUtils.sumStable(tmp2);
// put output block into symbol table (no lineage because single block)
// this also includes implicit maintenance of matrix characteristics
sec.setMatrixOutput(output.getName(), out, getExtendedOpcode());
} else {
// output individual output blocks and aggregate by key (no action)
JavaPairRDD<MatrixIndexes, MatrixBlock> tmp2 = in.flatMapToPair(new RDDTSMM2Function(bpmb, _type));
JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.sumByKeyStable(tmp2, false);
// put output RDD handle into symbol table
sec.getMatrixCharacteristics(output.getName()).set(outputDim, outputDim, mc.getRowsPerBlock(), mc.getColsPerBlock());
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
}
}
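The branch above picks a single-block output when the estimated dense result fits in 32MB, and a distributed sum-by-key otherwise. Below is a minimal standalone sketch of that dispatch, assuming a plain 8-bytes-per-cell dense estimate as a stand-in for OptimizerUtils.estimateSize (not SystemML code):
// Minimal sketch (assumption: dense double cells, 8 bytes each, standing in
// for OptimizerUtils.estimateSize(rows, cols)): choose between a single-block
// reduce and a distributed aggregate-by-key based on the estimated output size.
public class TsmmOutputDispatchSketch {
    static final long SINGLE_BLOCK_THRESHOLD = 32L * 1024 * 1024; // 32MB, as in the snippet above

    static boolean useSingleBlockOutput(long outputDim) {
        long estBytes = outputDim * outputDim * 8L; // dense size of the outputDim x outputDim result
        return estBytes <= SINGLE_BLOCK_THRESHOLD;
    }

    public static void main(String[] args) {
        System.out.println(useSingleBlockOutput(1000));  // ~8MB   -> true  (reduce to one block)
        System.out.println(useSingleBlockOutput(10000)); // ~800MB -> false (aggregate by key)
    }
}
Reducing small results to a single in-memory block avoids the skew-prone combineByKey, while the by-key aggregation keeps large results distributed, as the comments in the snippet indicate.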
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class UnaryMatrixSPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// get input
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
// execute unary builtin operation
UnaryOperator uop = (UnaryOperator) _optr;
JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new RDDMatrixBuiltinUnaryOp(uop));
// set output RDD
updateUnaryOutputMatrixCharacteristics(sec);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
}
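The element-wise operation is applied with mapValues, so the MatrixIndexes keys and the existing partitioning stay untouched. A simplified, self-contained analogue of that pattern, assuming a local Spark setup and plain double[] values instead of MatrixBlock:
// Simplified analogue (assumed setup, not SystemML classes): an element-wise
// unary operation applied via mapValues, which preserves keys and partitioning.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;

public class UnaryMapValuesSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("UnaryMapValuesSketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaPairRDD<Integer, double[]> in = jsc.parallelizePairs(Arrays.asList(
                new Tuple2<>(1, new double[]{1, 4, 9}),
                new Tuple2<>(2, new double[]{16, 25, 36})));
            // apply the unary op per block; keys (block indexes) are untouched
            JavaPairRDD<Integer, double[]> out = in.mapValues(
                v -> Arrays.stream(v).map(Math::sqrt).toArray());
            out.collect().forEach(t -> System.out.println(t._1() + " -> " + Arrays.toString(t._2())));
        }
    }
}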
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class CentralMomentSPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// parse 'order' input argument
CPOperand scalarInput = (input3 == null ? input2 : input3);
ScalarObject order = ec.getScalarInput(scalarInput.getName(), scalarInput.getValueType(), scalarInput.isLiteral());
CMOperator cop = ((CMOperator) _optr);
if (cop.getAggOpType() == AggregateOperationTypes.INVALID) {
cop.setCMAggOp((int) order.getLongValue());
}
// get input
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
// process central moment instruction
CM_COV_Object cmobj = null;
if (input3 == null) { // w/o weights
cmobj = in1.values().map(new RDDCMFunction(cop)).fold(new CM_COV_Object(), new RDDCMReduceFunction(cop));
} else { // with weights
JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
cmobj = in1.join(in2).values().map(new RDDCMWeightsFunction(cop)).fold(new CM_COV_Object(), new RDDCMReduceFunction(cop));
}
// create scalar output (no lineage information required)
double val = cmobj.getRequiredResult(_optr);
ec.setScalarOutput(output.getName(), new DoubleObject(val));
}
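Each block is mapped to a partial aggregate and the partials are combined with fold, which requires an associative combine function and a neutral zero value. A standalone sketch of the same map-then-fold pattern, assuming a simple (count, sum) accumulator in place of CM_COV_Object:
// Standalone sketch (assumed setup): the map-then-fold aggregation pattern used
// above, with a plain (count, sum) accumulator instead of CM_COV_Object.
// The combine function is associative and the zero value is neutral, as fold requires.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable;
import java.util.Arrays;

public class MapFoldSketch {
    static class Agg implements Serializable {
        long count; double sum;
        Agg add(Agg o) { Agg r = new Agg(); r.count = count + o.count; r.sum = sum + o.sum; return r; }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("MapFoldSketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<Double> values = jsc.parallelize(Arrays.asList(1.0, 2.0, 3.0, 4.0));
            // map each value to a partial aggregate, then fold all partials into one object
            Agg total = values.map(v -> { Agg a = new Agg(); a.count = 1; a.sum = v; return a; })
                              .fold(new Agg(), Agg::add);
            System.out.println("mean = " + total.sum / total.count);
        }
    }
}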
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class CheckpointSPInstruction method processInstruction.
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// Step 1: early abort on non-existing inputs (null or boolean placeholder); this is valid if relevant branches are never entered
if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
// add a dummy entry to the input, which will be immediately overwritten by the null output.
sec.setVariable(input1.getName(), new BooleanObject(false));
sec.setVariable(output.getName(), new BooleanObject(false));
return;
}
// -------
// (for csv input files with unknown dimensions, we might have generated a checkpoint after
// csvreblock although not necessary because the csvreblock was subject to in-memory reblock)
CacheableData<?> obj = sec.getCacheableData(input1.getName());
if (obj.isCached(true)) { // available in memory
sec.setVariable(output.getName(), obj);
return;
}
// get input rdd handle (for matrix or frame)
JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
// Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
// -------
// Note that persist is a transformation which will be triggered on demand by the next rdd operation.
// This prevents unnecessary overhead if the dataset is only consumed by cp operations.
JavaPairRDD<?, ?> out = null;
if (!in.getStorageLevel().equals(_level)) {
// (trigger coalesce if the intended number of partitions is exceeded by more than 20%
// and not hash partitioned to avoid losing the existing partitioner)
int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
// checkpoint pre-processing rdd operations
if (coalesce) {
// merge partitions without shuffle if too many partitions
out = in.coalesce(numPartitions);
} else {
// apply a narrow shallow copy to allow for short-circuit collects
if (input1.getDataType() == DataType.MATRIX)
out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
else if (input1.getDataType() == DataType.FRAME)
out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
}
// convert mcsr into memory-efficient csr if potentially sparse
if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
}
// actual checkpoint into given storage level
out = out.persist(_level);
// explicitly compute the nnz over the checkpointed rdd if still unknown; otherwise the nnz would never be evaluated due to lazy evaluation in spark
if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
}
} else {
// pass-through
out = in;
}
// Step 3: In-place update of input matrix/frame rdd handle and set as output
// -------
// We use this in-place approach for two reasons. First, it is correct because our checkpoint
// injection rewrites guarantee that after checkpoint instructions there are no consumers on the
// given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
// filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
// caching and subsequent collects. Note that in-place update requires us to explicitly handle
// lineage information in order to prevent cycles on cleanup.
CacheableData<?> cd = sec.getCacheableData(input1.getName());
if (out != in) { // prevent unnecessary lineage info
RDDObject inro = cd.getRDDHandle(); // guaranteed to exist (see above)
RDDObject outro = new RDDObject(out); // create new rdd object
outro.setCheckpointRDD(true); // mark as checkpointed
outro.addLineageChild(inro); // keep lineage to prevent cycles on cleanup
cd.setRDDHandle(outro);
}
sec.setVariable(output.getName(), cd);
}
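The checkpoint logic boils down to: skip if the RDD is already persisted at the target level, coalesce if it has noticeably more partitions than preferred, then persist lazily. A condensed sketch of that control flow, where the targetPartitions argument is a hypothetical stand-in for SparkUtils.getNumPreferredPartitions and the CSR conversion is omitted:
// Condensed sketch (assumed setup, targetPartitions is hypothetical): the
// checkpoint control flow above — skip if already persisted at the requested
// level, coalesce if over-partitioned by more than 20%, then persist lazily.
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.storage.StorageLevel;

public class CheckpointSketch {
    static <K, V> JavaPairRDD<K, V> checkpoint(JavaPairRDD<K, V> in, StorageLevel level, int targetPartitions) {
        if (in.getStorageLevel().equals(level))
            return in; // pass-through: already persisted at the requested storage level
        JavaPairRDD<K, V> out = in;
        if (1.2 * targetPartitions < in.getNumPartitions())
            out = out.coalesce(targetPartitions); // merge partitions without a shuffle
        return out.persist(level); // lazy: materialized by the next action on the returned rdd
    }
}
A call such as checkpoint(rdd, StorageLevel.MEMORY_AND_DISK(), 200) would mirror the instruction's behavior; as noted above, persist only takes effect with the next action on the result.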
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class CtableSPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// get input rdd handle
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = null;
JavaPairRDD<MatrixIndexes, MatrixBlock> in3 = null;
double scalar_input2 = -1, scalar_input3 = -1;
Ctable.OperationTypes ctableOp = Ctable.findCtableOperationByInputDataTypes(input1.getDataType(), input2.getDataType(), input3.getDataType());
ctableOp = _isExpand ? Ctable.OperationTypes.CTABLE_EXPAND_SCALAR_WEIGHT : ctableOp;
MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
// First get the block sizes and then set them as -1 to allow for binary cell reblock
int brlen = mc1.getRowsPerBlock();
int bclen = mc1.getColsPerBlock();
JavaPairRDD<MatrixIndexes, ArrayList<MatrixBlock>> inputMBs = null;
JavaPairRDD<MatrixIndexes, CTableMap> ctables = null;
JavaPairRDD<MatrixIndexes, Double> bincellsNoFilter = null;
boolean setLineage2 = false;
boolean setLineage3 = false;
switch(ctableOp) {
case CTABLE_TRANSFORM: // (VECTOR)
// F=ctable(A,B,W)
in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
in3 = sec.getBinaryBlockRDDHandleForVariable(input3.getName());
setLineage2 = true;
setLineage3 = true;
inputMBs = in1.cogroup(in2).cogroup(in3).mapToPair(new MapThreeMBIterableIntoAL());
ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
break;
case CTABLE_EXPAND_SCALAR_WEIGHT: // (VECTOR)
// F = ctable(seq,A) or F = ctable(seq,B,1)
scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
if (scalar_input3 == 1) {
in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
setLineage2 = true;
bincellsNoFilter = in2.flatMapToPair(new ExpandScalarCtableOperation(brlen));
break;
}
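// falls through to CTABLE_TRANSFORM_SCALAR_WEIGHT when the scalar weight is not 1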
case CTABLE_TRANSFORM_SCALAR_WEIGHT: // (VECTOR/MATRIX)
// F = ctable(A,B) or F = ctable(A,B,1)
in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
setLineage2 = true;
scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
inputMBs = in1.cogroup(in2).mapToPair(new MapTwoMBIterableIntoAL());
ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
break;
case CTABLE_TRANSFORM_HISTOGRAM: // (VECTOR)
// F=ctable(A,1) or F = ctable(A,1,1)
scalar_input2 = sec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getDoubleValue();
scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
inputMBs = in1.mapToPair(new MapMBIntoAL());
ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
break;
case CTABLE_TRANSFORM_WEIGHTED_HISTOGRAM: // (VECTOR)
// F=ctable(A,1,W)
in3 = sec.getBinaryBlockRDDHandleForVariable(input3.getName());
setLineage3 = true;
scalar_input2 = sec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getDoubleValue();
inputMBs = in1.cogroup(in3).mapToPair(new MapTwoMBIterableIntoAL());
ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
break;
default:
throw new DMLRuntimeException("Encountered an invalid ctable operation (" + ctableOp + ") while executing instruction: " + this.toString());
}
// Now perform aggregation on ctables to get binaryCells
if (bincellsNoFilter == null && ctables != null) {
bincellsNoFilter = ctables.values().flatMapToPair(new ExtractBinaryCellsFromCTable());
bincellsNoFilter = RDDAggregateUtils.sumCellsByKeyStable(bincellsNoFilter);
} else if (!(bincellsNoFilter != null && ctables == null)) {
throw new DMLRuntimeException("Incorrect ctable operation");
}
// handle known/unknown dimensions
long outputDim1 = (_dim1Literal ? (long) Double.parseDouble(_outDim1) : (sec.getScalarInput(_outDim1, ValueType.DOUBLE, false)).getLongValue());
long outputDim2 = (_dim2Literal ? (long) Double.parseDouble(_outDim2) : (sec.getScalarInput(_outDim2, ValueType.DOUBLE, false)).getLongValue());
MatrixCharacteristics mcBinaryCells = null;
boolean findDimensions = (outputDim1 == -1 && outputDim2 == -1);
if (!findDimensions) {
if ((outputDim1 == -1 && outputDim2 != -1) || (outputDim1 != -1 && outputDim2 == -1))
throw new DMLRuntimeException("Incorrect output dimensions passed to TernarySPInstruction:" + outputDim1 + " " + outputDim2);
else
mcBinaryCells = new MatrixCharacteristics(outputDim1, outputDim2, brlen, bclen);
// filtering according to given dimensions
bincellsNoFilter = bincellsNoFilter.filter(new FilterCells(mcBinaryCells.getRows(), mcBinaryCells.getCols()));
}
// convert double values to matrix cell
JavaPairRDD<MatrixIndexes, MatrixCell> binaryCells = bincellsNoFilter.mapToPair(new ConvertToBinaryCell());
// find dimensions if necessary (w/ cache for reblock)
if (findDimensions) {
binaryCells = SparkUtils.cacheBinaryCellRDD(binaryCells);
mcBinaryCells = SparkUtils.computeMatrixCharacteristics(binaryCells);
}
// store output rdd handle
sec.setRDDHandleForVariable(output.getName(), binaryCells);
mcOut.set(mcBinaryCells);
// Since we are outputting binary cells, we set block sizes = -1
mcOut.setRowsPerBlock(-1);
mcOut.setColsPerBlock(-1);
sec.addLineageRDD(output.getName(), input1.getName());
if (setLineage2)
sec.addLineageRDD(output.getName(), input2.getName());
if (setLineage3)
sec.addLineageRDD(output.getName(), input3.getName());
}
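At its core, the ctable computation maps each pair of input values to an output cell key and sums the weights of duplicate cells, which is what sumCellsByKeyStable does on the (MatrixIndexes, Double) pairs above. A self-contained analogue of that cell-summing step, assuming a local Spark setup and not using SystemML classes:
// Standalone analogue (assumed setup, not SystemML classes): the core ctable
// pattern — map each observation (A[i], B[i]) to a cell key with weight 1,
// then sum weights per cell, mirroring the sum-cells-by-key step above.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;

public class CtableSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CtableSketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // pairs of category indexes (A[i], B[i]), 1-based as in ctable
            List<Tuple2<Long, Long>> obs = Arrays.asList(
                new Tuple2<>(1L, 2L), new Tuple2<>(1L, 2L), new Tuple2<>(3L, 1L));
            JavaPairRDD<Tuple2<Long, Long>, Double> cells = jsc.parallelize(obs)
                .mapToPair(t -> new Tuple2<>(t, 1.0)); // weight 1 per observation
            JavaPairRDD<Tuple2<Long, Long>, Double> ctable = cells.reduceByKey(Double::sum);
            ctable.collect().forEach(c ->
                System.out.println("F[" + c._1()._1() + "," + c._1()._2() + "] = " + c._2()));
        }
    }
}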