use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class MultiReturnParameterizedBuiltinSPInstruction method processInstruction.
/**
 * Runs the Spark transform-encode instruction in two steps: (1) build the transform
 * meta data from the input frame and consolidate it into a meta frame, and (2) apply
 * an encoder created from that meta frame to the input, producing the matrix output.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 * @throws RuntimeException wrapping any {@code IOException} raised inside the method
 */
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
	final SparkExecutionContext sec = (SparkExecutionContext) ec;
	try {
		// input frame, meta output frame, and the binary-block RDD of the input
		final FrameObject inputFrame = sec.getFrameObject(input1.getName());
		final FrameObject metaFrame = sec.getFrameObject(_outputs.get(1).getName());
		final JavaPairRDD<Long, FrameBlock> frameRdd = (JavaPairRDD<Long, FrameBlock>)
			sec.getRDDHandleForFrameObject(inputFrame, InputInfo.BinaryBlockInputInfo);
		final String spec = ec
			.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral())
			.getStringValue();
		final MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
		final MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());

		// column names are only fetched (from the block with key 1) for non-ID specs
		String[] colnames = null;
		if (!TfMetaUtils.isIDSpec(spec)) {
			colnames = frameRdd.lookup(1L).get(0).getColumnNames();
		}

		// ---- step 1: build transform meta data ----
		final Encoder buildEncoder = EncoderFactory.createEncoder(
			spec, colnames, inputFrame.getSchema(), (int) inputFrame.getNumColumns(), null);
		final MaxLongAccumulator maxAcc = registerMaxLongAccumulator(sec.getSparkContext());
		JavaRDD<String> metaLines = frameRdd
			.mapPartitionsToPair(new TransformEncodeBuildFunction(buildEncoder))
			.distinct()
			.groupByKey()
			.flatMap(new TransformEncodeGroupFunction(maxAcc));
		if (containsMVImputeEncoder(buildEncoder)) {
			// additional meta data lines produced for the missing-value imputation encoder
			final EncoderMVImpute mvImpute = getMVImputeEncoder(buildEncoder);
			final JavaRDD<String> mvLines = frameRdd
				.mapPartitionsToPair(new TransformEncodeBuild2Function(mvImpute))
				.groupByKey()
				.flatMap(new TransformEncodeGroup2Function(mvImpute));
			metaLines = metaLines.union(mvLines);
		}
		// writing the file triggers evaluation of the meta data RDD
		metaLines.saveAsTextFile(metaFrame.getFileName());

		// read the written text-cell file back as one consolidated meta frame;
		// the accumulator value is passed as the row count
		final FrameReader metaReader = FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo);
		final FrameBlock meta = metaReader.readFrameFromHDFS(
			metaFrame.getFileName(), maxAcc.value(), inputFrame.getNumColumns());
		// recompute num distinct items per column
		meta.recomputeColumnCardinality();
		if (colnames != null) {
			meta.setColumnNames(colnames);
		} else {
			meta.setColumnNames(meta.getColumnNames());
		}

		// ---- step 2: transform apply ----
		// an offset map is only computed when the spec contains an omit specification
		TfOffsetMap offsets = null;
		if (TfMetaUtils.containsOmitSpec(spec, colnames)) {
			offsets = new TfOffsetMap(SparkUtils.toIndexedLong(
				frameRdd.mapToPair(new RDDTransformApplyOffsetFunction(spec, colnames)).collect()));
		}

		// encoder built from the consolidated meta frame; output rows are reduced by
		// the number of rows the offset map reports as removed
		final Encoder applyEncoder = EncoderFactory.createEncoder(
			spec, colnames, inputFrame.getSchema(), (int) inputFrame.getNumColumns(), meta);
		final long removedRows = (offsets != null) ? offsets.getNumRmRows() : 0;
		mcOut.setDimension(mcIn.getRows() - removedRows, applyEncoder.getNumCols());

		// broadcast encoder (and offset map, if any) for use inside the apply function
		final Broadcast<Encoder> encoderBc = sec.getSparkContext().broadcast(applyEncoder);
		Broadcast<TfOffsetMap> offsetsBc = null;
		if (offsets != null) {
			offsetsBc = sec.getSparkContext().broadcast(offsets);
		}

		// execute transform apply and convert the encoded frame blocks to matrix blocks
		final JavaPairRDD<Long, FrameBlock> encoded =
			frameRdd.mapToPair(new RDDTransformApplyFunction(encoderBc, offsetsBc));
		final JavaPairRDD<MatrixIndexes, MatrixBlock> result =
			FrameRDDConverterUtils.binaryBlockToMatrixBlock(encoded, mcOut, mcOut);

		// bind outputs: matrix RDD (with lineage to the input) and the meta frame
		sec.setRDDHandleForVariable(_outputs.get(0).getName(), result);
		sec.addLineageRDD(_outputs.get(0).getName(), input1.getName());
		sec.setFrameOutput(_outputs.get(1).getName(), meta);
	} catch (IOException ioe) {
		throw new RuntimeException(ioe);
	}
}
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class PMapmmSPInstruction method processInstruction.
/**
 * Executes the PMapMM Spark instruction. The first input is processed in chunks of
 * {@code NUM_ROWBLOCKS} row blocks: each chunk is filtered out of the input RDD,
 * rebased, collected into a partitioned block and broadcast; the persisted second
 * input is then flat-mapped against that broadcast and summed by key. The per-chunk
 * results are unioned, persisted, and bound to the output variable.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 * @throws IllegalStateException if the first input reports no rows, so that no
 *         chunk (and hence no output RDD) could be produced
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	// get inputs
	JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
	MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
	// This avoids errors such as java.lang.UnsupportedOperationException: Cannot change
	// storage level of an RDD after it was already assigned a level.
	// Ideally, we should ensure that we do not redundantly call persist on the same RDD.
	StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();
	// cache right hand side because accessed many times
	in2 = in2.repartition(sec.getSparkContext().defaultParallelism()).persist(pmapmmStorageLevel);

	// Rows covered per iteration. Computed and iterated in long arithmetic: the row
	// count is a long, and an int counter / int step would silently wrap around for
	// inputs with more than Integer.MAX_VALUE rows.
	final long rowsPerChunk = (long) NUM_ROWBLOCKS * mc1.getRowsPerBlock();
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
	for (long i = 0; i < mc1.getRows(); i += rowsPerChunk) {
		// Block offset of this chunk; narrowed to int because that is the argument type
		// the original call sites passed to the helper constructors.
		// NOTE(review): assumes the number of row blocks fits into an int - confirm.
		int blockOffset = (int) (i / mc1.getRowsPerBlock());
		// create broadcast for rdd partition (rows i+1 .. i+rowsPerChunk, all columns)
		JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = in1
			.filter(new IsBlockInRange(i + 1, i + rowsPerChunk, 1, mc1.getCols(), mc1))
			.mapToPair(new PMapMMRebaseBlocksFunction(blockOffset));
		// the last chunk may hold fewer rows than a full chunk
		int rlen = (int) Math.min(mc1.getRows() - i, rowsPerChunk);
		PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(
			rdd, rlen, (int) mc1.getCols(), mc1.getRowsPerBlock(), mc1.getColsPerBlock(), -1L);
		Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
		// matrix multiplication
		JavaPairRDD<MatrixIndexes, MatrixBlock> rdd2 = in2.flatMapToPair(new PMapMMFunction(bpmb, blockOffset));
		rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
		// materialize the chunk result before the broadcast it depends on is released
		rdd2.persist(pmapmmStorageLevel).count();
		bpmb.unpersist(false);
		if (out == null) {
			out = rdd2;
		} else {
			out = out.union(rdd2);
		}
	}
	// fail with a clear message instead of a NullPointerException on out.persist below
	if (out == null) {
		throw new IllegalStateException("PMapMM produced no output: input '" + input1.getName()
			+ "' reports " + mc1.getRows() + " rows.");
	}
	// cache final result
	out = out.persist(pmapmmStorageLevel);
	out.count();
	// put output RDD handle into symbol table
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineageRDD(output.getName(), input2.getName());
	// update output statistics if not inferred
	updateBinaryMMOutputMatrixCharacteristics(sec, true);
}
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class QuantileSortSPInstruction method processInstruction.
/**
 * Executes the quantile-sort Spark instruction: sorts the first input by value via
 * {@code RDDSortUtils.sortByVal}, passing the second input along as weights when it
 * is present, then binds the sorted RDD to the output variable and sets the output
 * characteristics (1 column without weights, 2 columns with weights).
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	final SparkExecutionContext sec = (SparkExecutionContext) ec;
	// a second input operand means the sort is weighted
	final boolean hasWeights = (input2 != null);

	// fetch input rdds (weights only if given) and the input characteristics
	final JavaPairRDD<MatrixIndexes, MatrixBlock> values =
		sec.getBinaryBlockRDDHandleForVariable(input1.getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock> weights = null;
	if (hasWeights) {
		weights = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
	}
	final MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());

	// sort with or without weights; the output column count follows from the variant
	final JavaPairRDD<MatrixIndexes, MatrixBlock> sorted;
	final long numOutCols;
	if (hasWeights) {
		sorted = RDDSortUtils.sortByVal(values, weights, mcIn.getRows(), mcIn.getRowsPerBlock());
		numOutCols = 2;
	} else {
		sorted = RDDSortUtils.sortByVal(values, mcIn.getRows(), mcIn.getRowsPerBlock());
		numOutCols = 1;
	}

	// register output RDD and its lineage to every RDD input
	sec.setRDDHandleForVariable(output.getName(), sorted);
	sec.addLineageRDD(output.getName(), input1.getName());
	if (hasWeights) {
		sec.addLineageRDD(output.getName(), input2.getName());
	}

	// output keeps the input's row count and block sizes
	final MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
	mcOut.set(mcIn.getRows(), numOutCols, mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
}
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class QuaternarySPInstruction method processInstruction.
/**
 * Executes a quaternary Spark instruction. Depending on the opcode the operation
 * runs map-side (first input as RDD, second and third input as broadcasts) or
 * reduce-side (second/third/fourth input joined as RDDs unless cached as broadcasts).
 * If {@code qop.wtype1} or {@code qop.wtype4} is set, the result is fully aggregated
 * and stored as a scalar; otherwise the result RDD is bound to the output variable
 * together with lineage to all RDD and broadcast inputs.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;
	QuaternaryOperator qop = (QuaternaryOperator) _optr;
	// tracking of rdds and broadcasts (for lineage maintenance)
	ArrayList<String> rddVars = new ArrayList<>();
	ArrayList<String> bcVars = new ArrayList<>();
	JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
	// The first input is consumed as an RDD on both the map-side and the reduce-side
	// path, so it is registered for lineage once here. Previously only the map-side
	// path registered it, leaving reduce-side outputs without lineage to input1.
	rddVars.add(input1.getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
	MatrixCharacteristics inMc = sec.getMatrixCharacteristics(input1.getName());
	long rlen = inMc.getRows();
	long clen = inMc.getCols();
	int brlen = inMc.getRowsPerBlock();
	int bclen = inMc.getColsPerBlock();
	// scalar-producing operations (map/redwsloss, map/redwcemm)
	boolean scalarOutput = (qop.wtype1 != null || qop.wtype4 != null);
	// dropping empty blocks is safe because these ops produce a scalar
	if (scalarOutput) {
		in = in.filter(new FilterNonEmptyBlocksFunction());
	}
	// map-side only operation (one rdd input, two broadcasts)
	if (WeightedSquaredLoss.OPCODE.equalsIgnoreCase(getOpcode())
		|| WeightedSigmoid.OPCODE.equalsIgnoreCase(getOpcode())
		|| WeightedDivMM.OPCODE.equalsIgnoreCase(getOpcode())
		|| WeightedCrossEntropy.OPCODE.equalsIgnoreCase(getOpcode())
		|| WeightedUnaryMM.OPCODE.equalsIgnoreCase(getOpcode())) {
		PartitionedBroadcast<MatrixBlock> bc1 = sec.getBroadcastForVariable(input2.getName());
		PartitionedBroadcast<MatrixBlock> bc2 = sec.getBroadcastForVariable(input3.getName());
		// partitioning-preserving mappartitions (key access required for broadcast lookup);
		// only wdivmm changes keys
		boolean noKeyChange = (qop.wtype3 == null || qop.wtype3.isBasic());
		out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), noKeyChange);
		bcVars.add(input2.getName());
		bcVars.add(input3.getName());
	} else {
		// reduce-side operation (two/three/four rdd inputs, zero/one/two broadcasts)
		PartitionedBroadcast<MatrixBlock> bc1 = _cacheU ? sec.getBroadcastForVariable(input2.getName()) : null;
		PartitionedBroadcast<MatrixBlock> bc2 = _cacheV ? sec.getBroadcastForVariable(input3.getName()) : null;
		JavaPairRDD<MatrixIndexes, MatrixBlock> inU = (!_cacheU) ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
		JavaPairRDD<MatrixIndexes, MatrixBlock> inV = (!_cacheV) ? sec.getBinaryBlockRDDHandleForVariable(input3.getName()) : null;
		JavaPairRDD<MatrixIndexes, MatrixBlock> inW = (qop.hasFourInputs() && !_input4.isLiteral()) ? sec.getBinaryBlockRDDHandleForVariable(_input4.getName()) : null;
		// preparation of transposed and replicated U
		if (inU != null) {
			inU = inU.flatMapToPair(new ReplicateBlockFunction(clen, bclen, true));
		}
		// preparation of transposed and replicated V
		if (inV != null) {
			inV = inV.mapToPair(new TransposeFactorIndexesFunction())
				.flatMapToPair(new ReplicateBlockFunction(rlen, brlen, false));
		}
		// function calls w/ two rdd inputs
		if (inU != null && inV == null && inW == null) {
			out = in.join(inU).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
		} else if (inU == null && inV != null && inW == null) {
			out = in.join(inV).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
		} else if (inU == null && inV == null && inW != null) {
			out = in.join(inW).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
		}
		// function calls w/ three rdd inputs
		else if (inU != null && inV != null && inW == null) {
			out = in.join(inU).join(inV).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
		} else if (inU != null && inV == null && inW != null) {
			out = in.join(inU).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
		} else if (inU == null && inV != null && inW != null) {
			out = in.join(inV).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
		}
		// function call w/ one rdd input only (no additional rdd inputs)
		else if (inU == null && inV == null && inW == null) {
			out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), false);
		}
		// function call w/ four rdd inputs (need keys in case of wdivmm)
		else {
			out = in.join(inU).join(inV).join(inW).mapToPair(new RDDQuaternaryFunction4(qop));
		}
		// keep variable names for lineage maintenance: each of input2/input3 was used
		// either as a broadcast or as an rdd; input4 only counts if it was an rdd
		if (inU == null) {
			bcVars.add(input2.getName());
		} else {
			rddVars.add(input2.getName());
		}
		if (inV == null) {
			bcVars.add(input3.getName());
		} else {
			rddVars.add(input3.getName());
		}
		if (inW != null) {
			rddVars.add(_input4.getName());
		}
	}
	// output handling, incl aggregation
	if (scalarOutput) {
		// map/redwsloss, map/redwcemm: full aggregate and cast to scalar
		MatrixBlock tmp = RDDAggregateUtils.sumStable(out);
		DoubleObject ret = new DoubleObject(tmp.getValue(0, 0));
		sec.setVariable(output.getName(), ret);
	} else {
		// map/redwsigmoid, map/redwdivmm, map/redwumm
		// aggregation if required (map/redwdivmm)
		if (qop.wtype3 != null && !qop.wtype3.isBasic()) {
			out = RDDAggregateUtils.sumByKeyStable(out, false);
		}
		// put output RDD handle into symbol table
		sec.setRDDHandleForVariable(output.getName(), out);
		// maintain lineage information for output rdd
		for (String rddVar : rddVars) {
			sec.addLineageRDD(output.getName(), rddVar);
		}
		for (String bcVar : bcVars) {
			sec.addLineageBroadcast(output.getName(), bcVar);
		}
		// update matrix characteristics
		updateOutputMatrixCharacteristics(sec, qop);
	}
}
use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project systemml by apache.
the class ReorgSPInstruction method processInstruction.
/**
 * Executes a reorg Spark instruction. Supported opcodes are {@code r'} (transpose),
 * {@code rev} (reverse), {@code rdiag} (diag) and {@code rsort} (order); any other
 * opcode is rejected. The resulting RDD is bound to the output variable with lineage
 * to the input.
 *
 * @param ec execution context, cast to {@code SparkExecutionContext}
 * @throws DMLRuntimeException if the opcode is none of the supported ones
 */
@Override
public void processInstruction(ExecutionContext ec) {
	final SparkExecutionContext sec = (SparkExecutionContext) ec;
	final String opcode = getOpcode();
	// input rdd handle and its characteristics
	final JavaPairRDD<MatrixIndexes, MatrixBlock> input = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
	final MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock> result;

	if (opcode.equalsIgnoreCase("r'")) {
		// TRANSPOSE
		result = input.mapToPair(new ReorgMapFunction(opcode));
	} else if (opcode.equalsIgnoreCase("rev")) {
		// REVERSE; blocks are merged by key only if the rows do not divide evenly into blocks
		result = input.flatMapToPair(new RDDRevFunction(mcIn));
		if (mcIn.getRows() % mcIn.getRowsPerBlock() != 0) {
			result = RDDAggregateUtils.mergeByKey(result, false);
		}
	} else if (opcode.equalsIgnoreCase("rdiag")) {
		// DIAG
		if (mcIn.getCols() == 1) {
			// diagV2M (single-column input)
			result = input.flatMapToPair(new RDDDiagV2MFunction(mcIn));
		} else {
			// diagM2V: keep the diagonal blocks only, then apply the reorg map function
			result = input.filter(new FilterDiagBlocksFunction()).mapToPair(new ReorgMapFunction(opcode));
		}
	} else if (opcode.equalsIgnoreCase("rsort")) {
		// ORDER: sort by column(s) in ascending/descending order, return index or data
		// read parameters; the order-by columns come from a matrix or a single scalar
		final long[] cols;
		if (_col.getDataType().isMatrix()) {
			cols = DataConverter.convertToLongVector(ec.getMatrixInput(_col.getName()));
		} else {
			cols = new long[] { ec.getScalarInput(_col.getName(), _col.getValueType(), _col.isLiteral()).getLongValue() };
		}
		final boolean desc = ec.getScalarInput(_desc.getName(), _desc.getValueType(), _desc.isLiteral()).getBooleanValue();
		final boolean ixret = ec.getScalarInput(_ixret.getName(), _ixret.getValueType(), _ixret.isLiteral()).getBooleanValue();
		final boolean asc = !desc;

		final long rows = mcIn.getRows();
		final long numCols = mcIn.getCols();
		final int brlen = mcIn.getRowsPerBlock();
		final int bclen = mcIn.getColsPerBlock();
		final boolean singleCol = (numCols == 1);

		result = input;
		if (cols.length > bclen) {
			LOG.warn("Unsupported sort with number of order-by columns large than blocksize: " + cols.length);
		}

		if (singleCol || cols.length == 1) {
			// extract the order-by column unless the input already is a single column
			if (!singleCol) {
				final int colInBlock = (int) UtilFunctions.computeCellInBlock(cols[0], bclen);
				result = result
					.filter(new IsBlockInRange(1, rows, cols[0], cols[0], mcIn))
					.mapValues(new ExtractColumn(colInBlock));
			}
			// actual index/data sort operation
			if (ixret) {
				// sort indexes
				result = RDDSortUtils.sortIndexesByVal(result, asc, rows, brlen);
			} else if (singleCol && asc) {
				// sort single-column matrix
				result = RDDSortUtils.sortByVal(result, rows, brlen);
			} else if (!_bSortIndInMem) {
				// sort multi-column matrix w/ rewrite
				result = RDDSortUtils.sortDataByVal(result, input, asc, rows, numCols, brlen, bclen);
			} else {
				// sort multi-column matrix
				result = RDDSortUtils.sortDataByValMemSort(result, input, asc, rows, numCols, brlen, bclen, sec, (ReorgOperator) _optr);
			}
		} else {
			// extract columns (if necessary)
			if (cols.length < numCols) {
				result = result.filter(new IsBlockInList(cols, mcIn)).mapToPair(new ExtractColumns(cols, mcIn));
			}
			// append extracted columns (if necessary)
			if (numCols > bclen) {
				result = RDDAggregateUtils.mergeByKey(result);
			}
			// actual index/data sort operation
			if (ixret) {
				// sort indexes
				result = RDDSortUtils.sortIndexesByVals(result, asc, rows, (long) cols.length, brlen);
			} else if (cols.length == numCols && asc) {
				// all columns are order-by columns, ascending
				result = RDDSortUtils.sortByVals(result, rows, cols.length, brlen);
			} else {
				// sort multi-column matrix
				result = RDDSortUtils.sortDataByVals(result, input, asc, rows, numCols, cols.length, brlen, bclen);
			}
		}
	} else {
		throw new DMLRuntimeException("Error: Incorrect opcode in ReorgSPInstruction:" + opcode);
	}

	// release the order-by column matrix acquired above via getMatrixInput
	if (opcode.equalsIgnoreCase("rsort") && _col.getDataType().isMatrix()) {
		sec.releaseMatrixInput(_col.getName());
	}
	// update output characteristics, then store output rdd handle and lineage
	updateReorgMatrixCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), result);
	sec.addLineageRDD(output.getName(), input1.getName());
}
Aggregations