use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class ReorgSPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
SparkExecutionContext sec = (SparkExecutionContext) ec;
String opcode = getOpcode();
//get input rdd handle
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
if (opcode.equalsIgnoreCase("r'")) { //TRANSPOSE
//execute transpose reorg operation
out = in1.mapToPair(new ReorgMapFunction(opcode));
} else if (opcode.equalsIgnoreCase("rev")) { //REVERSE
//execute reverse reorg operation
out = in1.flatMapToPair(new RDDRevFunction(mcIn));
if (mcIn.getRows() % mcIn.getRowsPerBlock() != 0)
out = RDDAggregateUtils.mergeByKey(out, false);
} else if (opcode.equalsIgnoreCase("rdiag")) { //DIAG
if (mcIn.getCols() == 1) {
// diagV2M
out = in1.flatMapToPair(new RDDDiagV2MFunction(mcIn));
} else {
//execute diagM2V operation
out = in1.filter(new FilterDiagBlocksFunction()).mapToPair(new ReorgMapFunction(opcode));
}
} else if (opcode.equalsIgnoreCase("rsort")) { //ORDER
// Sort by column 'col' in ascending/descending order and return either index/value
//get parameters
long col = ec.getScalarInput(_col.getName(), _col.getValueType(), _col.isLiteral()).getLongValue();
boolean desc = ec.getScalarInput(_desc.getName(), _desc.getValueType(), _desc.isLiteral()).getBooleanValue();
boolean ixret = ec.getScalarInput(_ixret.getName(), _ixret.getValueType(), _ixret.isLiteral()).getBooleanValue();
boolean singleCol = (mcIn.getCols() == 1);
// extract column (if necessary) and sort
out = in1;
if (!singleCol) {
out = out.filter(new IsBlockInRange(1, mcIn.getRows(), col, col, mcIn)).mapValues(new ExtractColumn((int) UtilFunctions.computeCellInBlock(col, mcIn.getColsPerBlock())));
}
//actual index/data sort operation
if (ixret) {
//sort indexes
out = RDDSortUtils.sortIndexesByVal(out, !desc, mcIn.getRows(), mcIn.getRowsPerBlock());
} else if (singleCol && !desc) {
//sort single-column matrix
out = RDDSortUtils.sortByVal(out, mcIn.getRows(), mcIn.getRowsPerBlock());
} else {
//sort multi-column matrix
if (!_bSortIndInMem)
out = RDDSortUtils.sortDataByVal(out, in1, !desc, mcIn.getRows(), mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock());
else
out = RDDSortUtils.sortDataByValMemSort(out, in1, !desc, mcIn.getRows(), mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock(), sec, (ReorgOperator) _optr);
}
} else {
throw new DMLRuntimeException("Error: Incorrect opcode in ReorgSPInstruction:" + opcode);
}
//store output rdd handle
updateReorgMatrixCharacteristics(sec);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
}
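For context, the transpose case needs no shuffle: each block is handled independently by swapping its MatrixIndexes and transposing its payload. Below is a minimal sketch of such a pair function, in the spirit of ReorgMapFunction for opcode "r'" (the class is hypothetical; the direct LibMatrixReorg.transpose call is an assumption made for brevity):
import org.apache.spark.api.java.function.PairFunction;
import org.apache.sysml.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import scala.Tuple2;

//hypothetical simplification of ReorgMapFunction for opcode "r'"
public class TransposeBlockFunction implements PairFunction<Tuple2<MatrixIndexes, MatrixBlock>, MatrixIndexes, MatrixBlock> {

    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<MatrixIndexes, MatrixBlock> call(Tuple2<MatrixIndexes, MatrixBlock> arg) throws Exception {
        //swap the row/column block index for the transposed output
        MatrixIndexes ixOut = new MatrixIndexes(arg._1().getColumnIndex(), arg._1().getRowIndex());
        //transpose the block payload (cell (i,j) becomes cell (j,i))
        MatrixBlock blkOut = LibMatrixReorg.transpose(arg._2(), new MatrixBlock());
        return new Tuple2<>(ixOut, blkOut);
    }
}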
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class QuaternarySPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
SparkExecutionContext sec = (SparkExecutionContext) ec;
QuaternaryOperator qop = (QuaternaryOperator) _optr;
//tracking of rdds and broadcasts (for lineage maintenance)
ArrayList<String> rddVars = new ArrayList<String>();
ArrayList<String> bcVars = new ArrayList<String>();
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
MatrixCharacteristics inMc = sec.getMatrixCharacteristics(input1.getName());
long rlen = inMc.getRows();
long clen = inMc.getCols();
int brlen = inMc.getRowsPerBlock();
int bclen = inMc.getColsPerBlock();
//(map/redwsloss, map/redwcemm); safe because these ops produce a scalar
if (qop.wtype1 != null || qop.wtype4 != null) {
in = in.filter(new FilterNonEmptyBlocksFunction());
}
//map-side only operation (one rdd input, two broadcasts)
if (WeightedSquaredLoss.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedSigmoid.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedDivMM.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedCrossEntropy.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedUnaryMM.OPCODE.equalsIgnoreCase(getOpcode())) {
PartitionedBroadcast<MatrixBlock> bc1 = sec.getBroadcastForVariable(input2.getName());
PartitionedBroadcast<MatrixBlock> bc2 = sec.getBroadcastForVariable(input3.getName());
//partitioning-preserving mappartitions (key access required for broadcast lookup)
//only wdivmm changes keys
boolean noKeyChange = (qop.wtype3 == null || qop.wtype3.isBasic());
out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), noKeyChange);
rddVars.add(input1.getName());
bcVars.add(input2.getName());
bcVars.add(input3.getName());
} else { //reduce-side operation (two/three/four rdd inputs, zero/one/two broadcasts)
PartitionedBroadcast<MatrixBlock> bc1 = _cacheU ? sec.getBroadcastForVariable(input2.getName()) : null;
PartitionedBroadcast<MatrixBlock> bc2 = _cacheV ? sec.getBroadcastForVariable(input3.getName()) : null;
JavaPairRDD<MatrixIndexes, MatrixBlock> inU = (!_cacheU) ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
JavaPairRDD<MatrixIndexes, MatrixBlock> inV = (!_cacheV) ? sec.getBinaryBlockRDDHandleForVariable(input3.getName()) : null;
JavaPairRDD<MatrixIndexes, MatrixBlock> inW = (qop.hasFourInputs() && !_input4.isLiteral()) ? sec.getBinaryBlockRDDHandleForVariable(_input4.getName()) : null;
//preparation of transposed and replicated U
if (inU != null)
inU = inU.flatMapToPair(new ReplicateBlocksFunction(clen, bclen, true));
//preparation of transposed and replicated V
if (inV != null)
inV = inV.mapToPair(new TransposeFactorIndexesFunction()).flatMapToPair(new ReplicateBlocksFunction(rlen, brlen, false));
//function calls w/ two rdd inputs
if (inU != null && inV == null && inW == null)
out = in.join(inU).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
else if (inU == null && inV != null && inW == null)
out = in.join(inV).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
else if (inU == null && inV == null && inW != null)
out = in.join(inW).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
//function calls w/ three rdd inputs
else if (inU != null && inV != null && inW == null)
out = in.join(inU).join(inV).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
else if (inU != null && inV == null && inW != null)
out = in.join(inU).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
else if (inU == null && inV != null && inW != null)
out = in.join(inV).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
else if (inU == null && inV == null && inW == null) {
out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), false);
} else {
//function call w/ four rdd inputs (need keys in case of wdivmm)
out = in.join(inU).join(inV).join(inW).mapToPair(new RDDQuaternaryFunction4(qop));
}
//keep variable names for lineage maintenance
if (inU == null)
bcVars.add(input2.getName());
else
rddVars.add(input2.getName());
if (inV == null)
bcVars.add(input3.getName());
else
rddVars.add(input3.getName());
if (inW != null)
rddVars.add(_input4.getName());
}
//output handling, incl aggregation
if (qop.wtype1 != null || qop.wtype4 != null) { //map/redwsloss, map/redwcemm
//full aggregate and cast to scalar
MatrixBlock tmp = RDDAggregateUtils.sumStable(out);
DoubleObject ret = new DoubleObject(tmp.getValue(0, 0));
sec.setVariable(output.getName(), ret);
} else { //map/redwsigmoid, map/redwdivmm, map/redwumm
//aggregation if required (map/redwdivmm)
if (qop.wtype3 != null && !qop.wtype3.isBasic())
out = RDDAggregateUtils.sumByKeyStable(out, false);
//put output RDD handle into symbol table
sec.setRDDHandleForVariable(output.getName(), out);
//maintain lineage information for output rdd
for (String rddVar : rddVars) sec.addLineageRDD(output.getName(), rddVar);
for (String bcVar : bcVars) sec.addLineageBroadcast(output.getName(), bcVar);
//update matrix characteristics
updateOutputMatrixCharacteristics(sec, qop);
}
}
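The replication before the joins is what makes the keys line up: X's blocks are keyed by (i,j), while a block of U carries only a row-block index (and transposed V only a column-block index), so each factor block must be emitted once per block position along the other dimension. A hedged sketch of that idea follows (hypothetical class; the real ReplicateBlocksFunction may differ in signature and Spark API version):
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import scala.Tuple2;

//hypothetical sketch of block replication for the quaternary joins
public class ReplicateBlocksSketch implements PairFlatMapFunction<Tuple2<MatrixIndexes, MatrixBlock>, MatrixIndexes, MatrixBlock> {

    private static final long serialVersionUID = 1L;
    private final long _len;     //length of the replicated dimension of X
    private final long _blen;    //block size along that dimension
    private final boolean _left; //true: replicate U across column blocks

    public ReplicateBlocksSketch(long len, long blen, boolean left) {
        _len = len;
        _blen = blen;
        _left = left;
    }

    @Override
    public Iterator<Tuple2<MatrixIndexes, MatrixBlock>> call(Tuple2<MatrixIndexes, MatrixBlock> arg) throws Exception {
        long numBlocks = (long) Math.ceil((double) _len / _blen);
        ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret = new ArrayList<>();
        MatrixIndexes ix = arg._1();
        for (long i = 1; i <= numBlocks; i++) {
            //U blocks keep their row-block index, V blocks their column-block index
            MatrixIndexes ixOut = _left ? new MatrixIndexes(ix.getRowIndex(), i) : new MatrixIndexes(i, ix.getColumnIndex());
            ret.add(new Tuple2<>(ixOut, arg._2()));
        }
        return ret.iterator();
    }
}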
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class RandSPInstruction method generateSequence.
private void generateSequence(SparkExecutionContext sec) throws DMLRuntimeException {
//sanity check valid increment
if (seq_incr == 0) {
throw new DMLRuntimeException("ERROR: While performing seq(" + seq_from + "," + seq_to + "," + seq_incr + ")");
}
//handle default 1 to -1 for special case of from>to
seq_incr = LibMatrixDatagen.updateSeqIncr(seq_from, seq_to, seq_incr);
if (LOG.isTraceEnabled())
LOG.trace("Process RandSPInstruction seq with seqFrom=" + seq_from + ", seqTo=" + seq_to + ", seqIncr" + seq_incr);
//step 1: offset generation
JavaRDD<Double> offsetsRDD = null;
long nnz = (long) Math.abs(Math.round((seq_to - seq_from) / seq_incr)) + 1;
//overestimate size on disk to ensure one hdfs block per partition
double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(nnz, 1, rowsInBlock, colsInBlock, nnz);
double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
long numBlocks = (long) Math.ceil(((double) nnz) / rowsInBlock);
//a) in-memory offset rdd construction
if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
ArrayList<Double> offsets = new ArrayList<Double>();
for (long i = 0; i < numBlocks; i++) {
double off = seq_from + seq_incr * i * rowsInBlock;
offsets.add(off);
}
//for load balancing: degree of parallelism such that ~128MB per partition
int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
//create offset rdd
offsetsRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
} else { //b) file-based offset rdd construction (for robustness wrt large number of blocks)
Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
PrintWriter pw = null;
try {
FileSystem fs = IOUtilFunctions.getFileSystem(path);
pw = new PrintWriter(fs.create(path));
for (long i = 0; i < numBlocks; i++) {
double off = seq_from + seq_incr * i * rowsInBlock;
pw.println(off);
}
} catch (IOException ex) {
throw new DMLRuntimeException(ex);
} finally {
IOUtilFunctions.closeSilently(pw);
}
//for load balancing: degree of parallelism such that ~128MB per partition
int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
//create offsets rdd
offsetsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).map(new ExtractOffsetTuple());
}
//sanity check number of non-zeros
if (nnz != rows && rows != -1) {
throw new DMLRuntimeException("Incorrect number of non-zeros: " + nnz + " != " + rows);
}
//step 2: execute seq instruction over offset input
JavaPairRDD<MatrixIndexes, MatrixBlock> out = offsetsRDD.mapToPair(new GenerateSequenceBlock(rowsInBlock, seq_from, seq_to, seq_incr));
//step 3: output handling
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
if (!mcOut.dimsKnown()) {
mcOut.set(nnz, 1, rowsInBlock, colsInBlock, nnz);
}
sec.setRDDHandleForVariable(output.getName(), out);
}
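Each offset produced in step 1 is the first value of one row block, so step 2 can materialize all blocks independently. A hedged sketch of what a GenerateSequenceBlock-style function computes per offset, assuming an ascending sequence and MatrixBlock.seqOperations as the block-level generator (both assumptions made for brevity):
import org.apache.spark.api.java.function.PairFunction;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import scala.Tuple2;

//hypothetical sketch of per-block sequence generation (ascending case only)
public class GenerateSequenceBlockSketch implements PairFunction<Double, MatrixIndexes, MatrixBlock> {

    private static final long serialVersionUID = 1L;
    private final long _brlen;
    private final double _from, _to, _incr;

    public GenerateSequenceBlockSketch(long brlen, double from, double to, double incr) {
        _brlen = brlen;
        _from = from;
        _to = to;
        _incr = incr;
    }

    @Override
    public Tuple2<MatrixIndexes, MatrixBlock> call(Double offset) throws Exception {
        //clip the block's last value to the global end of the sequence
        double blockTo = Math.min(_to, offset + _incr * (_brlen - 1));
        //derive the 1-based row-block index from the offset
        long rix = 1 + (long) Math.round((offset - _from) / _incr) / _brlen;
        MatrixBlock mb = MatrixBlock.seqOperations(offset, blockTo, _incr);
        return new Tuple2<>(new MatrixIndexes(rix, 1), mb);
    }
}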
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class RandSPInstruction method generateRandData.
private void generateRandData(SparkExecutionContext sec) throws DMLRuntimeException {
//step 1: generate pseudo-random seed (because not specified)
//seed per invocation
long lSeed = seed;
if (lSeed == DataGenOp.UNSPECIFIED_SEED)
lSeed = DataGenOp.generateRandomSeed();
if (LOG.isTraceEnabled())
LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");
//step 2: potential in-memory rand operations if applicable
if (isMemAvail(rows, cols, sparsity, minValue, maxValue) && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows, (int) cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);
sec.setMatrixOutput(output.getName(), mb);
Statistics.decrementNoOfExecutedSPInst();
return;
}
//step 3: seed generation
JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD = null;
Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity);
PrimitiveIterator.OfLong nnzIter = nnz.iterator();
//overestimate size on disk to ensure one hdfs block per partition
double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(rows, cols, rowsInBlock, colsInBlock, rows * cols * sparsity);
double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
long numBlocks = new MatrixCharacteristics(rows, cols, rowsInBlock, colsInBlock).getNumBlocks();
long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock);
//a) in-memory seed rdd construction
if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>>();
for (long i = 0; i < numBlocks; i++) {
long r = 1 + i / numColBlocks;
long c = 1 + i % numColBlocks;
MatrixIndexes indx = new MatrixIndexes(r, c);
Long seedForBlock = bigrand.nextLong();
seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx, new Tuple2<Long, Long>(seedForBlock, nnzIter.nextLong())));
}
//for load balancing: degree of parallelism such that ~128MB per partition
int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
//create seeds rdd
seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions);
} else { //b) file-based seed rdd construction (for robustness wrt large number of blocks)
Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
PrintWriter pw = null;
try {
FileSystem fs = IOUtilFunctions.getFileSystem(path);
pw = new PrintWriter(fs.create(path));
StringBuilder sb = new StringBuilder();
for (long i = 0; i < numBlocks; i++) {
sb.append(1 + i / numColBlocks);
sb.append(',');
sb.append(1 + i % numColBlocks);
sb.append(',');
sb.append(bigrand.nextLong());
sb.append(',');
sb.append(nnzIter.nextLong());
pw.println(sb.toString());
sb.setLength(0);
}
} catch (IOException ex) {
throw new DMLRuntimeException(ex);
} finally {
IOUtilFunctions.closeSilently(pw);
}
//for load balancing: degree of parallelism such that ~128MB per partition
int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
//create seeds rdd
seedsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).mapToPair(new ExtractSeedTuple());
}
//step 4: execute rand instruction over seed input
JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));
//step 5: output handling
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
if (!mcOut.dimsKnown(true)) {
//note: we cannot compute the nnz from sparsity because this would not reflect the
//actual number of non-zeros, except for the extreme sparsity values 0 and 1.
long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1;
mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz);
}
sec.setRDDHandleForVariable(output.getName(), out);
}
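The degree-of-parallelism heuristic used in both branches targets roughly one HDFS block (128MB by default) per partition, capped at the number of matrix blocks and floored at one. A worked example with illustrative numbers (not taken from the source):
//worked example of the numPartitions heuristic, with made-up sizes
public class PartitioningExample {

    public static void main(String[] args) {
        //illustrative numbers: 10M x 10 dense matrix in 1000 x 1000 blocks
        double totalSize = 8d * 10_000_000 * 10;  //~800MB dense size estimate (8 bytes/cell)
        double hdfsBlkSize = 128d * 1024 * 1024;  //128MB default hdfs block size
        long numBlocks = 10_000L;                 //ceil(1e7/1000) * ceil(10/1000)
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        //-> (int) min(5.96, 10000) = 5 partitions of ~160MB each
        System.out.println(numPartitions);
    }
}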
use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.
the class RandSPInstruction method generateSample.
/**
* Helper function to construct a sample.
*
* @param sec spark execution context
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private void generateSample(SparkExecutionContext sec) throws DMLRuntimeException {
if (maxValue < rows && !replace)
throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue + ") can only be generated with replacement.");
if (LOG.isTraceEnabled())
LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + rows + ", replace=" + replace + ", seed=" + seed);
// sampling rate that guarantees a sample of size >= the requested size 99.99% of the time.
double fraction = SamplingUtils.computeFractionForSampleSize((int) rows, UtilFunctions.toLong(maxValue), replace);
Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed);
// divide the population range across numPartitions by creating SampleTasks
double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
long outputSize = MatrixBlock.estimateSizeDenseInMemory(rows, 1);
int numPartitions = (int) Math.ceil((double) outputSize / hdfsBlockSize);
long partitionSize = (long) Math.ceil(maxValue / numPartitions);
ArrayList<SampleTask> offsets = new ArrayList<SampleTask>();
long st = 1;
while (st <= maxValue) {
SampleTask s = new SampleTask();
s.range_start = st;
s.seed = bigrand.nextLong();
offsets.add(s);
st = st + partitionSize;
}
JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
// Construct the sample in a distributed manner
JavaRDD<Double> rdd = offsetRDD.flatMap(new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize));
// Randomize the sampled elements
JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values();
// Trim the sampled list to required size & attach matrix indexes to randomized elements
JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex().filter(new TrimSample(rows)).mapToPair(new Double2MatrixCell());
MatrixCharacteristics mcOut = new MatrixCharacteristics(rows, 1, rowsInBlock, colsInBlock, rows);
// Construct BinaryBlock representation
JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true);
MatrixCharacteristics retDims = sec.getMatrixCharacteristics(output.getName());
retDims.setNonZeros(rows);
sec.setRDDHandleForVariable(output.getName(), mbRDD);
}
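Note that the sampling fraction deliberately oversamples so the draw reaches the requested size with high probability; the zipWithIndex/filter pair then trims the randomized result back down to exactly 'rows' elements. A hedged sketch of a TrimSample-style filter (hypothetical class, for illustration):
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

//hypothetical sketch of TrimSample: keep only the first 'size' elements
//of the randomized, zipWithIndex-ed sample
public class TrimSampleSketch implements Function<Tuple2<Double, Long>, Boolean> {

    private static final long serialVersionUID = 1L;
    private final long _size;

    public TrimSampleSketch(long size) {
        _size = size;
    }

    @Override
    public Boolean call(Tuple2<Double, Long> v) throws Exception {
        //zipWithIndex assigns 0-based positions; keep positions 0.._size-1
        return v._2() < _size;
    }
}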