Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
The class QuaternarySPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    QuaternaryOperator qop = (QuaternaryOperator) _optr;
    // tracking of rdds and broadcasts (for lineage maintenance)
    ArrayList<String> rddVars = new ArrayList<>();
    ArrayList<String> bcVars = new ArrayList<>();
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    MatrixCharacteristics inMc = sec.getMatrixCharacteristics(input1.getName());
    long rlen = inMc.getRows();
    long clen = inMc.getCols();
    int brlen = inMc.getRowsPerBlock();
    int bclen = inMc.getColsPerBlock();
    // (map/redwsloss, map/redwcemm); safe because these ops produce a scalar
    if (qop.wtype1 != null || qop.wtype4 != null) {
        in = in.filter(new FilterNonEmptyBlocksFunction());
    }
    // map-side only operation (one rdd input, two broadcasts)
    if (WeightedSquaredLoss.OPCODE.equalsIgnoreCase(getOpcode())
        || WeightedSigmoid.OPCODE.equalsIgnoreCase(getOpcode())
        || WeightedDivMM.OPCODE.equalsIgnoreCase(getOpcode())
        || WeightedCrossEntropy.OPCODE.equalsIgnoreCase(getOpcode())
        || WeightedUnaryMM.OPCODE.equalsIgnoreCase(getOpcode())) {
        PartitionedBroadcast<MatrixBlock> bc1 = sec.getBroadcastForVariable(input2.getName());
        PartitionedBroadcast<MatrixBlock> bc2 = sec.getBroadcastForVariable(input3.getName());
        // partitioning-preserving mappartitions (key access required for broadcast lookup);
        // only wdivmm changes keys
        boolean noKeyChange = (qop.wtype3 == null || qop.wtype3.isBasic());
        out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), noKeyChange);
        rddVars.add(input1.getName());
        bcVars.add(input2.getName());
        bcVars.add(input3.getName());
    }
    else { // reduce-side operation (two/three/four rdd inputs, zero/one/two broadcasts)
        PartitionedBroadcast<MatrixBlock> bc1 = _cacheU ? sec.getBroadcastForVariable(input2.getName()) : null;
        PartitionedBroadcast<MatrixBlock> bc2 = _cacheV ? sec.getBroadcastForVariable(input3.getName()) : null;
        JavaPairRDD<MatrixIndexes, MatrixBlock> inU = (!_cacheU) ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
        JavaPairRDD<MatrixIndexes, MatrixBlock> inV = (!_cacheV) ? sec.getBinaryBlockRDDHandleForVariable(input3.getName()) : null;
        JavaPairRDD<MatrixIndexes, MatrixBlock> inW = (qop.hasFourInputs() && !_input4.isLiteral()) ? sec.getBinaryBlockRDDHandleForVariable(_input4.getName()) : null;
        // preparation of transposed and replicated U
        if (inU != null)
            inU = inU.flatMapToPair(new ReplicateBlockFunction(clen, bclen, true));
        // preparation of transposed and replicated V
        if (inV != null)
            inV = inV.mapToPair(new TransposeFactorIndexesFunction())
                .flatMapToPair(new ReplicateBlockFunction(rlen, brlen, false));
        // function calls w/ two rdd inputs
        if (inU != null && inV == null && inW == null)
            out = in.join(inU).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
        else if (inU == null && inV != null && inW == null)
            out = in.join(inV).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
        else if (inU == null && inV == null && inW != null)
            out = in.join(inW).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
        // function calls w/ three rdd inputs
        else if (inU != null && inV != null && inW == null)
            out = in.join(inU).join(inV).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
        else if (inU != null && inV == null && inW != null)
            out = in.join(inU).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
        else if (inU == null && inV != null && inW != null)
            out = in.join(inV).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
        else if (inU == null && inV == null && inW == null)
            out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), false);
        // function call w/ four rdd inputs (need keys in case of wdivmm)
        else
            out = in.join(inU).join(inV).join(inW).mapToPair(new RDDQuaternaryFunction4(qop));
        // keep variable names for lineage maintenance
        if (inU == null)
            bcVars.add(input2.getName());
        else
            rddVars.add(input2.getName());
        if (inV == null)
            bcVars.add(input3.getName());
        else
            rddVars.add(input3.getName());
        if (inW != null)
            rddVars.add(_input4.getName());
    }
    // output handling, incl aggregation
    if (qop.wtype1 != null || qop.wtype4 != null) { // map/redwsloss, map/redwcemm
        // full aggregate and cast to scalar
        MatrixBlock tmp = RDDAggregateUtils.sumStable(out);
        DoubleObject ret = new DoubleObject(tmp.getValue(0, 0));
        sec.setVariable(output.getName(), ret);
    }
    else { // map/redwsigmoid, map/redwdivmm, map/redwumm
        // aggregation if required (map/redwdivmm)
        if (qop.wtype3 != null && !qop.wtype3.isBasic())
            out = RDDAggregateUtils.sumByKeyStable(out, false);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        // maintain lineage information for output rdd
        for (String rddVar : rddVars)
            sec.addLineageRDD(output.getName(), rddVar);
        for (String bcVar : bcVars)
            sec.addLineageBroadcast(output.getName(), bcVar);
        // update matrix characteristics
        updateOutputMatrixCharacteristics(sec, qop);
    }
}
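Both paths above are driven by the block metadata read from MatrixCharacteristics at the top of the method: before the reduce-side joins, U is replicated once per column block of the data matrix and the transposed V once per row block. A minimal, self-contained sketch of that arithmetic, using only the constructor and getters that appear on this page (the concrete dimensions and the class name are illustrative, not from the source):

import org.apache.sysml.runtime.matrix.MatrixCharacteristics;

public class QuaternaryMetaSketch {
    public static void main(String[] args) {
        // illustrative: 10000 x 2000 matrix in 1000 x 1000 blocks
        MatrixCharacteristics inMc = new MatrixCharacteristics(10000, 2000, 1000, 1000);
        long rlen = inMc.getRows();
        long clen = inMc.getCols();
        int brlen = inMc.getRowsPerBlock();
        int bclen = inMc.getColsPerBlock();
        // U is replicated once per column block (clen/bclen copies) and V once per
        // row block (rlen/brlen copies), mirroring the ReplicateBlockFunction calls above
        System.out.println("U replication: " + (long) Math.ceil((double) clen / bclen)); // 2
        System.out.println("V replication: " + (long) Math.ceil((double) rlen / brlen)); // 10
    }
}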
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
The class RandSPInstruction, method generateRandData.
private void generateRandData(SparkExecutionContext sec) {
    long lrows = sec.getScalarInput(rows).getLongValue();
    long lcols = sec.getScalarInput(cols).getLongValue();
    // step 1: generate pseudo-random seed (because not specified); seed per invocation
    long lSeed = seed;
    if (lSeed == DataGenOp.UNSPECIFIED_SEED)
        lSeed = DataGenOp.generateRandomSeed();
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");
    // step 2: potential in-memory rand operations if applicable
    if (isMemAvail(lrows, lcols, sparsity, minValue, maxValue) && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
        RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(
            pdf, (int) lrows, (int) lcols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
        MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);
        sec.setMatrixOutput(output.getName(), mb, getExtendedOpcode());
        Statistics.decrementNoOfExecutedSPInst();
        return;
    }
    // step 3: seed generation
    JavaPairRDD<MatrixIndexes, Long> seedsRDD = null;
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
    // overestimate for on-disk size, ensures one hdfs block per partition
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(lrows, lcols, rowsInBlock, colsInBlock, sparsity);
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    MatrixCharacteristics tmp = new MatrixCharacteristics(lrows, lcols, rowsInBlock, colsInBlock);
    long numBlocks = tmp.getNumBlocks();
    long numColBlocks = tmp.getNumColBlocks();
    // a) in-memory seed rdd construction
    if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
        ArrayList<Tuple2<MatrixIndexes, Long>> seeds = new ArrayList<>();
        for (long i = 0; i < numBlocks; i++) {
            long r = 1 + i / numColBlocks;
            long c = 1 + i % numColBlocks;
            MatrixIndexes indx = new MatrixIndexes(r, c);
            Long seedForBlock = bigrand.nextLong();
            seeds.add(new Tuple2<>(indx, seedForBlock));
        }
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create seeds rdd
        seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions);
    }
    else { // b) file-based seed rdd construction (for robustness wrt large number of blocks)
        Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
        PrintWriter pw = null;
        try {
            FileSystem fs = IOUtilFunctions.getFileSystem(path);
            pw = new PrintWriter(fs.create(path));
            StringBuilder sb = new StringBuilder();
            for (long i = 0; i < numBlocks; i++) {
                sb.append(1 + i / numColBlocks);
                sb.append(',');
                sb.append(1 + i % numColBlocks);
                sb.append(',');
                sb.append(bigrand.nextLong());
                pw.println(sb.toString());
                sb.setLength(0);
            }
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        } finally {
            IOUtilFunctions.closeSilently(pw);
        }
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create seeds rdd
        seedsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).mapToPair(new ExtractSeedTuple());
    }
    // step 4: execute rand instruction over seed input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(
        new GenerateRandomBlock(lrows, lcols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));
    // step 5: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown(true)) {
        // note: we cannot compute the nnz from sparsity because this would not reflect the
        // actual number of non-zeros, except for the extreme values sparsity == 0 or 1
        long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * lrows * lcols) : -1;
        mcOut.set(lrows, lcols, rowsInBlock, colsInBlock, lnnz);
    }
    sec.setRDDHandleForVariable(output.getName(), out);
}
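Step 3a linearizes the block grid: seed i is assigned the 1-based block index pair obtained by dividing by, and taking the remainder of, the number of column blocks. A small sketch of exactly that mapping, reusing MatrixCharacteristics and MatrixIndexes from the snippet above (dimensions and class name illustrative; the MatrixIndexes import path is assumed from the SystemML source layout):

import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;

public class SeedIndexSketch {
    public static void main(String[] args) {
        // illustrative: 2500 x 2500 matrix in 1000 x 1000 blocks => 3 x 3 = 9 blocks
        MatrixCharacteristics tmp = new MatrixCharacteristics(2500, 2500, 1000, 1000);
        long numBlocks = tmp.getNumBlocks();       // 9
        long numColBlocks = tmp.getNumColBlocks(); // 3
        // same linearization as step 3a of generateRandData
        for (long i = 0; i < numBlocks; i++) {
            MatrixIndexes indx = new MatrixIndexes(1 + i / numColBlocks, 1 + i % numColBlocks);
            System.out.println("block " + i + " -> " + indx);
        }
    }
}

The same row/column arithmetic also produces the comma-separated lines written in the file-based branch, which ExtractSeedTuple parses back into (MatrixIndexes, seed) pairs.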
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
The class RandSPInstruction, method generateSample.
/**
* Helper function to construct a sample.
*
* @param sec spark execution context
*/
private void generateSample(SparkExecutionContext sec) {
    long lrows = sec.getScalarInput(rows).getLongValue();
    if (maxValue < lrows && !replace)
        throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue + ") can only be generated with replacement.");
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + lrows + ", replace=" + replace + ", seed=" + seed);
    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time
    double fraction = SamplingUtils.computeFractionForSampleSize((int) lrows, UtilFunctions.toLong(maxValue), replace);
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed);
    // divide the population range across numPartitions by creating SampleTasks
    double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long outputSize = MatrixBlock.estimateSizeDenseInMemory(lrows, 1);
    int numPartitions = (int) Math.ceil((double) outputSize / hdfsBlockSize);
    long partitionSize = (long) Math.ceil(maxValue / numPartitions);
    ArrayList<SampleTask> offsets = new ArrayList<>();
    long st = 1;
    while (st <= maxValue) {
        SampleTask s = new SampleTask();
        s.range_start = st;
        s.seed = bigrand.nextLong();
        offsets.add(s);
        st = st + partitionSize;
    }
    JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
    // construct the sample in a distributed manner
    JavaRDD<Double> rdd = offsetRDD.flatMap(new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize));
    // randomize the sampled elements
    JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values();
    // trim the sampled list to required size & attach matrix indexes to randomized elements
    JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex().filter(new TrimSample(lrows)).mapToPair(new Double2MatrixCell());
    MatrixCharacteristics mcOut = new MatrixCharacteristics(lrows, 1, rowsInBlock, colsInBlock, lrows);
    // construct BinaryBlock representation
    JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true);
    sec.getMatrixCharacteristics(output.getName()).setNonZeros(lrows);
    sec.setRDDHandleForVariable(output.getName(), mbRDD);
}
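The partitioning logic above sizes each partition so the dense output occupies roughly one HDFS block, then tiles the population range [1, maxValue] into equally sized sample tasks. A plain-Java sketch of that arithmetic with illustrative values (the 8-bytes-per-entry estimate is a crude stand-in for MatrixBlock.estimateSizeDenseInMemory):

public class SamplePartitionSketch {
    public static void main(String[] args) {
        long lrows = 1_000_000L;                   // requested sample size
        double maxValue = 10_000_000d;             // population size
        double hdfsBlockSize = 128 * 1024 * 1024d; // 128MB HDFS block
        long outputSize = lrows * 8L;              // rough dense size of an lrows x 1 vector
        int numPartitions = (int) Math.ceil(outputSize / hdfsBlockSize);
        long partitionSize = (long) Math.ceil(maxValue / numPartitions);
        // one SampleTask per sub-range of the population, as in generateSample
        for (long st = 1; st <= maxValue; st += partitionSize)
            System.out.println("sample task for range starting at " + st);
    }
}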
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
The class RandSPInstruction, method generateSequence.
private void generateSequence(SparkExecutionContext sec) {
    double lfrom = sec.getScalarInput(seq_from).getDoubleValue();
    double lto = sec.getScalarInput(seq_to).getDoubleValue();
    double lincr = sec.getScalarInput(seq_incr).getDoubleValue();
    // sanity check valid increment
    if (lincr == 0) {
        throw new DMLRuntimeException("ERROR: While performing seq(" + lfrom + "," + lto + "," + lincr + ")");
    }
    // handle default increment of 1 (or -1 for the special case of from > to)
    lincr = LibMatrixDatagen.updateSeqIncr(lfrom, lto, lincr);
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction seq with seqFrom=" + lfrom + ", seqTo=" + lto + ", seqIncr=" + lincr);
    // step 1: offset generation
    JavaRDD<Double> offsetsRDD = null;
    long nnz = UtilFunctions.getSeqLength(lfrom, lto, lincr);
    // overestimate for on-disk size, ensures one hdfs block per partition
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(nnz, 1, rowsInBlock, colsInBlock, nnz);
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long numBlocks = (long) Math.ceil(((double) nnz) / rowsInBlock);
    // a) in-memory offset rdd construction
    if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
        ArrayList<Double> offsets = new ArrayList<>();
        for (long i = 0; i < numBlocks; i++) {
            double off = lfrom + lincr * i * rowsInBlock;
            offsets.add(off);
        }
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create offsets rdd
        offsetsRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
    }
    else { // b) file-based offset rdd construction (for robustness wrt large number of blocks)
        Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
        PrintWriter pw = null;
        try {
            FileSystem fs = IOUtilFunctions.getFileSystem(path);
            pw = new PrintWriter(fs.create(path));
            for (long i = 0; i < numBlocks; i++) {
                double off = lfrom + lincr * i * rowsInBlock;
                pw.println(off);
            }
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        } finally {
            IOUtilFunctions.closeSilently(pw);
        }
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create offsets rdd
        offsetsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).map(new ExtractOffsetTuple());
    }
    // step 2: execute seq instruction over offset input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = offsetsRDD.mapToPair(new GenerateSequenceBlock(rowsInBlock, lfrom, lto, lincr));
    // step 3: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown()) {
        mcOut.set(nnz, 1, rowsInBlock, colsInBlock, nnz);
    }
    sec.setRDDHandleForVariable(output.getName(), out);
}
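Step 1 emits exactly one offset per row block: block i starts at value lfrom + lincr * i * rowsInBlock, and the number of blocks is the sequence length divided by the block size, rounded up. A minimal sketch with illustrative values (the inline length formula assumes a valid from/to/incr combination and stands in for UtilFunctions.getSeqLength):

public class SeqOffsetSketch {
    public static void main(String[] args) {
        double lfrom = 1, lto = 2500, lincr = 1; // illustrative seq(1, 2500, 1)
        int rowsInBlock = 1000;
        long nnz = (long) Math.floor((lto - lfrom) / lincr) + 1;       // sequence length: 2500
        long numBlocks = (long) Math.ceil((double) nnz / rowsInBlock); // 3 row blocks
        // one offset per row block, as in step 1 of generateSequence
        for (long i = 0; i < numBlocks; i++)
            System.out.println("block " + (i + 1) + " starts at " + (lfrom + lincr * i * rowsInBlock));
    }
}

Running this prints block starts 1.0, 1001.0, and 2001.0; GenerateSequenceBlock then expands each offset into the corresponding row block of the column vector.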
Use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.
The class ReblockSPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // set the output characteristics
    CacheableData<?> obj = sec.getCacheableData(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    mcOut.set(mc.getRows(), mc.getCols(), brlen, bclen, mc.getNonZeros());
    // get the source format from the metadata
    MetaDataFormat iimd = (MetaDataFormat) obj.getMetaData();
    if (iimd == null)
        throw new DMLRuntimeException("Error: Metadata not found");
    InputInfo iinfo = iimd.getInputInfo();
    // check for in-memory reblock (w/ lazy spark context, potential for latency reduction)
    if (Recompiler.checkCPReblock(sec, input1.getName())) {
        if (input1.getDataType() == DataType.MATRIX)
            Recompiler.executeInMemoryMatrixReblock(sec, input1.getName(), output.getName());
        else if (input1.getDataType() == DataType.FRAME)
            Recompiler.executeInMemoryFrameReblock(sec, input1.getName(), output.getName());
        return;
    }
    // execute matrix/frame reblock
    if (input1.getDataType() == DataType.MATRIX)
        processMatrixReblockInstruction(sec, iinfo);
    else if (input1.getDataType() == DataType.FRAME)
        processFrameReblockInstruction(sec, iinfo);
}
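Note that the reblock itself only changes the blocking: rows, columns, and the non-zero count carry over unchanged from the source, which is exactly what the mcOut.set(...) call at the top expresses. A minimal sketch of that metadata hand-off (concrete numbers illustrative; the no-argument MatrixCharacteristics constructor is assumed available):

import org.apache.sysml.runtime.matrix.MatrixCharacteristics;

public class ReblockMetaSketch {
    public static void main(String[] args) {
        // illustrative source: 5000 x 400 matrix in 500 x 400 blocks, 120000 non-zeros
        MatrixCharacteristics mc = new MatrixCharacteristics(5000, 400, 500, 400);
        mc.setNonZeros(120000);
        // reblock to 1000 x 1000 blocks: dimensions and nnz are preserved
        MatrixCharacteristics mcOut = new MatrixCharacteristics();
        mcOut.set(mc.getRows(), mc.getCols(), 1000, 1000, mc.getNonZeros());
        System.out.println(mcOut);
    }
}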