Search in sources :

Example 46 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class QuaternarySPInstruction method processInstruction.

/**
 * Executes this quaternary instruction over the binary-block RDD of {@code input1}.
 *
 * <p>Two execution strategies are visible: a map-side path (one RDD input, the second
 * and third inputs broadcast) selected by opcode, and a reduce-side path where each of
 * the second, third and optional fourth inputs is either broadcast or joined in as an
 * RDD. The result is stored either as a scalar {@code DoubleObject} (when
 * {@code qop.wtype1} or {@code qop.wtype4} is set) or as an RDD handle for the output
 * variable, together with lineage entries and updated output matrix characteristics.
 *
 * <p>NOTE(review): this excerpt appears to be missing several lines (closing braces and
 * the bodies of the lineage {@code if} statements near the end of the reduce-side
 * branch); confirm against the upstream source before relying on it.
 *
 * @param ec execution context; cast unconditionally to {@code SparkExecutionContext}
 */
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    QuaternaryOperator qop = (QuaternaryOperator) _optr;
    // tracking of rdds and broadcasts (for lineage maintenance)
    // NOTE(review): no visible statement in this excerpt adds to either list.
    ArrayList<String> rddVars = new ArrayList<>();
    ArrayList<String> bcVars = new ArrayList<>();
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    // dimensions and block sizes of the first input; used below to replicate U and V blocks
    MatrixCharacteristics inMc = sec.getMatrixCharacteristics(input1.getName());
    long rlen = inMc.getRows();
    long clen = inMc.getCols();
    int brlen = inMc.getRowsPerBlock();
    int bclen = inMc.getColsPerBlock();
    // pre-filter empty input blocks (map/redwsloss, map/redwcemm); safe because these ops produce a scalar
    if (qop.wtype1 != null || qop.wtype4 != null) {
        in = in.filter(new FilterNonEmptyBlocksFunction());
    // map-side only operation (one rdd input, two broadcasts)
    if (WeightedSquaredLoss.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedSigmoid.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedDivMM.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedCrossEntropy.OPCODE.equalsIgnoreCase(getOpcode()) || WeightedUnaryMM.OPCODE.equalsIgnoreCase(getOpcode())) {
        PartitionedBroadcast<MatrixBlock> bc1 = sec.getBroadcastForVariable(input2.getName());
        PartitionedBroadcast<MatrixBlock> bc2 = sec.getBroadcastForVariable(input3.getName());
        // partitioning-preserving mappartitions (key access required for broadcast lookup)
        // only wdivmm changes keys
        boolean noKeyChange = (qop.wtype3 == null || qop.wtype3.isBasic());
        out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), noKeyChange);
    } else // reduce-side operation (two/three/four rdd inputs, zero/one/two broadcasts)
        // U and V are each either a broadcast (_cacheU/_cacheV set) or an RDD, never both
        PartitionedBroadcast<MatrixBlock> bc1 = _cacheU ? sec.getBroadcastForVariable(input2.getName()) : null;
        PartitionedBroadcast<MatrixBlock> bc2 = _cacheV ? sec.getBroadcastForVariable(input3.getName()) : null;
        JavaPairRDD<MatrixIndexes, MatrixBlock> inU = (!_cacheU) ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
        JavaPairRDD<MatrixIndexes, MatrixBlock> inV = (!_cacheV) ? sec.getBinaryBlockRDDHandleForVariable(input3.getName()) : null;
        // the fourth input is only read as an RDD when the operator has four inputs and it is not a literal
        JavaPairRDD<MatrixIndexes, MatrixBlock> inW = (qop.hasFourInputs() && !_input4.isLiteral()) ? sec.getBinaryBlockRDDHandleForVariable(_input4.getName()) : null;
        // preparation of transposed and replicated U
        if (inU != null)
            inU = inU.flatMapToPair(new ReplicateBlockFunction(clen, bclen, true));
        // preparation of transposed and replicated V
        if (inV != null)
            inV = inV.mapToPair(new TransposeFactorIndexesFunction()).flatMapToPair(new ReplicateBlockFunction(rlen, brlen, false));
        // functions calls w/ two rdd inputs
        if (inU != null && inV == null && inW == null)
            out = in.join(inU).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
        else if (inU == null && inV != null && inW == null)
            out = in.join(inV).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
        else if (inU == null && inV == null && inW != null)
            out = in.join(inW).mapToPair(new RDDQuaternaryFunction2(qop, bc1, bc2));
        else // function calls w/ three rdd inputs
        if (inU != null && inV != null && inW == null)
            out = in.join(inU).join(inV).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
        else if (inU != null && inV == null && inW != null)
            out = in.join(inU).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
        else if (inU == null && inV != null && inW != null)
            out = in.join(inV).join(inW).mapToPair(new RDDQuaternaryFunction3(qop, bc1, bc2));
        else if (inU == null && inV == null && inW == null) {
            // no rdd side inputs at all: fall back to the single-rdd function, without preserving partitioning
            out = in.mapPartitionsToPair(new RDDQuaternaryFunction1(qop, bc1, bc2), false);
        } else
            // function call w/ four rdd inputs
            // need keys in case of wdivmm
            out = in.join(inU).join(inV).join(inW).mapToPair(new RDDQuaternaryFunction4(qop));
        // keep variable names for lineage maintenance
        // NOTE(review): the bodies of the following three ifs are not present in this excerpt.
        if (inU == null)
        if (inV == null)
        if (inW != null)
    // output handling, incl aggregation
    if (// map/redwsloss, map/redwcemm
    qop.wtype1 != null || qop.wtype4 != null) {
        // full aggregate and cast to scalar
        MatrixBlock tmp = RDDAggregateUtils.sumStable(out);
        DoubleObject ret = new DoubleObject(tmp.getValue(0, 0));
        sec.setVariable(output.getName(), ret);
    } else // map/redwsigmoid, map/redwdivmm, map/redwumm
        // aggregation if required (map/redwdivmm)
        if (qop.wtype3 != null && !qop.wtype3.isBasic())
            out = RDDAggregateUtils.sumByKeyStable(out, false);
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        // maintain lineage information for output rdd
        for (String rddVar : rddVars) sec.addLineageRDD(output.getName(), rddVar);
        for (String bcVar : bcVars) sec.addLineageBroadcast(output.getName(), bcVar);
        // update matrix characteristics
        updateOutputMatrixCharacteristics(sec, qop);
Also used : QuaternaryOperator(org.apache.sysml.runtime.matrix.operators.QuaternaryOperator) FilterNonEmptyBlocksFunction(org.apache.sysml.runtime.instructions.spark.functions.FilterNonEmptyBlocksFunction) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) ArrayList(java.util.ArrayList) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) ReplicateBlockFunction(org.apache.sysml.runtime.instructions.spark.functions.ReplicateBlockFunction)

Example 47 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class RandSPInstruction method generateRandData.

/**
 * Generates a random matrix of {@code rows} x {@code cols} and binds it to the output
 * variable.
 *
 * <p>Visible steps: resolve the seed, optionally generate the matrix in memory, build an
 * RDD holding one (block index, seed) pair per output block (either from an in-memory
 * list or from a file), map each pair to a generated block, and set the output matrix
 * characteristics if they are not fully known yet.
 *
 * <p>NOTE(review): this excerpt appears to be missing lines (closing braces, the end of
 * the {@code estimatePartitionedSizeExactSparsity} call, the condition that selects the
 * in-memory vs. file-based seed construction, and the file-writing/closing code);
 * confirm against the upstream source.
 *
 * @param sec spark execution context
 */
private void generateRandData(SparkExecutionContext sec) {
    long lrows = sec.getScalarInput(rows).getLongValue();
    long lcols = sec.getScalarInput(cols).getLongValue();
    // step 1: generate pseudo-random seed (because not specified)
    // seed per invocation
    long lSeed = seed;
    if (lSeed == DataGenOp.UNSPECIFIED_SEED)
        lSeed = DataGenOp.generateRandomSeed();
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");
    // step 2: potential in-memory rand operations if applicable
    // (only when memory suffices and the runtime platform is not forced to SPARK)
    if (isMemAvail(lrows, lcols, sparsity, minValue, maxValue) && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
        RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) lrows, (int) lcols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
        MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);
        sec.setMatrixOutput(output.getName(), mb, getExtendedOpcode());
    // step 3: seed generation
    JavaPairRDD<MatrixIndexes, Long> seedsRDD = null;
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
    // NOTE(review): the remaining argument(s) and closing of this call are not present in this excerpt.
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(lrows, lcols, rowsInBlock, colsInBlock, // overestimate for on disk, ensures hdfs block per partition
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    // temporary characteristics object, used only to derive block counts
    MatrixCharacteristics tmp = new MatrixCharacteristics(lrows, lcols, rowsInBlock, colsInBlock);
    long numBlocks = tmp.getNumBlocks();
    long numColBlocks = tmp.getNumColBlocks();
    // a) in-memory seed rdd construction
    // NOTE(review): the if-condition guarding this branch is not present in this excerpt.
        ArrayList<Tuple2<MatrixIndexes, Long>> seeds = new ArrayList<>();
        for (long i = 0; i < numBlocks; i++) {
            // linear block id -> 1-based (row block, column block) index, row-major
            long r = 1 + i / numColBlocks;
            long c = 1 + i % numColBlocks;
            MatrixIndexes indx = new MatrixIndexes(r, c);
            Long seedForBlock = bigrand.nextLong();
            seeds.add(new Tuple2<>(indx, seedForBlock));
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create seeds rdd
        seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions);
    } else // b) file-based seed rdd construction (for robustness wrt large number of blocks)
        Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
        PrintWriter pw = null;
        try {
            FileSystem fs = IOUtilFunctions.getFileSystem(path);
            pw = new PrintWriter(fs.create(path));
            StringBuilder sb = new StringBuilder();
            for (long i = 0; i < numBlocks; i++) {
                // same 1-based row-major block indexes as in the in-memory branch
                // NOTE(review): separators, the seed value and the write to pw are not visible here.
                sb.append(1 + i / numColBlocks);
                sb.append(1 + i % numColBlocks);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        } finally {
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create seeds rdd
        seedsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).mapToPair(new ExtractSeedTuple());
    // step 4: execute rand instruction over seed input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(lrows, lcols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));
    // step 5: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown(true)) {
        // note: we cannot compute the nnz from sparsity because this would not reflect the
        // actual number of non-zeros, except for extreme values of sparsity equals 0 or 1.
        long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * lrows * lcols) : -1;
        mcOut.set(lrows, lcols, rowsInBlock, colsInBlock, lnnz);
    sec.setRDDHandleForVariable(output.getName(), out);
Also used : RandomMatrixGenerator(org.apache.sysml.runtime.matrix.data.RandomMatrixGenerator) Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) Well1024a(org.apache.commons.math3.random.Well1024a) PrintWriter(java.io.PrintWriter)

Example 48 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class RandSPInstruction method generateSample.

/**
 * Helper function to construct a sample.
 *
 * @param sec spark execution context
 */
private void generateSample(SparkExecutionContext sec) {
    // Overview: draws a sample of `lrows` values from the range up to `maxValue`, in a
    // distributed manner, and binds the result (an lrows x 1 binary-block matrix) to the
    // output variable. Without replacement, the sample must not exceed the population.
    // NOTE(review): this excerpt appears to be missing lines (the end of the while loop
    // and closing braces); confirm against the upstream source.
    long lrows = sec.getScalarInput(rows).getLongValue();
    // NOTE(review): the message below concatenates the `rows` operand rather than the
    // resolved `lrows` value -- confirm it prints the intended sample size.
    if (maxValue < lrows && !replace)
        throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue + ") can only be generated with replacement.");
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + lrows + ", replace=" + replace + ", seed=" + seed);
    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
    // NOTE(review): lrows is narrowed from long to int here -- confirm sample sizes stay within int range.
    double fraction = SamplingUtils.computeFractionForSampleSize((int) lrows, UtilFunctions.toLong(maxValue), replace);
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed);
    // divide the population range across numPartitions by creating SampleTasks
    // (numPartitions = estimated dense size of the lrows x 1 output divided by the hdfs block size, rounded up)
    double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long outputSize = MatrixBlock.estimateSizeDenseInMemory(lrows, 1);
    int numPartitions = (int) Math.ceil((double) outputSize / hdfsBlockSize);
    long partitionSize = (long) Math.ceil(maxValue / numPartitions);
    ArrayList<SampleTask> offsets = new ArrayList<>();
    // one task per sub-range of width partitionSize, starting at 1, each with its own seed
    long st = 1;
    while (st <= maxValue) {
        SampleTask s = new SampleTask();
        s.range_start = st;
        s.seed = bigrand.nextLong();
        st = st + partitionSize;
    // NOTE(review): no visible statement adds `s` to `offsets` or closes the loop above.
    JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
    // Construct the sample in a distributed manner
    JavaRDD<Double> rdd = offsetRDD.flatMap((new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize)));
    // Randomize the sampled elements
    JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values();
    // Trim the sampled list to required size & attach matrix indexes to randomized elements
    JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex().filter(new TrimSample(lrows)).mapToPair(new Double2MatrixCell());
    // output is a single column with lrows rows; nnz is set to lrows as well
    MatrixCharacteristics mcOut = new MatrixCharacteristics(lrows, 1, rowsInBlock, colsInBlock, lrows);
    // Construct BinaryBlock representation
    JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true);
    sec.setRDDHandleForVariable(output.getName(), mbRDD);
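Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Well1024a(org.apache.commons.math3.random.Well1024a)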

Example 49 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class RandSPInstruction method generateSequence.

/**
 * Generates the sequence {@code seq(from, to, incr)} as a single-column matrix and binds
 * it to the output variable.
 *
 * <p>Visible steps: read and validate the scalar inputs, build an RDD with one start
 * offset per row block (either from an in-memory list or from a file), map each offset
 * to a generated block, and set the output matrix characteristics if the dimensions are
 * not known yet.
 *
 * <p>NOTE(review): this excerpt appears to be missing lines (closing braces, the end of
 * the {@code estimatePartitionedSizeExactSparsity} call, the condition that selects the
 * in-memory vs. file-based construction, and the statements that store or write each
 * offset); confirm against the upstream source.
 *
 * @param sec spark execution context
 */
private void generateSequence(SparkExecutionContext sec) {
    double lfrom = sec.getScalarInput(seq_from).getDoubleValue();
    double lto = sec.getScalarInput(seq_to).getDoubleValue();
    double lincr = sec.getScalarInput(seq_incr).getDoubleValue();
    // sanity check valid increment
    // NOTE(review): the message does not state that a zero increment is the cause.
    if (lincr == 0) {
        throw new DMLRuntimeException("ERROR: While performing seq(" + lfrom + "," + lto + "," + lincr + ")");
    // handle default 1 to -1 for special case of from>to
    lincr = LibMatrixDatagen.updateSeqIncr(lfrom, lto, lincr);
    // NOTE(review): the trace text has no '=' after "seqIncr", unlike the other fields.
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction seq with seqFrom=" + lfrom + ", seqTo=" + lto + ", seqIncr" + lincr);
    // step 1: offset generation
    JavaRDD<Double> offsetsRDD = null;
    // sequence length; used below both as the row count and as the nnz of the output
    long nnz = UtilFunctions.getSeqLength(lfrom, lto, lincr);
    // NOTE(review): the remaining argument(s) and closing of this call are not present in this excerpt.
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(nnz, 1, rowsInBlock, colsInBlock, // overestimate for on disk, ensures hdfs block per partition
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    // number of row blocks needed to hold nnz rows
    long numBlocks = (long) Math.ceil(((double) nnz) / rowsInBlock);
    // a) in-memory offset rdd construction
    // NOTE(review): the if-condition guarding this branch is not present in this excerpt.
        ArrayList<Double> offsets = new ArrayList<>();
        for (long i = 0; i < numBlocks; i++) {
            // first sequence value of row block i
            double off = lfrom + lincr * i * rowsInBlock;
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create offset rdd
        offsetsRDD = sec.getSparkContext().parallelize(offsets, numPartitions);
    } else // b) file-based offset rdd construction (for robustness wrt large number of blocks)
        Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
        PrintWriter pw = null;
        try {
            FileSystem fs = IOUtilFunctions.getFileSystem(path);
            pw = new PrintWriter(fs.create(path));
            for (long i = 0; i < numBlocks; i++) {
                // first sequence value of row block i (same formula as the in-memory branch)
                double off = lfrom + lincr * i * rowsInBlock;
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        } finally {
        // for load balancing: degree of parallelism such that ~128MB per partition
        int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
        // create offset rdd
        offsetsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).map(new ExtractOffsetTuple());
    // step 2: execute seq instruction over offset input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = offsetsRDD.mapToPair(new GenerateSequenceBlock(rowsInBlock, lfrom, lto, lincr));
    // step 3: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown()) {
        mcOut.set(nnz, 1, rowsInBlock, colsInBlock, nnz);
    sec.setRDDHandleForVariable(output.getName(), out);
Also used : Path(org.apache.hadoop.fs.Path) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) FileSystem(org.apache.hadoop.fs.FileSystem) PrintWriter(java.io.PrintWriter)

Example 50 with MatrixCharacteristics

use of org.apache.sysml.runtime.matrix.MatrixCharacteristics in project incubator-systemml by apache.

the class ReblockSPInstruction method processInstruction.

/**
 * Reblocks the matrix or frame bound to {@code input1} into the output variable.
 *
 * <p>The output characteristics are set from the input's rows, columns and non-zeros
 * together with {@code brlen}/{@code bclen}. The input format is read from the input's
 * metadata. When {@code Recompiler.checkCPReblock} allows it, an in-memory reblock is
 * executed; the matrix/frame reblock helpers are dispatched on the input's data type.
 *
 * <p>NOTE(review): in this excerpt the in-memory branch has no visible closing brace or
 * return before the distributed reblock dispatch -- lines appear to be missing; confirm
 * against the upstream source.
 *
 * @param ec execution context; cast unconditionally to {@code SparkExecutionContext}
 * @throws DMLRuntimeException if the input object carries no metadata
 */
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // set the output characteristics
    // (brlen/bclen are not declared in this method; presumably instruction fields -- verify)
    CacheableData<?> obj = sec.getCacheableData(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    mcOut.set(mc.getRows(), mc.getCols(), brlen, bclen, mc.getNonZeros());
    // get the source format from the meta data
    MetaDataFormat iimd = (MetaDataFormat) obj.getMetaData();
    if (iimd == null)
        throw new DMLRuntimeException("Error: Metadata not found");
    InputInfo iinfo = iimd.getInputInfo();
    // check for in-memory reblock (w/ lazy spark context, potential for latency reduction)
    if (Recompiler.checkCPReblock(sec, input1.getName())) {
        if (input1.getDataType() == DataType.MATRIX)
            Recompiler.executeInMemoryMatrixReblock(sec, input1.getName(), output.getName());
        else if (input1.getDataType() == DataType.FRAME)
            Recompiler.executeInMemoryFrameReblock(sec, input1.getName(), output.getName());
    // execute matrix/frame reblock
    // (data types other than MATRIX and FRAME are silently ignored here)
    if (input1.getDataType() == DataType.MATRIX)
        processMatrixReblockInstruction(sec, iinfo);
    else if (input1.getDataType() == DataType.FRAME)
        processFrameReblockInstruction(sec, iinfo);
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)


MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)296 MatrixBlock ( DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)89 MatrixIndexes ( TestConfiguration (org.apache.sysml.test.integration.TestConfiguration)50 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)47 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)45 RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM)42 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)37 CellIndex ( IOException ( FrameBlock ( JavaPairRDD ( RDDObject ( ArrayList (java.util.ArrayList)19 ValueType (org.apache.sysml.parser.Expression.ValueType)19 Path (org.apache.hadoop.fs.Path)17 LongWritable ( Test (org.junit.Test)15 Text (