Search in sources :

Example 11 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

In the class RandSPInstruction, the method generateRandData.

/**
 * Generates the random matrix for this instruction.
 * <p>
 * If {@code isMemAvail(...)} returns true and the runtime platform is not SPARK, the matrix is generated
 * as a single in-memory block and stored as matrix output. Otherwise, an RDD of per-block
 * (seed, nnz) tuples is constructed (from an in-memory list or from a seed file, depending on the
 * number of blocks) and mapped to matrix blocks via {@code GenerateRandomBlock}.
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if the seed file cannot be created or written
 */
private void generateRandData(SparkExecutionContext sec) throws DMLRuntimeException {
    //step 1: generate pseudo-random seed per invocation (if not specified)
    long lSeed = seed;
    if (lSeed == DataGenOp.UNSPECIFIED_SEED)
        lSeed = DataGenOp.generateRandomSeed();
    if (LOG.isTraceEnabled())
        LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + ".");
    //step 2: potential in-memory rand operations if applicable
    if (isMemAvail(rows, cols, sparsity, minValue, maxValue) && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) {
        RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows, (int) cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams);
        MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed);
        sec.setMatrixOutput(output.getName(), mb);
        //executed in memory, hence not counted as a spark instruction
        Statistics.decrementNoOfExecutedSPInst();
        return;
    }
    //step 3: seed generation
    JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD;
    Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed);
    LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity);
    PrimitiveIterator.OfLong nnzIter = nnz.iterator();
    //overestimate for on disk, ensures hdfs block per partition
    double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(rows, cols, rowsInBlock, colsInBlock, rows * cols * sparsity);
    double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long numBlocks = new MatrixCharacteristics(rows, cols, rowsInBlock, colsInBlock).getNumBlocks();
    long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock);
    //for load balancing: degree of parallelism such that each partition holds roughly
    //one hdfs block, bounded by the number of blocks and at least 1
    int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1);
    if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) {
        //a) in-memory seed rdd construction
        ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<>();
        for (long i = 0; i < numBlocks; i++) {
            //1-based block indexes in row-major order
            long r = 1 + i / numColBlocks;
            long c = 1 + i % numColBlocks;
            MatrixIndexes indx = new MatrixIndexes(r, c);
            long seedForBlock = bigrand.nextLong();
            seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx, new Tuple2<Long, Long>(seedForBlock, nnzIter.nextLong())));
        }
        //create seeds rdd
        seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions);
    } else {
        //b) file-based seed rdd construction (for robustness wrt large number of blocks)
        Path path = new Path(LibMatrixDatagen.generateUniqueSeedPath(dir));
        PrintWriter pw = null;
        try {
            FileSystem fs = IOUtilFunctions.getFileSystem(path);
            pw = new PrintWriter(fs.create(path));
            StringBuilder sb = new StringBuilder();
            for (long i = 0; i < numBlocks; i++) {
                //one line per block: row index, column index, seed, nnz
                sb.append(1 + i / numColBlocks);
                sb.append(',');
                sb.append(1 + i % numColBlocks);
                sb.append(',');
                sb.append(bigrand.nextLong());
                sb.append(',');
                sb.append(nnzIter.nextLong());
                pw.println(sb.toString());
                sb.setLength(0);
            }
            //PrintWriter never throws IOException on write; without this check a failed
            //write would silently produce a truncated seed file and hence missing blocks
            if (pw.checkError())
                throw new DMLRuntimeException("Failed to write rand seed file: " + path);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        } finally {
            IOUtilFunctions.closeSilently(pw);
        }
        //create seeds rdd
        seedsRDD = sec.getSparkContext().textFile(path.toString(), numPartitions).mapToPair(new ExtractSeedTuple());
    }
    //step 4: execute rand instruction over seed input
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams));
    //step 5: output handling
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (!mcOut.dimsKnown(true)) {
        //note: we cannot compute the nnz from sparsity because this would not reflect the
        //actual number of non-zeros, except for extreme values of sparsity equals 0 or 1.
        long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1;
        mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz);
    }
    sec.setRDDHandleForVariable(output.getName(), out);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) PrintWriter(java.io.PrintWriter) RandomMatrixGenerator(org.apache.sysml.runtime.matrix.data.RandomMatrixGenerator) Path(org.apache.hadoop.fs.Path) PrimitiveIterator(java.util.PrimitiveIterator) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) LongStream(java.util.stream.LongStream) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) Tuple2(scala.Tuple2) Well1024a(org.apache.commons.math3.random.Well1024a)

Example 12 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

In the class RandSPInstruction, the method generateSample.

/**
 * Builds a sample of {@code rows} values drawn from the range up to {@code maxValue} and registers
 * the resulting binary-block RDD as the output variable.
 *
 * @param sec spark execution context
 * @throws DMLRuntimeException if the sample is larger than the population and replacement is disabled
 */
private void generateSample(SparkExecutionContext sec) throws DMLRuntimeException {
    if (!replace && maxValue < rows) {
        throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue + ") can only be generated with replacement.");
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + rows + ", replace=" + replace + ", seed=" + seed);
    }

    // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
    double fraction = SamplingUtils.computeFractionForSampleSize((int) rows, UtilFunctions.toLong(maxValue), replace);
    Well1024a seedGenerator = LibMatrixDatagen.setupSeedsForRand(seed);

    // number of partitions: estimated dense output size relative to the hdfs block size
    double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize();
    long outputSize = MatrixBlock.estimateSizeDenseInMemory(rows, 1);
    int numPartitions = (int) Math.ceil((double) outputSize / hdfsBlockSize);
    long partitionSize = (long) Math.ceil(maxValue / numPartitions);

    // one SampleTask per sub-range of the population, each with its own seed
    ArrayList<SampleTask> tasks = new ArrayList<SampleTask>();
    for (long rangeStart = 1; rangeStart <= maxValue; rangeStart += partitionSize) {
        SampleTask task = new SampleTask();
        task.range_start = rangeStart;
        task.seed = seedGenerator.nextLong();
        tasks.add(task);
    }
    JavaRDD<SampleTask> taskRDD = sec.getSparkContext().parallelize(tasks, numPartitions);

    // construct the sample in a distributed manner
    JavaRDD<Double> sampled = taskRDD.flatMap(new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize));

    // randomize the sampled elements by sorting on an attached random key
    JavaRDD<Double> shuffled = sampled
        .mapToPair(new AttachRandom())
        .sortByKey()
        .values();

    // trim the sampled list to the required size and attach matrix indexes
    JavaPairRDD<MatrixIndexes, MatrixCell> cells = shuffled
        .zipWithIndex()
        .filter(new TrimSample(rows))
        .mapToPair(new Double2MatrixCell());

    // construct the binary-block representation of the (rows x 1) result
    MatrixCharacteristics sampleDims = new MatrixCharacteristics(rows, 1, rowsInBlock, colsInBlock, rows);
    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = RDDConverterUtils.binaryCellToBinaryBlock(sec.getSparkContext(), cells, sampleDims, true);

    // update output characteristics and register the rdd handle
    MatrixCharacteristics outputDims = sec.getMatrixCharacteristics(output.getName());
    outputDims.setNonZeros(rows);
    sec.setRDDHandleForVariable(output.getName(), blocks);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Well1024a(org.apache.commons.math3.random.Well1024a)

Example 13 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

In the class TernarySPInstruction, the method processInstruction.

/**
 * Executes the ctable (ternary) instruction on Spark.
 * <p>
 * Depending on the ctable operation type derived from the input data types, the inputs are
 * combined into per-block ctable maps (or, for the expand case with weight 1, directly into
 * cells), aggregated into (index, value) cells, optionally filtered to the given output
 * dimensions, and registered as a binary-cell RDD for the output variable.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext}
 * @throws DMLRuntimeException on an invalid ctable operation or if exactly one output dimension is -1
 */
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    //get input rdd handle
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = null;
    JavaPairRDD<MatrixIndexes, MatrixBlock> in3 = null;
    // scalar inputs default to -1 and are only read for the operation types that use them
    double scalar_input2 = -1, scalar_input3 = -1;
    Ternary.OperationTypes ctableOp = Ternary.findCtableOperationByInputDataTypes(input1.getDataType(), input2.getDataType(), input3.getDataType());
    // the expand flag overrides the operation type derived from the input data types
    ctableOp = _isExpand ? Ternary.OperationTypes.CTABLE_EXPAND_SCALAR_WEIGHT : ctableOp;
    MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    // First get the block sizes and then set them as -1 to allow for binary cell reblock
    int brlen = mc1.getRowsPerBlock();
    int bclen = mc1.getColsPerBlock();
    JavaPairRDD<MatrixIndexes, ArrayList<MatrixBlock>> inputMBs = null;
    JavaPairRDD<MatrixIndexes, CTableMap> ctables = null;
    JavaPairRDD<MatrixIndexes, Double> bincellsNoFilter = null;
    // track which of input2/input3 were consumed as RDDs, for lineage registration at the end
    boolean setLineage2 = false;
    boolean setLineage3 = false;
    switch(ctableOp) {
        case //(VECTOR)
        CTABLE_TRANSFORM:
            // F=ctable(A,B,W) 
            in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
            in3 = sec.getBinaryBlockRDDHandleForVariable(input3.getName());
            setLineage2 = true;
            setLineage3 = true;
            inputMBs = in1.cogroup(in2).cogroup(in3).mapToPair(new MapThreeMBIterableIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        case //(VECTOR)
        CTABLE_EXPAND_SCALAR_WEIGHT:
            // F = ctable(seq,A) or F = ctable(seq,B,1)
            scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
            if (scalar_input3 == 1) {
                // weight 1: produce cells directly from input2, bypassing the ctable maps
                in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
                setLineage2 = true;
                bincellsNoFilter = in2.flatMapToPair(new ExpandScalarCtableOperation(brlen));
                break;
            }
            // no break: for any other weight, control falls through to the
            // CTABLE_TRANSFORM_SCALAR_WEIGHT handling below (ctableOp itself is not changed)
        case //(VECTOR/MATRIX)
        CTABLE_TRANSFORM_SCALAR_WEIGHT:
            // F = ctable(A,B) or F = ctable(A,B,1)
            in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
            setLineage2 = true;
            scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
            inputMBs = in1.cogroup(in2).mapToPair(new MapTwoMBIterableIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        case //(VECTOR)
        CTABLE_TRANSFORM_HISTOGRAM:
            // F=ctable(A,1) or F = ctable(A,1,1)
            scalar_input2 = sec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getDoubleValue();
            scalar_input3 = sec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral()).getDoubleValue();
            inputMBs = in1.mapToPair(new MapMBIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        case //(VECTOR)
        CTABLE_TRANSFORM_WEIGHTED_HISTOGRAM:
            // F=ctable(A,1,W)
            in3 = sec.getBinaryBlockRDDHandleForVariable(input3.getName());
            setLineage3 = true;
            scalar_input2 = sec.getScalarInput(input2.getName(), input2.getValueType(), input2.isLiteral()).getDoubleValue();
            inputMBs = in1.cogroup(in3).mapToPair(new MapTwoMBIterableIntoAL());
            ctables = inputMBs.mapToPair(new PerformCTableMapSideOperation(ctableOp, scalar_input2, scalar_input3, this.instString, (SimpleOperator) _optr, _ignoreZeros));
            break;
        default:
            throw new DMLRuntimeException("Encountered an invalid ctable operation (" + ctableOp + ") while executing instruction: " + this.toString());
    }
    // Now perform aggregation on ctables to get binaryCells 
    // (exactly one of ctables / bincellsNoFilter is expected to be set by the switch above)
    if (bincellsNoFilter == null && ctables != null) {
        bincellsNoFilter = ctables.values().flatMapToPair(new ExtractBinaryCellsFromCTable());
        bincellsNoFilter = RDDAggregateUtils.sumCellsByKeyStable(bincellsNoFilter);
    } else if (!(bincellsNoFilter != null && ctables == null)) {
        // both set or both null
        throw new DMLRuntimeException("Incorrect ctable operation");
    }
    // handle known/unknown dimensions
    // (literal dimensions are parsed from the instruction, otherwise read as scalar variables)
    long outputDim1 = (_dim1Literal ? (long) Double.parseDouble(_outDim1) : (sec.getScalarInput(_outDim1, ValueType.DOUBLE, false)).getLongValue());
    long outputDim2 = (_dim2Literal ? (long) Double.parseDouble(_outDim2) : (sec.getScalarInput(_outDim2, ValueType.DOUBLE, false)).getLongValue());
    MatrixCharacteristics mcBinaryCells = null;
    // dimensions are computed from the data only if both are given as -1
    boolean findDimensions = (outputDim1 == -1 && outputDim2 == -1);
    if (!findDimensions) {
        // exactly one dimension being -1 is rejected
        if ((outputDim1 == -1 && outputDim2 != -1) || (outputDim1 != -1 && outputDim2 == -1))
            throw new DMLRuntimeException("Incorrect output dimensions passed to TernarySPInstruction:" + outputDim1 + " " + outputDim2);
        else
            mcBinaryCells = new MatrixCharacteristics(outputDim1, outputDim2, brlen, bclen);
        // filtering according to given dimensions
        bincellsNoFilter = bincellsNoFilter.filter(new FilterCells(mcBinaryCells.getRows(), mcBinaryCells.getCols()));
    }
    // convert double values to matrix cell
    JavaPairRDD<MatrixIndexes, MatrixCell> binaryCells = bincellsNoFilter.mapToPair(new ConvertToBinaryCell());
    // find dimensions if necessary (w/ cache for reblock)
    if (findDimensions) {
        binaryCells = SparkUtils.cacheBinaryCellRDD(binaryCells);
        mcBinaryCells = SparkUtils.computeMatrixCharacteristics(binaryCells);
    }
    //store output rdd handle
    sec.setRDDHandleForVariable(output.getName(), binaryCells);
    mcOut.set(mcBinaryCells);
    // Since we are outputing binary cells, we set block sizes = -1
    mcOut.setRowsPerBlock(-1);
    mcOut.setColsPerBlock(-1);
    // register lineage for input1 and for those of input2/input3 that were read as RDDs
    sec.addLineageRDD(output.getName(), input1.getName());
    if (setLineage2)
        sec.addLineageRDD(output.getName(), input2.getName());
    if (setLineage3)
        sec.addLineageRDD(output.getName(), input3.getName());
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) ArrayList(java.util.ArrayList) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) Ternary(org.apache.sysml.lops.Ternary) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CTableMap(org.apache.sysml.runtime.matrix.data.CTableMap)

Example 14 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

In the class Tsmm2SPInstruction, the method processInstruction.

/**
 * Executes the tsmm2 instruction in two passes over the input: the first pass collects and
 * broadcasts the blocks beyond the first block row/column, the second pass computes and
 * aggregates the result blocks.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext}
 * @throws DMLRuntimeException if a called runtime operation fails
 */
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String inputName = input1.getName();
    String outputName = output.getName();

    //get input rdd and its characteristics
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(inputName);
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(inputName);
    boolean left = _type.isLeft();

    //step 1: first pass of X, filter-collect-broadcast excess blocks
    //(left: all rows, columns after the first column block; otherwise: rows after the first row block, all columns)
    int firstRow = left ? 1 : mc.getRowsPerBlock() + 1;
    int firstCol = left ? mc.getColsPerBlock() + 1 : 1;
    JavaPairRDD<MatrixIndexes, MatrixBlock> excess = in
        .filter(new IsBlockInRange(firstRow, mc.getRows(), firstCol, mc.getCols(), mc))
        .mapToPair(new ShiftTSMMIndexesFunction(_type));
    int excessRows = (int) (left ? mc.getRows() : mc.getRows() - mc.getRowsPerBlock());
    int excessCols = (int) (left ? mc.getCols() - mc.getColsPerBlock() : mc.getCols());
    PartitionedBlock<MatrixBlock> partitioned = SparkExecutionContext.toPartitionedMatrixBlock(excess, excessRows, excessCols, mc.getRowsPerBlock(), mc.getColsPerBlock(), -1L);
    Broadcast<PartitionedBlock<MatrixBlock>> broadcastExcess = sec.getSparkContext().broadcast(partitioned);

    //step 2: second pass of X, compute tsmm/mapmm and aggregate result blocks
    int outputDim = (int) (left ? mc.getCols() : mc.getRows());
    if (OptimizerUtils.estimateSize(outputDim, outputDim) <= 32 * 1024 * 1024) {
        //default: <=32MB
        //output large blocks and reduceAll to avoid skew on combineByKey
        JavaRDD<MatrixBlock> partials = in.map(new RDDTSMM2ExtFunction(broadcastExcess, _type, outputDim, (int) mc.getRowsPerBlock()));
        MatrixBlock result = RDDAggregateUtils.sumStable(partials);
        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(outputName, result);
    } else {
        //output individual output blocks and aggregate by key (no action)
        JavaPairRDD<MatrixIndexes, MatrixBlock> partials = in.flatMapToPair(new RDDTSMM2Function(broadcastExcess, _type));
        JavaPairRDD<MatrixIndexes, MatrixBlock> result = RDDAggregateUtils.sumByKeyStable(partials, false);
        //put output RDD handle into symbol table
        MatrixCharacteristics outputDims = sec.getMatrixCharacteristics(outputName);
        outputDims.set(outputDim, outputDim, mc.getRowsPerBlock(), mc.getColsPerBlock());
        sec.setRDDHandleForVariable(outputName, result);
        sec.addLineageRDD(outputName, inputName);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) IsBlockInRange(org.apache.sysml.runtime.instructions.spark.functions.IsBlockInRange) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 15 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

In the class TsmmSPInstruction, the method processInstruction.

/**
 * Executes the tsmm instruction: maps every input block through {@code RDDTSMMFunction},
 * sums the partial results into one block, and stores it as matrix output.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext}
 * @throws DMLRuntimeException if a called runtime operation fails
 */
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sparkCtx = (SparkExecutionContext) ec;

    //get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> inputBlocks = sparkCtx.getBinaryBlockRDDHandleForVariable(input1.getName());

    //execute tsmm instruction (always produce exactly one output block)
    //(this formulation with values() requires --conf spark.driver.maxResultSize=0)
    JavaRDD<MatrixBlock> partials = inputBlocks.map(new RDDTSMMFunction(_type));
    MatrixBlock result = RDDAggregateUtils.sumStable(partials);

    //put output block into symbol table (no lineage because single block)
    //this also includes implicit maintenance of matrix characteristics
    sparkCtx.setMatrixOutput(output.getName(), result);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Aggregations

MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)393 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)121 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)105 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)87 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)44 IOException (java.io.IOException)43 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)38 CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock)34 ArrayList (java.util.ArrayList)33 Path (org.apache.hadoop.fs.Path)25 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)23 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)21 FileSystem (org.apache.hadoop.fs.FileSystem)20 JobConf (org.apache.hadoop.mapred.JobConf)17 Tuple2 (scala.Tuple2)17 SequenceFile (org.apache.hadoop.io.SequenceFile)14 MatrixReader (org.apache.sysml.runtime.io.MatrixReader)14 TestConfiguration (org.apache.sysml.test.integration.TestConfiguration)13 RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM)12 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)12