Search in sources :

Example 46 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class RmmSPInstruction method processInstruction.

/**
 * Executes the Spark RMM matrix multiplication of the two binary-block inputs
 * and binds the resulting (lazily evaluated) RDD to the output variable.
 *
 * @param ec execution context; it is cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
    final SparkExecutionContext sec = (SparkExecutionContext) ec;

    // fetch characteristics and block RDDs of both operands, then derive the output characteristics
    MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
    MatrixCharacteristics mc2 = sec.getMatrixCharacteristics(input2.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
    MatrixCharacteristics mcOut = updateBinaryMMOutputMatrixCharacteristics(sec, true);

    // step 1: prepare join keys (w/ shallow replication), i/j/k;
    // the left input is replicated along the columns of the right input and vice versa
    RmmReplicateFunction replicateLeft = new RmmReplicateFunction(mc2.getCols(), mc2.getColsPerBlock(), true);
    JavaPairRDD<TripleIndexes, MatrixBlock> tmp1 = in1.flatMapToPair(replicateLeft);
    RmmReplicateFunction replicateRight = new RmmReplicateFunction(mc1.getRows(), mc1.getRowsPerBlock(), false);
    JavaPairRDD<TripleIndexes, MatrixBlock> tmp2 = in2.flatMapToPair(replicateRight);

    // step 2: join the prepared datasets on the triple index and multiply the matched block pairs
    int numPartJoin = Math.max(getNumJoinPartitions(mc1, mc2), SparkExecutionContext.getDefaultParallelism(true));
    int numPartOut = SparkUtils.getNumPreferredPartitions(mcOut);
    JavaPairRDD<MatrixIndexes, MatrixBlock> products = tmp1
        .join(tmp2, numPartJoin)
        .mapToPair(new RmmMultiplyFunction());

    // step 3: sum up all partial products that belong to the same result block
    JavaPairRDD<MatrixIndexes, MatrixBlock> result = RDDAggregateUtils.sumByKeyStable(products, numPartOut, false);

    // bind the output RDD and record its lineage to both inputs
    sec.setRDDHandleForVariable(output.getName(), result);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) TripleIndexes(org.apache.sysml.runtime.matrix.data.TripleIndexes) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 47 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class TernarySPInstruction method processInstruction.

/**
 * Executes a ternary operation on Spark where each of the three operands is
 * either a binary-block matrix (RDD) or a scalar.
 *
 * <p>Scalar operands are wrapped into a {@code MatrixBlock} built from their
 * double value; matrix operands are joined by block index when more than one
 * of them is present. The result RDD is bound to the output variable and its
 * lineage is registered for every matrix operand.
 *
 * @param ec execution context; it is cast to {@code SparkExecutionContext}
 * @throws IllegalStateException if none of the three operands is a matrix
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // classify the operands once; the flags drive both input retrieval and dispatch
    final boolean mat1 = input1.isMatrix();
    final boolean mat2 = input2.isMatrix();
    final boolean mat3 = input3.isMatrix();

    // Without this guard the all-scalar case would fall through to the
    // "all matrices" branch below and fail with a message-less NullPointerException.
    if (!mat1 && !mat2 && !mat3) {
        throw new IllegalStateException("Ternary Spark instruction requires at least one matrix input, but all three inputs are scalars.");
    }

    // matrix operands: RDD handles (null for scalar operands)
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = mat1 ? sec.getBinaryBlockRDDHandleForVariable(input1.getName()) : null;
    JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = mat2 ? sec.getBinaryBlockRDDHandleForVariable(input2.getName()) : null;
    JavaPairRDD<MatrixIndexes, MatrixBlock> in3 = mat3 ? sec.getBinaryBlockRDDHandleForVariable(input3.getName()) : null;

    // scalar operands: blocks created from the scalar's double value (null for matrix operands)
    MatrixBlock m1 = mat1 ? null : new MatrixBlock(ec.getScalarInput(input1).getDoubleValue());
    MatrixBlock m2 = mat2 ? null : new MatrixBlock(ec.getScalarInput(input2).getDoubleValue());
    MatrixBlock m3 = mat3 ? null : new MatrixBlock(ec.getScalarInput(input3).getDoubleValue());

    TernaryOperator op = (TernaryOperator) _optr;
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;

    // dispatch on the matrix (M) / scalar (S) pattern of the three operands
    if (mat1 && !mat2 && !mat3) {
        out = in1.mapValues(new TernaryFunctionMSS(op, m1, m2, m3));
    } else if (!mat1 && mat2 && !mat3) {
        out = in2.mapValues(new TernaryFunctionSMS(op, m1, m2, m3));
    } else if (!mat1 && !mat2 && mat3) {
        out = in3.mapValues(new TernaryFunctionSSM(op, m1, m2, m3));
    } else if (mat1 && mat2 && !mat3) {
        out = in1.join(in2).mapValues(new TernaryFunctionMMS(op, m1, m2, m3));
    } else if (mat1 && !mat2 && mat3) {
        out = in1.join(in3).mapValues(new TernaryFunctionMSM(op, m1, m2, m3));
    } else if (!mat1 && mat2 && mat3) {
        out = in2.join(in3).mapValues(new TernaryFunctionSMM(op, m1, m2, m3));
    } else {
        // all matrices (the all-scalar case was rejected above)
        out = in1.join(in2).join(in3).mapValues(new TernaryFunctionMMM(op, m1, m2, m3));
    }

    // set output RDD and register lineage for every matrix operand
    updateTernaryOutputMatrixCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    if (mat1) {
        sec.addLineageRDD(output.getName(), input1.getName());
    }
    if (mat2) {
        sec.addLineageRDD(output.getName(), input2.getName());
    }
    if (mat3) {
        sec.addLineageRDD(output.getName(), input3.getName());
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) TernaryOperator(org.apache.sysml.runtime.matrix.operators.TernaryOperator) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 48 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class Tsmm2SPInstruction method processInstruction.

/**
 * Executes the Spark tsmm2 instruction in two passes over the input:
 * the first pass filters, collects and broadcasts the excess blocks; the second
 * pass computes the per-block contributions against that broadcast and aggregates them.
 *
 * <p>If the estimated size of the square output is at most 32MB, the result is
 * aggregated into a single in-memory block; otherwise it stays a distributed RDD.
 *
 * @param ec execution context; it is cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    final boolean left = _type.isLeft();

    // step 1: first pass of X, filter-collect-broadcast excess blocks
    // (left: everything beyond the first column block; right: everything beyond the first row block)
    IsBlockInRange excessFilter = new IsBlockInRange(
        left ? 1 : mc.getRowsPerBlock() + 1, mc.getRows(),
        left ? mc.getColsPerBlock() + 1 : 1, mc.getCols(), mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> excess = in
        .filter(excessFilter)
        .mapToPair(new ShiftTSMMIndexesFunction(_type));
    int excessRows = (int) (left ? mc.getRows() : mc.getRows() - mc.getRowsPerBlock());
    int excessCols = (int) (left ? mc.getCols() - mc.getColsPerBlock() : mc.getCols());
    PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(
        excess, excessRows, excessCols, mc.getRowsPerBlock(), mc.getColsPerBlock(), -1L);
    Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);

    // step 2: second pass of X, compute tsmm/mapmm and aggregate result blocks
    int outputDim = (int) (left ? mc.getCols() : mc.getRows());
    if (OptimizerUtils.estimateSize(outputDim, outputDim) > 32 * 1024 * 1024) {
        // large output (>32MB): emit individual output blocks and aggregate by key (no action)
        JavaPairRDD<MatrixIndexes, MatrixBlock> partials = in.flatMapToPair(new RDDTSMM2Function(bpmb, _type));
        JavaPairRDD<MatrixIndexes, MatrixBlock> result = RDDAggregateUtils.sumByKeyStable(partials, false);

        // put output RDD handle into symbol table, with explicitly set output characteristics
        sec.getMatrixCharacteristics(output.getName()).set(outputDim, outputDim, mc.getRowsPerBlock(), mc.getColsPerBlock());
        sec.setRDDHandleForVariable(output.getName(), result);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else {
        // default (<=32MB): output large blocks and reduceAll to avoid skew on combineByKey
        JavaRDD<MatrixBlock> partials = in.map(new RDDTSMM2ExtFunction(bpmb, _type, outputDim, (int) mc.getRowsPerBlock()));
        MatrixBlock result = RDDAggregateUtils.sumStable(partials);

        // put output block into symbol table (no lineage because single block);
        // this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), result, getExtendedOpcode());
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) IsBlockInRange(org.apache.sysml.runtime.instructions.spark.functions.IsBlockInRange) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 49 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class TsmmSPInstruction method processInstruction.

/**
 * Executes the Spark tsmm instruction: every input block is mapped through
 * {@code RDDTSMMFunction} and the per-block results are summed into one output block.
 *
 * @param ec execution context; it is cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
    final SparkExecutionContext sec = (SparkExecutionContext) ec;

    // input blocks of the single matrix operand
    JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = sec.getBinaryBlockRDDHandleForVariable(input1.getName());

    // tsmm per block, then aggregate (always produces exactly one output block);
    // per the original note, this formulation requires --conf spark.driver.maxResultSize=0
    JavaRDD<MatrixBlock> partials = blocks.map(new RDDTSMMFunction(_type));
    MatrixBlock result = RDDAggregateUtils.sumStable(partials);

    // bind the single result block (no lineage because single block);
    // this also includes implicit maintenance of matrix characteristics
    sec.setMatrixOutput(output.getName(), result, getExtendedOpcode());
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 50 with MatrixBlock

use of org.apache.sysml.runtime.matrix.data.MatrixBlock in project incubator-systemml by apache.

the class WriteSPInstruction method processMatrixWriteInstruction.

/**
 * Writes the binary-block matrix bound to {@code input1} to {@code fname} in the
 * requested output format and then writes the accompanying {@code .mtd} meta data file.
 *
 * <p>If the number of non-zeros (nnz) of the matrix is not known, it is computed
 * as a side effect of the write via a Spark accumulator and stored in the matrix
 * characteristics afterwards. MatrixMarket output is the exception: its header
 * contains the nnz and is built before the data is written, so an unknown nnz is
 * computed eagerly in an additional pass over the input.
 *
 * @param sec   Spark execution context providing the input RDD and matrix characteristics
 * @param fname target file name
 * @param oi    output format; MatrixMarket, text cell, CSV and binary block are supported
 * @throws IOException if a CSV write of a matrix with zero rows or columns is requested
 *                     (other I/O failures of the called helpers are not visible here)
 * @throws DMLRuntimeException if the output format is not supported
 */
protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi) throws IOException {
    // get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    if (oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo) {
        // The MatrixMarket header below embeds the nnz. The piggybacked accumulator is
        // only populated once the write has executed, which is too late for the header,
        // so an unknown nnz is resolved up front with a dedicated pass.
        if (oi == OutputInfo.MatrixMarketOutputInfo && !mc.nnzKnown()) {
            LongAccumulator headerNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1.mapValues(new ComputeBinaryBlockNnzFunction(headerNnz)).count();
            mc.setNonZeros(headerNnz.value());
        }
        // piggyback nnz maintenance on write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        JavaRDD<String> header = null;
        if (oi == OutputInfo.MatrixMarketOutputInfo) {
            ArrayList<String> headerContainer = new ArrayList<>(1);
            // First output MM header
            String headerStr = "%%MatrixMarket matrix coordinate real general\n" + // output number of rows, number of columns and number of nnz
            mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
            headerContainer.add(headerStr);
            header = sec.getSparkContext().parallelize(headerContainer);
        }
        JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
        if (header != null) {
            customSaveTextFile(header.union(ijv), fname, true);
        } else {
            customSaveTextFile(ijv, fname, false);
        }
        // the accumulator exists exactly when nnz was unknown before the write
        if (aNnz != null) {
            mc.setNonZeros(aNnz.value());
        }
    } else if (oi == OutputInfo.CSVOutputInfo) {
        if (mc.getRows() == 0 || mc.getCols() == 0) {
            throw new IOException("Write of matrices with zero rows or columns" + " not supported (" + mc.getRows() + "x" + mc.getCols() + ").");
        }
        LongAccumulator aNnz = null;
        // piggyback nnz computation on actual write
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        JavaRDD<String> out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
        customSaveTextFile(out, fname, false);
        if (aNnz != null) {
            mc.setNonZeros(aNnz.value());
        }
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        // piggyback nnz computation on actual write
        LongAccumulator aNnz = null;
        if (!mc.nnzKnown()) {
            aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
            in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
        }
        // save binary block rdd on hdfs
        in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        if (aNnz != null) {
            mc.setNonZeros(aNnz.value());
        }
    } else {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }
    // write meta data file (after the write so that a piggybacked nnz is already set in mc)
    MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ComputeBinaryBlockNnzFunction(org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JavaRDD(org.apache.spark.api.java.JavaRDD) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LongAccumulator(org.apache.spark.util.LongAccumulator)

Aggregations

MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)459 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)142 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)111 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)102 CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock)48 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)48 IOException (java.io.IOException)44 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)41 ArrayList (java.util.ArrayList)40 Path (org.apache.hadoop.fs.Path)29 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)24 FileSystem (org.apache.hadoop.fs.FileSystem)23 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)23 JobConf (org.apache.hadoop.mapred.JobConf)21 Tuple2 (scala.Tuple2)19 SequenceFile (org.apache.hadoop.io.SequenceFile)17 Row (org.apache.spark.sql.Row)14 SparseBlock (org.apache.sysml.runtime.matrix.data.SparseBlock)14 TestConfiguration (org.apache.sysml.test.integration.TestConfiguration)14 IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)13