Example 11 with MatrixObject

Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

The class DataTransform, method spDataTransform.

public static void spDataTransform(ParameterizedBuiltinSPInstruction inst, FrameObject[] inputs, MatrixObject[] outputs, ExecutionContext ec) throws Exception {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(inst.getParams(), inputs[0]);
    JobConf job = new JobConf();
    FileSystem fs = IOUtilFunctions.getFileSystem(inputs[0].getFileName());
    checkIfOutputOverlapsWithTxMtd(oprnds.txMtdPath, outputs[0].getFileName(), fs);
    // find the first file in alphabetical ordering of partfiles in directory inputPath 
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
    // find column names and construct output header
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    int numColumns = colNamesToIds.size();
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();
    // Construct RDD for input data
    @SuppressWarnings("unchecked") JavaPairRDD<LongWritable, Text> inputData = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForFrameObject(inputs[0], InputInfo.CSVInputInfo);
    JavaRDD<Tuple2<LongWritable, Text>> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
    long numRowsTf = 0, numColumnsTf = 0;
    JavaPairRDD<Long, String> tfPairRDD = null;
    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, partOffsetsFile, oprnds.inputCSVProperties, numColumns, outHeader);
        // store the specFileWithIDs as transformation metadata
        MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    } else {
        // enable GC on colNamesToIds
        colNamesToIds = null;
        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
        // path to specification file
        String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
        // Apply transformation metadata, and perform actual transformation 
        tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath, specWithIDs, tmpPath, oprnds.inputCSVProperties, numColumns, outHeader);
    }
    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
    // convert to csv output format (serialized longwritable/text)
    JavaPairRDD<LongWritable, Text> outtfPairRDD = RDDConverterUtils.stringToSerializableText(tfPairRDD);
    if (outtfPairRDD != null) {
        MatrixObject outMO = outputs[0];
        String outVar = outMO.getVarName();
        outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
        sec.addLineageRDD(outVar, inst.getParams().get("target"));
        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
        mcOut.setDimension(numRowsTf, numColumnsTf);
        mcOut.setNonZeros(-1);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Text(org.apache.hadoop.io.Text) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf)
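
For context, the `transform` opcode in ParameterizedBuiltinSPInstruction (Example 13 below) is the call site of this method. A minimal call-site sketch, with the frame input and the matrix output wrapped into single-element arrays as the signature requires:

// Call-site sketch (mirrors the 'transform' branch of Example 13): wrap the
// target frame and the output matrix into single-element arrays and delegate.
try {
    DataTransform.spDataTransform(this,
        new FrameObject[] { sec.getFrameObject(params.get("target")) },
        new MatrixObject[] { sec.getMatrixObject(output.getName()) }, ec);
} catch (Exception e) {
    throw new DMLRuntimeException(e);
}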

Example 12 with MatrixObject

Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

The class OptimizerRuleBased, method rewriteRemoveUnnecessaryCompareMatrix.

///////
//REWRITE remove compare matrix (for result merge, needs to be invoked before setting result merge)
///
protected void rewriteRemoveUnnecessaryCompareMatrix(OptNode n, ExecutionContext ec) throws DMLRuntimeException {
    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping().getMappedProg(n.getID())[1];
    ArrayList<String> cleanedVars = new ArrayList<String>();
    ArrayList<String> resultVars = pfpb.getResultVariables();
    String itervar = pfpb.getIterablePredicateVars()[0];
    for (String rvar : resultVars) {
        Data dat = ec.getVariable(rvar);
        if (dat instanceof MatrixObject && ((MatrixObject) dat).getNnz() != 0 //subject to result merge with compare
                && n.hasOnlySimpleChilds() //guaranteed no conditional indexing
                && rContainsResultFullReplace(n, rvar, itervar, (MatrixObject) dat) //guaranteed full matrix replace
                //&& !pfsb.variablesRead().containsVariable(rvar) //never read variable in loop body
                && !rIsReadInRightIndexing(n, rvar) //never read variable in loop body
                && ((MatrixObject) dat).getNumRows() <= Integer.MAX_VALUE && ((MatrixObject) dat).getNumColumns() <= Integer.MAX_VALUE) {
            //replace existing matrix object with empty matrix
            MatrixObject mo = (MatrixObject) dat;
            ec.cleanupMatrixObject(mo);
            ec.setMatrixOutput(rvar, new MatrixBlock((int) mo.getNumRows(), (int) mo.getNumColumns(), false));
            //keep track of cleaned result variables
            cleanedVars.add(rvar);
        }
    }
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'remove unnecessary compare matrix' - result=" + (!cleanedVars.isEmpty()) + " (" + ProgramConverter.serializeStringCollection(cleanedVars) + ")");
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ArrayList(java.util.ArrayList) Data(org.apache.sysml.runtime.instructions.cp.Data) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) ParForProgramBlock(org.apache.sysml.runtime.controlprogram.ParForProgramBlock)
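
The heart of the rewrite is the replacement step. Distilled from the loop body above (`dat`, `rvar`, and `ec` are the loop's variables), with the intent spelled out in comments:

// Distilled replacement step: release the cached result variable and bind an
// empty dense MatrixBlock of identical shape in its place, so the subsequent
// result merge no longer needs a compare matrix.
MatrixObject mo = (MatrixObject) dat;
ec.cleanupMatrixObject(mo); // evict cached data and clean up temporary files
ec.setMatrixOutput(rvar, new MatrixBlock((int) mo.getNumRows(), (int) mo.getNumColumns(), false));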

Example 13 with MatrixObject

Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

The class ParameterizedBuiltinSPInstruction, method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String opcode = getOpcode();
    //opcode guaranteed to be a valid opcode (see parsing)
    if (opcode.equalsIgnoreCase("mapgroupedagg")) {
        //get input rdd handle
        String targetVar = params.get(Statement.GAGG_TARGET);
        String groupsVar = params.get(Statement.GAGG_GROUPS);
        JavaPairRDD<MatrixIndexes, MatrixBlock> target = sec.getBinaryBlockRDDHandleForVariable(targetVar);
        PartitionedBroadcast<MatrixBlock> groups = sec.getBroadcastForVariable(groupsVar);
        MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(targetVar);
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        CPOperand ngrpOp = new CPOperand(params.get(Statement.GAGG_NUM_GROUPS));
        int ngroups = (int) sec.getScalarInput(ngrpOp.getName(), ngrpOp.getValueType(), ngrpOp.isLiteral()).getLongValue();
        //single-block aggregation
        if (ngroups <= mc1.getRowsPerBlock() && mc1.getCols() <= mc1.getColsPerBlock()) {
            //execute map grouped aggregate
            JavaRDD<MatrixBlock> out = target.map(new RDDMapGroupedAggFunction2(groups, _optr, ngroups));
            MatrixBlock out2 = RDDAggregateUtils.sumStable(out);
            //put output block into symbol table (no lineage because single block)
            //this also includes implicit maintenance of matrix characteristics
            sec.setMatrixOutput(output.getName(), out2);
        } else { //multi-block aggregation
            //execute map grouped aggregate
            JavaPairRDD<MatrixIndexes, MatrixBlock> out = target.flatMapToPair(new RDDMapGroupedAggFunction(groups, _optr, ngroups, mc1.getRowsPerBlock(), mc1.getColsPerBlock()));
            out = RDDAggregateUtils.sumByKeyStable(out, false);
            //update characteristics and handle outputs
            mcOut.set(ngroups, mc1.getCols(), mc1.getRowsPerBlock(), mc1.getColsPerBlock(), -1);
            sec.setRDDHandleForVariable(output.getName(), out);
            sec.addLineageRDD(output.getName(), targetVar);
            sec.addLineageBroadcast(output.getName(), groupsVar);
        }
    } else if (opcode.equalsIgnoreCase("groupedagg")) {
        boolean broadcastGroups = Boolean.parseBoolean(params.get("broadcast"));
        //get input rdd handle
        String groupsVar = params.get(Statement.GAGG_GROUPS);
        JavaPairRDD<MatrixIndexes, MatrixBlock> target = sec.getBinaryBlockRDDHandleForVariable(params.get(Statement.GAGG_TARGET));
        JavaPairRDD<MatrixIndexes, MatrixBlock> groups = broadcastGroups ? null : sec.getBinaryBlockRDDHandleForVariable(groupsVar);
        JavaPairRDD<MatrixIndexes, MatrixBlock> weights = null;
        MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(params.get(Statement.GAGG_TARGET));
        MatrixCharacteristics mc2 = sec.getMatrixCharacteristics(groupsVar);
        if (mc1.dimsKnown() && mc2.dimsKnown() && (mc1.getRows() != mc2.getRows() || mc2.getCols() != 1)) {
            throw new DMLRuntimeException("Grouped Aggregate dimension mismatch between target and groups.");
        }
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        JavaPairRDD<MatrixIndexes, WeightedCell> groupWeightedCells = null;
        // Step 1: First extract groupWeightedCells from group, target and weights
        if (params.get(Statement.GAGG_WEIGHTS) != null) {
            weights = sec.getBinaryBlockRDDHandleForVariable(params.get(Statement.GAGG_WEIGHTS));
            MatrixCharacteristics mc3 = sec.getMatrixCharacteristics(params.get(Statement.GAGG_WEIGHTS));
            if (mc1.dimsKnown() && mc3.dimsKnown() && (mc1.getRows() != mc3.getRows() || mc1.getCols() != mc3.getCols())) {
                throw new DMLRuntimeException("Grouped Aggregate dimension mismatch between target, groups, and weights.");
            }
            groupWeightedCells = groups.join(target).join(weights).flatMapToPair(new ExtractGroupNWeights());
        } else { //input vector or matrix
            String ngroupsStr = params.get(Statement.GAGG_NUM_GROUPS);
            long ngroups = (ngroupsStr != null) ? (long) Double.parseDouble(ngroupsStr) : -1;
            //execute basic grouped aggregate (extract and preagg)
            if (broadcastGroups) {
                PartitionedBroadcast<MatrixBlock> pbm = sec.getBroadcastForVariable(groupsVar);
                groupWeightedCells = target.flatMapToPair(new ExtractGroupBroadcast(pbm, mc1.getColsPerBlock(), ngroups, _optr));
            } else {
                //replicate groups if necessary
                if (mc1.getNumColBlocks() > 1) {
                    groups = groups.flatMapToPair(new ReplicateVectorFunction(false, mc1.getNumColBlocks()));
                }
                groupWeightedCells = groups.join(target).flatMapToPair(new ExtractGroupJoin(mc1.getColsPerBlock(), ngroups, _optr));
            }
        }
        // Step 2: Make sure we have brlen required while creating <MatrixIndexes, MatrixCell> 
        if (mc1.getRowsPerBlock() == -1) {
            throw new DMLRuntimeException("The block sizes are not specified for grouped aggregate");
        }
        int brlen = mc1.getRowsPerBlock();
        // Step 3: Now perform grouped aggregate operation (either on combiner side or reducer side)
        JavaPairRDD<MatrixIndexes, MatrixCell> out = null;
        if (_optr instanceof CMOperator && ((CMOperator) _optr).isPartialAggregateOperator() || _optr instanceof AggregateOperator) {
            out = groupWeightedCells.reduceByKey(new PerformGroupByAggInCombiner(_optr)).mapValues(new CreateMatrixCell(brlen, _optr));
        } else {
            // Use groupby key because partial aggregation is not supported
            out = groupWeightedCells.groupByKey().mapValues(new PerformGroupByAggInReducer(_optr)).mapValues(new CreateMatrixCell(brlen, _optr));
        }
        // Step 4: Set output characteristics and rdd handle 
        setOutputCharacteristicsForGroupedAgg(mc1, mcOut, out);
        //store output rdd handle
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), params.get(Statement.GAGG_TARGET));
        sec.addLineage(output.getName(), groupsVar, broadcastGroups);
        if (params.get(Statement.GAGG_WEIGHTS) != null) {
            sec.addLineageRDD(output.getName(), params.get(Statement.GAGG_WEIGHTS));
        }
    } else if (opcode.equalsIgnoreCase("rmempty")) {
        String rddInVar = params.get("target");
        String rddOffVar = params.get("offset");
        boolean rows = sec.getScalarInput(params.get("margin"), ValueType.STRING, true).getStringValue().equals("rows");
        long maxDim = sec.getScalarInput(params.get("maxdim"), ValueType.DOUBLE, false).getLongValue();
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddInVar);
        if (maxDim > 0) { //default case
            //get input rdd handle
            JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(rddInVar);
            JavaPairRDD<MatrixIndexes, MatrixBlock> off;
            PartitionedBroadcast<MatrixBlock> broadcastOff;
            long brlen = mcIn.getRowsPerBlock();
            long bclen = mcIn.getColsPerBlock();
            long numRep = (long) Math.ceil(rows ? (double) mcIn.getCols() / bclen : (double) mcIn.getRows() / brlen);
            //execute remove empty rows/cols operation
            JavaPairRDD<MatrixIndexes, MatrixBlock> out;
            if (_bRmEmptyBC) {
                broadcastOff = sec.getBroadcastForVariable(rddOffVar);
                // Broadcast offset vector
                out = in.flatMapToPair(new RDDRemoveEmptyFunctionInMem(rows, maxDim, brlen, bclen, broadcastOff));
            } else {
                off = sec.getBinaryBlockRDDHandleForVariable(rddOffVar);
                out = in.join(off.flatMapToPair(new ReplicateVectorFunction(!rows, numRep))).flatMapToPair(new RDDRemoveEmptyFunction(rows, maxDim, brlen, bclen));
            }
            out = RDDAggregateUtils.mergeByKey(out, false);
            //store output rdd handle
            sec.setRDDHandleForVariable(output.getName(), out);
            sec.addLineageRDD(output.getName(), rddInVar);
            if (!_bRmEmptyBC)
                sec.addLineageRDD(output.getName(), rddOffVar);
            else
                sec.addLineageBroadcast(output.getName(), rddOffVar);
            //update output statistics (required for correctness)
            MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
            mcOut.set(rows ? maxDim : mcIn.getRows(), rows ? mcIn.getCols() : maxDim, (int) brlen, (int) bclen, mcIn.getNonZeros());
        } else { //special case: empty output (ensure valid dims)
            MatrixBlock out = new MatrixBlock(rows ? 1 : (int) mcIn.getRows(), rows ? (int) mcIn.getCols() : 1, true);
            sec.setMatrixOutput(output.getName(), out);
        }
    } else if (opcode.equalsIgnoreCase("replace")) {
        //get input rdd handle
        String rddVar = params.get("target");
        JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddVar);
        //execute replace operation
        double pattern = Double.parseDouble(params.get("pattern"));
        double replacement = Double.parseDouble(params.get("replacement"));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.mapValues(new RDDReplaceFunction(pattern, replacement));
        //store output rdd handle
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddVar);
        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(mcIn.getRows(), mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock(), (pattern != 0 && replacement != 0) ? mcIn.getNonZeros() : -1);
    } else if (opcode.equalsIgnoreCase("rexpand")) {
        String rddInVar = params.get("target");
        //get input rdd handle
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(rddInVar);
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddInVar);
        double maxVal = Double.parseDouble(params.get("max"));
        long lmaxVal = UtilFunctions.toLong(maxVal);
        boolean dirRows = params.get("dir").equals("rows");
        boolean cast = Boolean.parseBoolean(params.get("cast"));
        boolean ignore = Boolean.parseBoolean(params.get("ignore"));
        long brlen = mcIn.getRowsPerBlock();
        long bclen = mcIn.getColsPerBlock();
        //repartition input vector for higher degree of parallelism 
        //(avoid scenarios where few input partitions create huge outputs)
        MatrixCharacteristics mcTmp = new MatrixCharacteristics(dirRows ? lmaxVal : mcIn.getRows(), dirRows ? mcIn.getRows() : lmaxVal, (int) brlen, (int) bclen, mcIn.getRows());
        int numParts = (int) Math.min(SparkUtils.getNumPreferredPartitions(mcTmp, in), mcIn.getNumBlocks());
        if (numParts > in.getNumPartitions() * 2)
            in = in.repartition(numParts);
        //execute rexpand rows/cols operation (no shuffle required because outputs are
        //block-aligned with the input, i.e., one input block generates n output blocks)
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.flatMapToPair(new RDDRExpandFunction(maxVal, dirRows, cast, ignore, brlen, bclen));
        //store output rdd handle
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddInVar);
        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(dirRows ? lmaxVal : mcIn.getRows(), dirRows ? mcIn.getRows() : lmaxVal, (int) brlen, (int) bclen, -1);
    } else if (opcode.equalsIgnoreCase("transform")) {
        // perform data transform on Spark
        try {
            DataTransform.spDataTransform(this, new FrameObject[] { sec.getFrameObject(params.get("target")) }, new MatrixObject[] { sec.getMatrixObject(output.getName()) }, ec);
        } catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
    } else if (opcode.equalsIgnoreCase("transformapply")) {
        //get input RDD and meta data
        FrameObject fo = sec.getFrameObject(params.get("target"));
        JavaPairRDD<Long, FrameBlock> in = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
        FrameBlock meta = sec.getFrameInput(params.get("meta"));
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target"));
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpecification(params.get("spec")) ? in.lookup(1L).get(0).getColumnNames() : null;
        //compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if (TfMetaUtils.containsOmitSpec(params.get("spec"), colnames)) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(new RDDTransformApplyOffsetFunction(params.get("spec"), colnames)).collect()));
        }
        //create encoder broadcast (avoiding replication per task) 
        Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), colnames, fo.getSchema(), (int) fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows() - ((omap != null) ? omap.getNumRmRows() : 0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap != null) ? sec.getSparkContext().broadcast(omap) : null;
        //execute transform apply
        JavaPairRDD<Long, FrameBlock> tmp = in.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = FrameRDDConverterUtils.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), params.get("target"));
        ec.releaseFrameInput(params.get("meta"));
    } else if (opcode.equalsIgnoreCase("transformdecode")) {
        //get input RDD and meta data
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(params.get("target"));
        MatrixCharacteristics mc = sec.getMatrixCharacteristics(params.get("target"));
        FrameBlock meta = sec.getFrameInput(params.get("meta"));
        String[] colnames = meta.getColumnNames();
        //reblock if necessary (clen > bclen)
        if (mc.getCols() > mc.getNumColBlocks()) {
            in = in.mapToPair(new RDDTransformDecodeExpandFunction((int) mc.getCols(), mc.getColsPerBlock()));
            in = RDDAggregateUtils.mergeByKey(in, false);
        }
        //construct decoder and decode individual matrix blocks
        Decoder decoder = DecoderFactory.createDecoder(params.get("spec"), colnames, null, meta);
        JavaPairRDD<Long, FrameBlock> out = in.mapToPair(new RDDTransformDecodeFunction(decoder, mc.getRowsPerBlock()));
        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), params.get("target"));
        ec.releaseFrameInput(params.get("meta"));
        sec.getMatrixCharacteristics(output.getName()).set(mc.getRows(), meta.getNumColumns(), mc.getRowsPerBlock(), mc.getColsPerBlock(), -1);
        sec.getFrameObject(output.getName()).setSchema(decoder.getSchema());
    } else {
        throw new DMLRuntimeException("Unknown parameterized builtin opcode: " + opcode);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ExtractGroupNWeights(org.apache.sysml.runtime.instructions.spark.functions.ExtractGroupNWeights) ReplicateVectorFunction(org.apache.sysml.runtime.instructions.spark.functions.ReplicateVectorFunction) Decoder(org.apache.sysml.runtime.transform.decode.Decoder) PartitionedBroadcast(org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) Encoder(org.apache.sysml.runtime.transform.encode.Encoder) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) AggregateOperator(org.apache.sysml.runtime.matrix.operators.AggregateOperator) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) PerformGroupByAggInReducer(org.apache.sysml.runtime.instructions.spark.functions.PerformGroupByAggInReducer) CPOperand(org.apache.sysml.runtime.instructions.cp.CPOperand) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) ExtractGroupBroadcast(org.apache.sysml.runtime.instructions.spark.functions.ExtractGroup.ExtractGroupBroadcast) TfOffsetMap(org.apache.sysml.runtime.transform.meta.TfOffsetMap) PerformGroupByAggInCombiner(org.apache.sysml.runtime.instructions.spark.functions.PerformGroupByAggInCombiner) ExtractGroupJoin(org.apache.sysml.runtime.instructions.spark.functions.ExtractGroup.ExtractGroupJoin) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator)
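
Note how every RDD-producing branch above ends with the same three-step output idiom. A distilled sketch with generic placeholders (`out`, `rddVar`, `rows`, `cols`, `brlen`, `bclen` stand in for the branch-specific values):

// Common output idiom shared by the branches above: hand the result RDD to the
// execution context, record lineage for cleanup, and set output characteristics.
sec.setRDDHandleForVariable(output.getName(), out); // out: JavaPairRDD<MatrixIndexes, MatrixBlock>
sec.addLineageRDD(output.getName(), rddVar); // rddVar: input variable the result derives from
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
mcOut.set(rows, cols, brlen, bclen, -1); // -1: number of non-zeros unknown after the op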

Example 14 with MatrixObject

Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

The class ResultMergeLocalMemory, method executeParallelMerge.

@Override
public MatrixObject executeParallelMerge(int par) throws DMLRuntimeException {
    //always create new matrix object (required for nested parallelism)
    MatrixObject moNew = null;
    //Timing time = null;
    LOG.trace("ResultMerge (local, in-memory): Execute parallel (par=" + par + ") merge for output " + _output.getVarName() + " (fname=" + _output.getFileName() + ")");
    try {
        //get matrix blocks through caching 
        MatrixBlock outMB = _output.acquireRead();
        ArrayList<MatrixObject> inMO = new ArrayList<MatrixObject>();
        for (MatrixObject in : _inputs) {
            //check for empty inputs (no iterations executed)
            if (in != null && in != _output)
                inMO.add(in);
        }
        if (!inMO.isEmpty()) { //if there exists something to merge
            //get old output matrix from cache for compare
            //NOTE: always in dense representation in order to allow for parallel unsynchronized access 
            long rows = outMB.getNumRows();
            long cols = outMB.getNumColumns();
            MatrixBlock outMBNew = new MatrixBlock((int) rows, (int) cols, false);
            outMBNew.allocateDenseBlockUnsafe((int) rows, (int) cols);
            //create compare matrix if required (existing data in result)
            _compare = createCompareMatrix(outMB);
            if (_compare != null)
                outMBNew.copy(outMB);
            //parallel merge of all inputs
            //number of inputs can be lower than par
            int numThreads = Math.min(par, inMO.size());
            //ensure robustness for remote exec
            numThreads = Math.min(numThreads, InfrastructureAnalyzer.getLocalParallelism());
            Thread[] threads = new Thread[numThreads];
            for (int k = 0; k < inMO.size(); k += numThreads) { //multiple waves if necessary
                //create and start threads
                for (int i = 0; i < threads.length; i++) {
                    ResultMergeWorker rmw = new ResultMergeWorker(inMO.get(k + i), outMBNew);
                    threads[i] = new Thread(rmw);
                    threads[i].setPriority(Thread.MAX_PRIORITY);
                    // start execution
                    threads[i].start();
                }
                //wait for all workers to finish
                for (int i = 0; i < threads.length; i++) {
                    threads[i].join();
                }
            }
            //create new output matrix 
            //(e.g., to prevent potential export<->read file access conflict in specific cases of 
            // local-remote nested parfor)
            moNew = createNewMatrixObject(outMBNew);
        } else {
            //return old matrix, to prevent copy
            moNew = _output;
        }
        //release old output, and all inputs
        _output.release();
        //_output.clearData(); //safe, since it respects pin/unpin
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return moNew;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
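
The thread management above runs the merge workers in waves. A distilled sketch of that pattern follows; note that sizing the thread array to `Math.min(numThreads, inMO.size() - k)` is an added guard for a final partial wave, whereas the snippet above allocates a fixed-size array once:

// Wave-based thread pattern, distilled: process inputs in waves of at most
// numThreads workers, joining each wave before the next one starts.
int numThreads = Math.min(par, inMO.size());
for (int k = 0; k < inMO.size(); k += numThreads) { //multiple waves if necessary
    Thread[] threads = new Thread[Math.min(numThreads, inMO.size() - k)]; //added bounds guard
    for (int i = 0; i < threads.length; i++) {
        threads[i] = new Thread(new ResultMergeWorker(inMO.get(k + i), outMBNew));
        threads[i].start();
    }
    for (Thread t : threads) //barrier: wait for the entire wave to finish
        t.join(); //InterruptedException handled by the enclosing try/catch, as above
}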

Example 15 with MatrixObject

Use of org.apache.sysml.runtime.controlprogram.caching.MatrixObject in project incubator-systemml by apache.

The class ResultMergeRemoteSpark, method executeParallelMerge.

@Override
public MatrixObject executeParallelMerge(int par) throws DMLRuntimeException {
    //always create new matrix object (required for nested parallelism)
    MatrixObject moNew = null;
    LOG.trace("ResultMerge (remote, spark): Execute serial merge for output " + _output.getVarName() + " (fname=" + _output.getFileName() + ")");
    try {
        if (_inputs != null && _inputs.length > 0) {
            //prepare compare
            MatrixFormatMetaData metadata = (MatrixFormatMetaData) _output.getMetaData();
            MatrixCharacteristics mcOld = metadata.getMatrixCharacteristics();
            MatrixObject compare = (mcOld.getNonZeros() == 0) ? null : _output;
            //actual merge
            RDDObject ro = executeMerge(compare, _inputs, _output.getVarName(), mcOld.getRows(), mcOld.getCols(), mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
            //create new output matrix (e.g., to prevent potential export<->read file access conflict)
            String varName = _output.getVarName();
            ValueType vt = _output.getValueType();
            moNew = new MatrixObject(vt, _outputFName);
            moNew.setVarName(varName.contains(NAME_SUFFIX) ? varName : varName + NAME_SUFFIX);
            moNew.setDataType(DataType.MATRIX);
            OutputInfo oiOld = metadata.getOutputInfo();
            InputInfo iiOld = metadata.getInputInfo();
            MatrixCharacteristics mc = new MatrixCharacteristics(mcOld.getRows(), mcOld.getCols(), mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
            mc.setNonZeros(computeNonZeros(_output, convertToList(_inputs)));
            MatrixFormatMetaData meta = new MatrixFormatMetaData(mc, oiOld, iiOld);
            moNew.setMetaData(meta);
            moNew.setRDDHandle(ro);
        } else {
            //return old matrix, to prevent copy
            moNew = _output;
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return moNew;
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) ValueType(org.apache.sysml.parser.Expression.ValueType) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
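
The construction of the new MatrixObject is the reusable part of this example. A distilled sketch, with each metadata step annotated (all calls appear in the snippet above; the variable-name handling is simplified, since the original avoids appending NAME_SUFFIX twice):

// Distilled construction: wrap merged data in a fresh MatrixObject by cloning
// the old characteristics and format metadata, then attach the RDD handle.
MatrixObject moNew = new MatrixObject(_output.getValueType(), _outputFName);
moNew.setVarName(_output.getVarName() + NAME_SUFFIX); // distinct name for the merged result
moNew.setDataType(DataType.MATRIX);
MatrixCharacteristics mc = new MatrixCharacteristics(
    mcOld.getRows(), mcOld.getCols(), mcOld.getRowsPerBlock(), mcOld.getColsPerBlock());
mc.setNonZeros(computeNonZeros(_output, convertToList(_inputs)));
moNew.setMetaData(new MatrixFormatMetaData(mc, oiOld, iiOld)); // preserve input/output formats
moNew.setRDDHandle(ro); // ro: the RDDObject produced by executeMerge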

Aggregations

MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 168
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 70
MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData): 46
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 41
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 36
Data (org.apache.sysml.runtime.instructions.cp.Data): 30
IOException (java.io.IOException): 18
ArrayList (java.util.ArrayList): 13
Pointer (jcuda.Pointer): 12
CSRPointer (org.apache.sysml.runtime.instructions.gpu.context.CSRPointer): 12
OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo): 12
CacheableData (org.apache.sysml.runtime.controlprogram.caching.CacheableData): 11
CacheException (org.apache.sysml.runtime.controlprogram.caching.CacheException): 10
Path (org.apache.hadoop.fs.Path): 9
Hop (org.apache.sysml.hops.Hop): 9
ParForProgramBlock (org.apache.sysml.runtime.controlprogram.ParForProgramBlock): 9
ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject): 9
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 9
DataOp (org.apache.sysml.hops.DataOp): 8
LiteralOp (org.apache.sysml.hops.LiteralOp): 8