
Example 1 with CMOperator

use of org.apache.sysml.runtime.matrix.operators.CMOperator in project incubator-systemml by apache.

the class MVImputeAgent method mergeAndOutputTransformationMetadata.

/** 
	 * Method to merge map output transformation metadata. 
	 */
@Override
public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
    double min = Double.MAX_VALUE;
    double max = -Double.MAX_VALUE;
    int nbins = 0;
    double d;
    long totalRecordCount = 0, totalValidCount = 0;
    String mvConstReplacement = null;
    DistinctValue val = new DistinctValue();
    String w = null;
    class MeanObject {
        double mean, correction;
        long count;
        MeanObject() {}
        public String toString() {
            return mean + "," + correction + "," + count;
        }
    }
    HashMap<Integer, MeanObject> mapMeans = new HashMap<Integer, MeanObject>();
    HashMap<Integer, CM_COV_Object> mapVars = new HashMap<Integer, CM_COV_Object>();
    boolean isImputed = false;
    boolean isScaled = false;
    boolean isBinned = false;
    while (values.hasNext()) {
        val.reset();
        val = values.next();
        w = val.getWord();
        if (w.startsWith(MEAN_PREFIX)) {
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            MeanObject mo = mapMeans.get(taskID);
            if (mo == null)
                mo = new MeanObject();
            mo.mean = UtilFunctions.parseToDouble(parts[2].split(",")[0]);
            // check if this attribute is scaled
            String s = parts[2].split(",")[1];
            if (s.equalsIgnoreCase("scmv"))
                isScaled = isImputed = true;
            else if (s.equalsIgnoreCase("scnomv"))
                isScaled = true;
            else
                isImputed = true;
            mapMeans.put(taskID, mo);
        } else if (w.startsWith(CORRECTION_PREFIX)) {
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            MeanObject mo = mapMeans.get(taskID);
            if (mo == null)
                mo = new MeanObject();
            mo.correction = UtilFunctions.parseToDouble(parts[2]);
            mapMeans.put(taskID, mo);
        } else if (w.startsWith(CONSTANT_PREFIX)) {
            isImputed = true;
            String[] parts = w.split("_");
            mvConstReplacement = parts[1];
        } else if (w.startsWith(COUNT_PREFIX)) {
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            MeanObject mo = mapMeans.get(taskID);
            if (mo == null)
                mo = new MeanObject();
            mo.count = UtilFunctions.parseToLong(parts[2]);
            totalValidCount += mo.count;
            mapMeans.put(taskID, mo);
        } else if (w.startsWith(TOTAL_COUNT_PREFIX)) {
            String[] parts = w.split("_");
            //int taskID = UtilFunctions.parseToInt(parts[1]);
            totalRecordCount += UtilFunctions.parseToLong(parts[2]);
        } else if (w.startsWith(VARIANCE_PREFIX)) {
            isScaled = true;
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            CM_COV_Object cm = decodeCMObj(parts[2]);
            mapVars.put(taskID, cm);
        } else if (w.startsWith(BinAgent.MIN_PREFIX)) {
            isBinned = true;
            d = UtilFunctions.parseToDouble(w.substring(BinAgent.MIN_PREFIX.length()));
            if (d < min)
                min = d;
        } else if (w.startsWith(BinAgent.MAX_PREFIX)) {
            isBinned = true;
            d = UtilFunctions.parseToDouble(w.substring(BinAgent.MAX_PREFIX.length()));
            if (d > max)
                max = d;
        } else if (w.startsWith(BinAgent.NBINS_PREFIX)) {
            isBinned = true;
            nbins = (int) UtilFunctions.parseToLong(w.substring(BinAgent.NBINS_PREFIX.length()));
        } else
            throw new RuntimeException("MVImputeAgent: Invalid prefix while merging map output: " + w);
    }
    // compute global mean across all map outputs
    KahanObject gmean = new KahanObject(0, 0);
    KahanPlus kp = KahanPlus.getKahanPlusFnObject();
    long gcount = 0;
    for (MeanObject mo : mapMeans.values()) {
        gcount = gcount + mo.count;
        if (gcount > 0) {
            double delta = mo.mean - gmean._sum;
            kp.execute2(gmean, delta * mo.count / gcount);
        //_meanFn.execute2(gmean, mo.mean*mo.count, gcount);
        }
    }
    // compute global variance across all map outputs
    CM_COV_Object gcm = new CM_COV_Object();
    try {
        for (CM_COV_Object cm : mapVars.values()) gcm = (CM_COV_Object) _varFn.execute(gcm, cm);
    } catch (DMLRuntimeException e) {
        throw new IOException(e);
    }
    // If the column is imputed with a constant, then adjust min and max based on the value of the constant.
    if (isImputed && isBinned && mvConstReplacement != null) {
        double cst = UtilFunctions.parseToDouble(mvConstReplacement);
        if (cst < min)
            min = cst;
        if (cst > max)
            max = cst;
    }
    // write merged metadata
    if (isImputed) {
        String imputedValue = null;
        if (mvConstReplacement != null)
            imputedValue = mvConstReplacement;
        else
            imputedValue = Double.toString(gcount == 0 ? 0.0 : gmean._sum);
        writeTfMtd(colID, imputedValue, outputDir, fs, agents);
    }
    if (isBinned) {
        double binwidth = (max - min) / nbins;
        writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
    }
    if (isScaled) {
        try {
            if (totalValidCount != totalRecordCount) {
                // In the presence of missing values, the variance needs to be adjusted.
                // The mean does not need to be adjusted, when mv impute method is global_mean, 
                // since missing values themselves are replaced with gmean.
                long totalMissingCount = (totalRecordCount - totalValidCount);
                int idx = isApplicable(colID);
                if (idx != -1 && _mvMethodList[idx] == MVMethod.CONSTANT)
                    _meanFn.execute(gmean, UtilFunctions.parseToDouble(_replacementList[idx]), totalRecordCount);
                _varFn.execute(gcm, gmean._sum, totalMissingCount);
            }
            double mean = (gcount == 0 ? 0.0 : gmean._sum);
            double var = gcm.getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
            double sdev = (mapVars.size() > 0 ? Math.sqrt(var) : -1.0);
            writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
        } catch (DMLRuntimeException e) {
            throw new IOException(e);
        }
    }
}
Also used : CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object) HashMap(java.util.HashMap) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) KahanObject(org.apache.sysml.runtime.instructions.cp.KahanObject) KahanPlus(org.apache.sysml.runtime.functionobjects.KahanPlus) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator)
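
The global-mean loop above folds each per-task partial mean in with weight count/runningCount, using a Kahan-compensated addition (KahanPlus.execute2). A minimal standalone sketch of the same merge rule, with no SystemML dependencies; the class and method names are illustrative only:

import java.util.Arrays;
import java.util.List;

public class MeanMergeSketch {
    /** Merges partial (mean, count) pairs into one global mean, mirroring the
     *  incremental update above: gmean += (mean_i - gmean) * count_i / runningCount,
     *  applied with Kahan compensation as in KahanPlus.execute2. */
    static double mergeMeans(List<double[]> partials) { // each entry: {mean, count}
        double gmean = 0, comp = 0; // running mean and Kahan compensation term
        long gcount = 0;
        for (double[] p : partials) {
            long count = (long) p[1];
            gcount += count;
            if (gcount > 0) {
                double delta = (p[0] - gmean) * count / gcount;
                double y = delta - comp;      // compensated addition
                double t = gmean + y;
                comp = (t - gmean) - y;
                gmean = t;
            }
        }
        return gmean;
    }

    public static void main(String[] args) {
        // task 1: mean 2.0 over 3 records; task 2: mean 5.0 over 1 record
        // weighted global mean = (2.0*3 + 5.0*1) / 4 = 2.75
        System.out.println(mergeMeans(Arrays.asList(new double[]{2.0, 3}, new double[]{5.0, 1})));
    }
}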

Example 2 with CMOperator

use of org.apache.sysml.runtime.matrix.operators.CMOperator in project incubator-systemml by apache.

the class MVImputeAgent method outputTransformationMetadata.

public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
    try {
        if (_colList != null)
            for (int i = 0; i < _colList.length; i++) {
                int colID = _colList[i];
                double imputedValue = Double.NaN;
                KahanObject gmean = null;
                if (_mvMethodList[i] == MVMethod.GLOBAL_MEAN) {
                    gmean = _meanList[i];
                    imputedValue = _meanList[i]._sum;
                    double mean = (_countList[i] == 0 ? 0.0 : _meanList[i]._sum);
                    writeTfMtd(colID, Double.toString(mean), outputDir, fs, agents);
                } else if (_mvMethodList[i] == MVMethod.CONSTANT) {
                    writeTfMtd(colID, _replacementList[i], outputDir, fs, agents);
                    if (_isMVScaled.get(i)) {
                        imputedValue = UtilFunctions.parseToDouble(_replacementList[i]);
                        // adjust the global mean, by combining gmean with "replacement" (weight = #missing values)
                        gmean = new KahanObject(_meanList[i]._sum, _meanList[i]._correction);
                        _meanFn.execute(gmean, imputedValue, agents.getValid());
                    }
                }
                if (_isMVScaled.get(i)) {
                    double sdev = -1.0;
                    if (_mvscMethodList[i] == MVMethod.GLOBAL_MODE) {
                        // Adjust variance with missing values
                        long totalMissingCount = (agents.getValid() - _countList[i]);
                        _varFn.execute(_varList[i], imputedValue, totalMissingCount);
                        double var = _varList[i].getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
                        sdev = Math.sqrt(var);
                    }
                    writeTfMtd(colID, Double.toString(gmean._sum), Double.toString(sdev), outputDir, fs, agents);
                }
            }
        if (_scnomvList != null)
            for (int i = 0; i < _scnomvList.length; i++) {
                int colID = _scnomvList[i];
                double mean = (_scnomvCountList[i] == 0 ? 0.0 : _scnomvMeanList[i]._sum);
                double sdev = -1.0;
                if (_scnomvMethodList[i] == MVMethod.GLOBAL_MODE) {
                    double var = _scnomvVarList[i].getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
                    sdev = Math.sqrt(var);
                }
                writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
            }
    } catch (DMLRuntimeException e) {
        throw new IOException(e);
    }
}
Also used : KahanObject(org.apache.sysml.runtime.instructions.cp.KahanObject) IOException(java.io.IOException) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
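
In the scaled branch above, the variance is adjusted by folding the imputed value into the running moments with weight equal to the number of missing records. A self-contained sketch of that adjustment on (count, mean, M2) summary statistics, where M2 is the running sum of squared deviations as in the standard one-pass update; plain Java, names are illustrative:

public class ImputeVarianceSketch {
    /** Adjusts (count, mean, M2) statistics when `missing` records are imputed
     *  with the constant `replacement`; mirrors the intent of
     *  _varFn.execute(_varList[i], imputedValue, totalMissingCount) above. */
    static double[] addConstant(long count, double mean, double m2,
                                double replacement, long missing) {
        long newCount = count + missing;
        double delta = replacement - mean;
        double newMean = mean + delta * missing / newCount;
        // the imputed block has zero internal variance, so only the
        // between-group term of the pairwise update contributes
        double newM2 = m2 + delta * delta * (double) count * missing / newCount;
        return new double[]{ newCount, newMean, newM2 };
    }

    public static void main(String[] args) {
        // valid data {1, 3}: count = 2, mean = 2, M2 = 2 (variance 1.0);
        // imputing 2 missing cells with 2.0 keeps the mean and halves the variance
        double[] s = addConstant(2, 2.0, 2.0, 2.0, 2);
        System.out.println("variance = " + s[2] / s[0]); // 0.5
    }
}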

Example 3 with CMOperator

use of org.apache.sysml.runtime.matrix.operators.CMOperator in project incubator-systemml by apache.

the class ParameterizedBuiltinSPInstruction method processInstruction.

@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    String opcode = getOpcode();
    //opcode guaranteed to be a valid opcode (see parsing)
    if (opcode.equalsIgnoreCase("mapgroupedagg")) {
        //get input rdd handle
        String targetVar = params.get(Statement.GAGG_TARGET);
        String groupsVar = params.get(Statement.GAGG_GROUPS);
        JavaPairRDD<MatrixIndexes, MatrixBlock> target = sec.getBinaryBlockRDDHandleForVariable(targetVar);
        PartitionedBroadcast<MatrixBlock> groups = sec.getBroadcastForVariable(groupsVar);
        MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(targetVar);
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        CPOperand ngrpOp = new CPOperand(params.get(Statement.GAGG_NUM_GROUPS));
        int ngroups = (int) sec.getScalarInput(ngrpOp.getName(), ngrpOp.getValueType(), ngrpOp.isLiteral()).getLongValue();
        //single-block aggregation
        if (ngroups <= mc1.getRowsPerBlock() && mc1.getCols() <= mc1.getColsPerBlock()) {
            //execute map grouped aggregate
            JavaRDD<MatrixBlock> out = target.map(new RDDMapGroupedAggFunction2(groups, _optr, ngroups));
            MatrixBlock out2 = RDDAggregateUtils.sumStable(out);
            //put output block into symbol table (no lineage because single block)
            //this also includes implicit maintenance of matrix characteristics
            sec.setMatrixOutput(output.getName(), out2);
        } else { //multi-block aggregation
            //execute map grouped aggregate
            JavaPairRDD<MatrixIndexes, MatrixBlock> out = target.flatMapToPair(new RDDMapGroupedAggFunction(groups, _optr, ngroups, mc1.getRowsPerBlock(), mc1.getColsPerBlock()));
            out = RDDAggregateUtils.sumByKeyStable(out, false);
            //updated characteristics and handle outputs
            mcOut.set(ngroups, mc1.getCols(), mc1.getRowsPerBlock(), mc1.getColsPerBlock(), -1);
            sec.setRDDHandleForVariable(output.getName(), out);
            sec.addLineageRDD(output.getName(), targetVar);
            sec.addLineageBroadcast(output.getName(), groupsVar);
        }
    } else if (opcode.equalsIgnoreCase("groupedagg")) {
        boolean broadcastGroups = Boolean.parseBoolean(params.get("broadcast"));
        //get input rdd handle
        String groupsVar = params.get(Statement.GAGG_GROUPS);
        JavaPairRDD<MatrixIndexes, MatrixBlock> target = sec.getBinaryBlockRDDHandleForVariable(params.get(Statement.GAGG_TARGET));
        JavaPairRDD<MatrixIndexes, MatrixBlock> groups = broadcastGroups ? null : sec.getBinaryBlockRDDHandleForVariable(groupsVar);
        JavaPairRDD<MatrixIndexes, MatrixBlock> weights = null;
        MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(params.get(Statement.GAGG_TARGET));
        MatrixCharacteristics mc2 = sec.getMatrixCharacteristics(groupsVar);
        if (mc1.dimsKnown() && mc2.dimsKnown() && (mc1.getRows() != mc2.getRows() || mc2.getCols() != 1)) {
            throw new DMLRuntimeException("Grouped Aggregate dimension mismatch between target and groups.");
        }
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        JavaPairRDD<MatrixIndexes, WeightedCell> groupWeightedCells = null;
        // Step 1: First extract groupWeightedCells from group, target and weights
        if (params.get(Statement.GAGG_WEIGHTS) != null) {
            weights = sec.getBinaryBlockRDDHandleForVariable(params.get(Statement.GAGG_WEIGHTS));
            MatrixCharacteristics mc3 = sec.getMatrixCharacteristics(params.get(Statement.GAGG_WEIGHTS));
            if (mc1.dimsKnown() && mc3.dimsKnown() && (mc1.getRows() != mc3.getRows() || mc1.getCols() != mc3.getCols())) {
                throw new DMLRuntimeException("Grouped Aggregate dimension mismatch between target, groups, and weights.");
            }
            groupWeightedCells = groups.join(target).join(weights).flatMapToPair(new ExtractGroupNWeights());
        } else { //input vector or matrix
            String ngroupsStr = params.get(Statement.GAGG_NUM_GROUPS);
            long ngroups = (ngroupsStr != null) ? (long) Double.parseDouble(ngroupsStr) : -1;
            //execute basic grouped aggregate (extract and preagg)
            if (broadcastGroups) {
                PartitionedBroadcast<MatrixBlock> pbm = sec.getBroadcastForVariable(groupsVar);
                groupWeightedCells = target.flatMapToPair(new ExtractGroupBroadcast(pbm, mc1.getColsPerBlock(), ngroups, _optr));
            } else {
                //replicate groups if necessary
                if (mc1.getNumColBlocks() > 1) {
                    groups = groups.flatMapToPair(new ReplicateVectorFunction(false, mc1.getNumColBlocks()));
                }
                groupWeightedCells = groups.join(target).flatMapToPair(new ExtractGroupJoin(mc1.getColsPerBlock(), ngroups, _optr));
            }
        }
        // Step 2: Make sure we have brlen required while creating <MatrixIndexes, MatrixCell> 
        if (mc1.getRowsPerBlock() == -1) {
            throw new DMLRuntimeException("The block sizes are not specified for grouped aggregate");
        }
        int brlen = mc1.getRowsPerBlock();
        // Step 3: Now perform grouped aggregate operation (either on combiner side or reducer side)
        JavaPairRDD<MatrixIndexes, MatrixCell> out = null;
        if (_optr instanceof CMOperator && ((CMOperator) _optr).isPartialAggregateOperator() || _optr instanceof AggregateOperator) {
            out = groupWeightedCells.reduceByKey(new PerformGroupByAggInCombiner(_optr)).mapValues(new CreateMatrixCell(brlen, _optr));
        } else {
            // Use groupby key because partial aggregation is not supported
            out = groupWeightedCells.groupByKey().mapValues(new PerformGroupByAggInReducer(_optr)).mapValues(new CreateMatrixCell(brlen, _optr));
        }
        // Step 4: Set output characteristics and rdd handle 
        setOutputCharacteristicsForGroupedAgg(mc1, mcOut, out);
        //store output rdd handle
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), params.get(Statement.GAGG_TARGET));
        sec.addLineage(output.getName(), groupsVar, broadcastGroups);
        if (params.get(Statement.GAGG_WEIGHTS) != null) {
            sec.addLineageRDD(output.getName(), params.get(Statement.GAGG_WEIGHTS));
        }
    } else if (opcode.equalsIgnoreCase("rmempty")) {
        String rddInVar = params.get("target");
        String rddOffVar = params.get("offset");
        boolean rows = sec.getScalarInput(params.get("margin"), ValueType.STRING, true).getStringValue().equals("rows");
        long maxDim = sec.getScalarInput(params.get("maxdim"), ValueType.DOUBLE, false).getLongValue();
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddInVar);
        //default case
        if (maxDim > 0) {
            //get input rdd handle
            JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(rddInVar);
            JavaPairRDD<MatrixIndexes, MatrixBlock> off;
            PartitionedBroadcast<MatrixBlock> broadcastOff;
            long brlen = mcIn.getRowsPerBlock();
            long bclen = mcIn.getColsPerBlock();
            long numRep = (long) Math.ceil(rows ? (double) mcIn.getCols() / bclen : (double) mcIn.getRows() / brlen);
            //execute remove empty rows/cols operation
            JavaPairRDD<MatrixIndexes, MatrixBlock> out;
            if (_bRmEmptyBC) {
                broadcastOff = sec.getBroadcastForVariable(rddOffVar);
                // Broadcast offset vector
                out = in.flatMapToPair(new RDDRemoveEmptyFunctionInMem(rows, maxDim, brlen, bclen, broadcastOff));
            } else {
                off = sec.getBinaryBlockRDDHandleForVariable(rddOffVar);
                out = in.join(off.flatMapToPair(new ReplicateVectorFunction(!rows, numRep))).flatMapToPair(new RDDRemoveEmptyFunction(rows, maxDim, brlen, bclen));
            }
            out = RDDAggregateUtils.mergeByKey(out, false);
            //store output rdd handle
            sec.setRDDHandleForVariable(output.getName(), out);
            sec.addLineageRDD(output.getName(), rddInVar);
            if (!_bRmEmptyBC)
                sec.addLineageRDD(output.getName(), rddOffVar);
            else
                sec.addLineageBroadcast(output.getName(), rddOffVar);
            //update output statistics (required for correctness)
            MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
            mcOut.set(rows ? maxDim : mcIn.getRows(), rows ? mcIn.getCols() : maxDim, (int) brlen, (int) bclen, mcIn.getNonZeros());
        } else { //special case: empty output (ensure valid dims)
            MatrixBlock out = new MatrixBlock(rows ? 1 : (int) mcIn.getRows(), rows ? (int) mcIn.getCols() : 1, true);
            sec.setMatrixOutput(output.getName(), out);
        }
    } else if (opcode.equalsIgnoreCase("replace")) {
        //get input rdd handle
        String rddVar = params.get("target");
        JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddVar);
        //execute replace operation
        double pattern = Double.parseDouble(params.get("pattern"));
        double replacement = Double.parseDouble(params.get("replacement"));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.mapValues(new RDDReplaceFunction(pattern, replacement));
        //store output rdd handle
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddVar);
        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(mcIn.getRows(), mcIn.getCols(), mcIn.getRowsPerBlock(), mcIn.getColsPerBlock(), (pattern != 0 && replacement != 0) ? mcIn.getNonZeros() : -1);
    } else if (opcode.equalsIgnoreCase("rexpand")) {
        String rddInVar = params.get("target");
        //get input rdd handle
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(rddInVar);
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(rddInVar);
        double maxVal = Double.parseDouble(params.get("max"));
        long lmaxVal = UtilFunctions.toLong(maxVal);
        boolean dirRows = params.get("dir").equals("rows");
        boolean cast = Boolean.parseBoolean(params.get("cast"));
        boolean ignore = Boolean.parseBoolean(params.get("ignore"));
        long brlen = mcIn.getRowsPerBlock();
        long bclen = mcIn.getColsPerBlock();
        //repartition input vector for higher degree of parallelism 
        //(avoid scenarios where few input partitions create huge outputs)
        MatrixCharacteristics mcTmp = new MatrixCharacteristics(dirRows ? lmaxVal : mcIn.getRows(), dirRows ? mcIn.getRows() : lmaxVal, (int) brlen, (int) bclen, mcIn.getRows());
        int numParts = (int) Math.min(SparkUtils.getNumPreferredPartitions(mcTmp, in), mcIn.getNumBlocks());
        if (numParts > in.getNumPartitions() * 2)
            in = in.repartition(numParts);
        //execute rexpand rows/cols operation (no shuffle required because outputs are
        //block-aligned with the input, i.e., one input block generates n output blocks)
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.flatMapToPair(new RDDRExpandFunction(maxVal, dirRows, cast, ignore, brlen, bclen));
        //store output rdd handle
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddInVar);
        //update output statistics (required for correctness)
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        mcOut.set(dirRows ? lmaxVal : mcIn.getRows(), dirRows ? mcIn.getRows() : lmaxVal, (int) brlen, (int) bclen, -1);
    } else if (opcode.equalsIgnoreCase("transform")) {
        // perform data transform on Spark
        try {
            DataTransform.spDataTransform(this, new FrameObject[] { sec.getFrameObject(params.get("target")) }, new MatrixObject[] { sec.getMatrixObject(output.getName()) }, ec);
        } catch (Exception e) {
            throw new DMLRuntimeException(e);
        }
    } else if (opcode.equalsIgnoreCase("transformapply")) {
        //get input RDD and meta data
        FrameObject fo = sec.getFrameObject(params.get("target"));
        JavaPairRDD<Long, FrameBlock> in = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForFrameObject(fo, InputInfo.BinaryBlockInputInfo);
        FrameBlock meta = sec.getFrameInput(params.get("meta"));
        MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(params.get("target"));
        MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
        String[] colnames = !TfMetaUtils.isIDSpecification(params.get("spec")) ? in.lookup(1L).get(0).getColumnNames() : null;
        //compute omit offset map for block shifts
        TfOffsetMap omap = null;
        if (TfMetaUtils.containsOmitSpec(params.get("spec"), colnames)) {
            omap = new TfOffsetMap(SparkUtils.toIndexedLong(in.mapToPair(new RDDTransformApplyOffsetFunction(params.get("spec"), colnames)).collect()));
        }
        //create encoder broadcast (avoiding replication per task) 
        Encoder encoder = EncoderFactory.createEncoder(params.get("spec"), colnames, fo.getSchema(), (int) fo.getNumColumns(), meta);
        mcOut.setDimension(mcIn.getRows() - ((omap != null) ? omap.getNumRmRows() : 0), encoder.getNumCols());
        Broadcast<Encoder> bmeta = sec.getSparkContext().broadcast(encoder);
        Broadcast<TfOffsetMap> bomap = (omap != null) ? sec.getSparkContext().broadcast(omap) : null;
        //execute transform apply
        JavaPairRDD<Long, FrameBlock> tmp = in.mapToPair(new RDDTransformApplyFunction(bmeta, bomap));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = FrameRDDConverterUtils.binaryBlockToMatrixBlock(tmp, mcOut, mcOut);
        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), params.get("target"));
        ec.releaseFrameInput(params.get("meta"));
    } else if (opcode.equalsIgnoreCase("transformdecode")) {
        //get input RDD and meta data
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(params.get("target"));
        MatrixCharacteristics mc = sec.getMatrixCharacteristics(params.get("target"));
        FrameBlock meta = sec.getFrameInput(params.get("meta"));
        String[] colnames = meta.getColumnNames();
        //reblock if necessary (clen > bclen)
        if (mc.getCols() > mc.getNumColBlocks()) {
            in = in.mapToPair(new RDDTransformDecodeExpandFunction((int) mc.getCols(), mc.getColsPerBlock()));
            in = RDDAggregateUtils.mergeByKey(in, false);
        }
        //construct decoder and decode individual matrix blocks
        Decoder decoder = DecoderFactory.createDecoder(params.get("spec"), colnames, null, meta);
        JavaPairRDD<Long, FrameBlock> out = in.mapToPair(new RDDTransformDecodeFunction(decoder, mc.getRowsPerBlock()));
        //set output and maintain lineage/output characteristics
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), params.get("target"));
        ec.releaseFrameInput(params.get("meta"));
        sec.getMatrixCharacteristics(output.getName()).set(mc.getRows(), meta.getNumColumns(), mc.getRowsPerBlock(), mc.getColsPerBlock(), -1);
        sec.getFrameObject(output.getName()).setSchema(decoder.getSchema());
    } else {
        throw new DMLRuntimeException("Unknown parameterized builtin opcode: " + opcode);
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ExtractGroupNWeights(org.apache.sysml.runtime.instructions.spark.functions.ExtractGroupNWeights) ReplicateVectorFunction(org.apache.sysml.runtime.instructions.spark.functions.ReplicateVectorFunction) Decoder(org.apache.sysml.runtime.transform.decode.Decoder) PartitionedBroadcast(org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) Encoder(org.apache.sysml.runtime.transform.encode.Encoder) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) AggregateOperator(org.apache.sysml.runtime.matrix.operators.AggregateOperator) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) PerformGroupByAggInReducer(org.apache.sysml.runtime.instructions.spark.functions.PerformGroupByAggInReducer) CPOperand(org.apache.sysml.runtime.instructions.cp.CPOperand) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) ExtractGroupBroadcast(org.apache.sysml.runtime.instructions.spark.functions.ExtractGroup.ExtractGroupBroadcast) TfOffsetMap(org.apache.sysml.runtime.transform.meta.TfOffsetMap) PerformGroupByAggInCombiner(org.apache.sysml.runtime.instructions.spark.functions.PerformGroupByAggInCombiner) ExtractGroupJoin(org.apache.sysml.runtime.instructions.spark.functions.ExtractGroup.ExtractGroupJoin) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator)
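
Step 3 above chooses reduceByKey only when the operator supports partial aggregation; holistic operators such as median must see all values of a group at once and therefore take the groupByKey path. A Spark-free sketch of that distinction, in plain Java with illustrative names:

import java.util.*;
import java.util.stream.*;

public class GroupedAggSketch {
    /** Partial-aggregatable operator: per-group (sum, count) partials can be
     *  merged across partitions before the final division -- the reduceByKey path. */
    static Map<Integer, double[]> partialSums(List<int[]> cells) { // each cell: {group, value}
        Map<Integer, double[]> agg = new HashMap<>();
        for (int[] c : cells)
            agg.merge(c[0], new double[]{ c[1], 1 },
                      (a, b) -> new double[]{ a[0] + b[0], a[1] + b[1] });
        return agg;
    }

    /** Holistic operator (e.g. median): all values of a group must be
     *  co-located before anything can be computed -- the groupByKey path. */
    static double median(List<Integer> values) {
        List<Integer> s = values.stream().sorted().collect(Collectors.toList());
        int n = s.size();
        return (n % 2 == 1) ? s.get(n / 2) : (s.get(n / 2 - 1) + s.get(n / 2)) / 2.0;
    }

    public static void main(String[] args) {
        List<int[]> cells = Arrays.asList(new int[]{1, 2}, new int[]{1, 4}, new int[]{2, 7});
        partialSums(cells).forEach((g, sc) ->
            System.out.println("group " + g + ": mean = " + sc[0] / sc[1]));
        System.out.println("group 1: median = " + median(Arrays.asList(2, 4)));
    }
}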

Example 4 with CMOperator

use of org.apache.sysml.runtime.matrix.operators.CMOperator in project incubator-systemml by apache.

the class CM_N_COVInstruction method parseInstruction.

public static CM_N_COVInstruction parseInstruction(String str) throws DMLRuntimeException {
    String[] parts = InstructionUtils.getInstructionParts(str);
    byte in, out;
    int cst;
    String opcode = parts[0];
    if (opcode.equalsIgnoreCase("cm")) {
        in = Byte.parseByte(parts[1]);
        cst = Integer.parseInt(parts[2]);
        out = Byte.parseByte(parts[3]);
        if (cst > 4 || cst < 0 || cst == 1)
            throw new DMLRuntimeException("constant for central moment has to be 0, 2, 3, or 4");
        AggregateOperationTypes opType = CMOperator.getCMAggOpType(cst);
        CMOperator cm = new CMOperator(CM.getCMFnObject(opType), opType);
        return new CM_N_COVInstruction(cm, in, out, str);
    } else if (opcode.equalsIgnoreCase("cov")) {
        in = Byte.parseByte(parts[1]);
        out = Byte.parseByte(parts[2]);
        COVOperator cov = new COVOperator(COV.getCOMFnObject());
        return new CM_N_COVInstruction(cov, in, out, str);
    } else
        throw new DMLRuntimeException("unknown opcode " + opcode);
}
Also used : COVOperator(org.apache.sysml.runtime.matrix.operators.COVOperator) AggregateOperationTypes(org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
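
The guard rejects order 1 because the first central moment E[x - mean] is identically zero, while order 0 maps to its own aggregation type in getCMAggOpType. A standalone sketch of the moment computation itself for orders 2-4, in plain Java with illustrative names:

import java.util.Arrays;

public class CentralMomentSketch {
    /** k-th central moment E[(x - mean)^k] of a sample; the cm instruction's
     *  integer constant selects k. This sketch covers orders 2-4 only. */
    static double centralMoment(double[] x, int k) {
        if (k < 2 || k > 4)
            throw new IllegalArgumentException("this sketch covers orders 2, 3, and 4");
        double mean = Arrays.stream(x).average().orElse(0);
        return Arrays.stream(x).map(v -> Math.pow(v - mean, k)).average().orElse(0);
    }

    public static void main(String[] args) {
        double[] x = { 1, 2, 3, 4 };
        System.out.println(centralMoment(x, 2)); // 1.25 (population variance)
    }
}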

Example 5 with CMOperator

use of org.apache.sysml.runtime.matrix.operators.CMOperator in project incubator-systemml by apache.

the class CentralMomentSPInstruction method parseInstruction.

public static CentralMomentSPInstruction parseInstruction(String str) throws DMLRuntimeException {
    CPOperand in1 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
    CPOperand in2 = null;
    CPOperand in3 = null;
    CPOperand out = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
    String opcode = parts[0];
    //check supported opcode
    if (!opcode.equalsIgnoreCase("cm")) {
        throw new DMLRuntimeException("Unsupported opcode " + opcode);
    }
    if (parts.length == 4) {
        // Example: CP.cm.mVar0.Var1.mVar2; (without weights)
        in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
        parseUnaryInstruction(str, in1, in2, out);
    } else if (parts.length == 5) {
        // CP.cm.mVar0.mVar1.Var2.mVar3; (with weights)
        in2 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
        in3 = new CPOperand("", ValueType.UNKNOWN, DataType.UNKNOWN);
        parseUnaryInstruction(str, in1, in2, in3, out);
    }
    // Exact order of the central moment MAY NOT be known at compilation time.
    // We first try to parse the second argument as an integer, and if we fail, 
    // we simply pass -1 so that getCMAggOpType() picks up AggregateOperationTypes.INVALID.
    // It must be updated at run time in processInstruction() method.
    int cmOrder;
    try {
        if (in3 == null) {
            cmOrder = Integer.parseInt(in2.getName());
        } else {
            cmOrder = Integer.parseInt(in3.getName());
        }
    } catch (NumberFormatException e) {
        // unknown at compilation time
        cmOrder = -1;
    }
    AggregateOperationTypes opType = CMOperator.getCMAggOpType(cmOrder);
    CMOperator cm = new CMOperator(CM.getCMFnObject(opType), opType);
    return new CentralMomentSPInstruction(cm, in1, in2, in3, out, opcode, str);
}
Also used : AggregateOperationTypes(org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes) CPOperand(org.apache.sysml.runtime.instructions.cp.CPOperand) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
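
The try/parse fallback above is the whole compile-time story: a literal order resolves immediately, while a variable name defers the decision to run time by passing an invalid order. Condensed into a helper for illustration (not part of the SystemML API):

public class OrderParseSketch {
    /** Returns the central moment order if the operand is a literal,
     *  or -1 so that CMOperator.getCMAggOpType(-1) yields INVALID and
     *  the operator is corrected later in processInstruction(). */
    static int parseOrderOrDefer(String operandName) {
        try {
            return Integer.parseInt(operandName); // e.g. "2" -> second central moment
        } catch (NumberFormatException e) {
            return -1; // e.g. "mVar5": unknown until run time
        }
    }

    public static void main(String[] args) {
        System.out.println(parseOrderOrDefer("2"));     // 2
        System.out.println(parseOrderOrDefer("mVar5")); // -1
    }
}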

Aggregations

CMOperator (org.apache.sysml.runtime.matrix.operators.CMOperator) 17
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 11
IOException (java.io.IOException) 6
KahanObject (org.apache.sysml.runtime.instructions.cp.KahanObject) 6
AggregateOperator (org.apache.sysml.runtime.matrix.operators.AggregateOperator) 6
CM (org.apache.sysml.runtime.functionobjects.CM) 4
CM_COV_Object (org.apache.sysml.runtime.instructions.cp.CM_COV_Object) 4
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock) 4
WeightedCell (org.apache.sysml.runtime.matrix.data.WeightedCell) 4
AggregateOperationTypes (org.apache.sysml.runtime.matrix.operators.CMOperator.AggregateOperationTypes) 4
CPOperand (org.apache.sysml.runtime.instructions.cp.CPOperand) 3
GroupedAggregateInstruction (org.apache.sysml.runtime.instructions.mr.GroupedAggregateInstruction) 3
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) 2
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes) 2
COVOperator (org.apache.sysml.runtime.matrix.operators.COVOperator) 2
Operator (org.apache.sysml.runtime.matrix.operators.Operator) 2
ArrayList (java.util.ArrayList) 1
HashMap (java.util.HashMap) 1
ExecutorService (java.util.concurrent.ExecutorService) 1
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD) 1