Search in sources :

Example 1 with CM_COV_Object

use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in project incubator-systemml by apache.

the class MVImputeAgent method prepVarOutput.

/**
 * Builds the variance record emitted for one column by one task.
 *
 * A record is produced when {@code scnomv} is set, or when the column at
 * {@code idx} is flagged in {@code _isMVScaled} and its scaling method is
 * {@code MVMethod.GLOBAL_MODE}; otherwise nothing is emitted.
 *
 * @param taskID id of the producing task, embedded in the record key
 * @param idx    index into the per-column state arrays
 * @param sb     scratch buffer; cleared and overwritten when a record is produced
 * @param scnomv selects {@code _scnomvVarList} (true) or {@code _varList} (false) as the source
 * @return a DistinctValue whose word is {@code VARIANCE_PREFIX + "_" + taskID + "_" + <encoded object>}
 *         with count -1, or {@code null} when no record applies
 * @throws CharacterCodingException declared by the signature (presumably raised by encodeCMObj)
 */
private DistinctValue prepVarOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
    boolean emitVariance = scnomv || (_isMVScaled.get(idx) && _mvscMethodList[idx] == MVMethod.GLOBAL_MODE);
    if (!emitVariance) {
        return null;
    }

    // Record key layout: <VARIANCE_PREFIX>_<taskID>_<encoded CM object>
    sb.setLength(0);
    sb.append(VARIANCE_PREFIX).append("_").append(taskID).append("_");

    final CM_COV_Object varianceState;
    if (scnomv) {
        varianceState = _scnomvVarList[idx];
    } else {
        varianceState = _varList[idx];
    }
    sb.append(encodeCMObj(varianceState));

    return new DistinctValue(sb.toString(), -1L);
}
Also used : CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object)

Example 2 with CM_COV_Object

use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in project incubator-systemml by apache.

the class MVImputeAgent method mergeAndOutputTransformationMetadata.

/**
 * Merges the per-task transformation metadata records of one column and
 * writes the merged result through {@code writeTfMtd}.
 *
 * Each incoming record is identified by the prefix of its word. Mean,
 * correction, count and variance records are keyed by the task id found
 * in the second "_"-separated field; binning records carry their value
 * directly after the prefix.
 *
 * @param values    iterator over the records to merge
 * @param outputDir forwarded to {@code writeTfMtd}
 * @param colID     column id, forwarded to {@code writeTfMtd} and {@code isApplicable}
 * @param fs        forwarded to {@code writeTfMtd}
 * @param agents    forwarded to {@code writeTfMtd}
 * @throws IOException wraps any DMLRuntimeException raised while aggregating;
 *         a RuntimeException is thrown for a record with an unknown prefix
 */
@Override
public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
    // Sentinels so that the first observed min/max record always replaces them.
    double min = Double.MAX_VALUE;
    double max = -Double.MAX_VALUE;
    int nbins = 0;
    double d;
    long totalRecordCount = 0, totalValidCount = 0;
    String mvConstReplacement = null;
    DistinctValue val = new DistinctValue();
    String w = null;
    // Per-task partial result: mean, its correction term, and the number of values.
    class MeanObject {

        double mean, correction;

        long count;

        MeanObject() {
        }

        public String toString() {
            return mean + "," + correction + "," + count;
        }
    }
    ;
    // Partial means and partial variance objects, both keyed by task id.
    HashMap<Integer, MeanObject> mapMeans = new HashMap<Integer, MeanObject>();
    HashMap<Integer, CM_COV_Object> mapVars = new HashMap<Integer, CM_COV_Object>();
    // Flags derived from the record types seen; they select what gets written below.
    boolean isImputed = false;
    boolean isScaled = false;
    boolean isBinned = false;
    while (values.hasNext()) {
        // reset() is invoked on the previously held object before it is replaced.
        val.reset();
        val = values.next();
        w = val.getWord();
        if (w.startsWith(MEAN_PREFIX)) {
            // parts[1] = task id, parts[2] = "<mean>,<flag>"
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            MeanObject mo = mapMeans.get(taskID);
            if (mo == null)
                mo = new MeanObject();
            mo.mean = UtilFunctions.parseToDouble(parts[2].split(",")[0]);
            // check if this attribute is scaled
            // "scmv" -> scaled and imputed, "scnomv" -> scaled only, anything else -> imputed only
            String s = parts[2].split(",")[1];
            if (s.equalsIgnoreCase("scmv"))
                isScaled = isImputed = true;
            else if (s.equalsIgnoreCase("scnomv"))
                isScaled = true;
            else
                isImputed = true;
            mapMeans.put(taskID, mo);
        } else if (w.startsWith(CORRECTION_PREFIX)) {
            // The correction is stored on the MeanObject but is not read by the merge below.
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            MeanObject mo = mapMeans.get(taskID);
            if (mo == null)
                mo = new MeanObject();
            mo.correction = UtilFunctions.parseToDouble(parts[2]);
            mapMeans.put(taskID, mo);
        } else if (w.startsWith(CONSTANT_PREFIX)) {
            // parts[1] is the replacement constant, kept as a string.
            isImputed = true;
            String[] parts = w.split("_");
            mvConstReplacement = parts[1];
        } else if (w.startsWith(COUNT_PREFIX)) {
            // Per-task count; also accumulated into totalValidCount.
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            MeanObject mo = mapMeans.get(taskID);
            if (mo == null)
                mo = new MeanObject();
            mo.count = UtilFunctions.parseToLong(parts[2]);
            totalValidCount += mo.count;
            mapMeans.put(taskID, mo);
        } else if (w.startsWith(TOTAL_COUNT_PREFIX)) {
            // Only the count in parts[2] is used; the task id is ignored here.
            String[] parts = w.split("_");
            //int taskID = UtilFunctions.parseToInt(parts[1]);
            totalRecordCount += UtilFunctions.parseToLong(parts[2]);
        } else if (w.startsWith(VARIANCE_PREFIX)) {
            // parts[2] is decoded into a CM_COV_Object and stored per task.
            isScaled = true;
            String[] parts = w.split("_");
            int taskID = UtilFunctions.parseToInt(parts[1]);
            CM_COV_Object cm = decodeCMObj(parts[2]);
            mapVars.put(taskID, cm);
        } else if (w.startsWith(BinAgent.MIN_PREFIX)) {
            // Binning records: the value is the remainder of the word after the prefix.
            isBinned = true;
            d = UtilFunctions.parseToDouble(w.substring(BinAgent.MIN_PREFIX.length()));
            if (d < min)
                min = d;
        } else if (w.startsWith(BinAgent.MAX_PREFIX)) {
            isBinned = true;
            d = UtilFunctions.parseToDouble(w.substring(BinAgent.MAX_PREFIX.length()));
            if (d > max)
                max = d;
        } else if (w.startsWith(BinAgent.NBINS_PREFIX)) {
            // The last NBINS record seen wins.
            isBinned = true;
            nbins = (int) UtilFunctions.parseToLong(w.substring(BinAgent.NBINS_PREFIX.length()));
        } else
            throw new RuntimeException("MVImputeAgent: Invalid prefix while merging map output: " + w);
    }
    // compute global mean across all map outputs
    // Incremental weighted update: gmean += (mo.mean - gmean) * mo.count / gcount,
    // added through KahanPlus; tasks seen while gcount is still 0 are skipped.
    KahanObject gmean = new KahanObject(0, 0);
    KahanPlus kp = KahanPlus.getKahanPlusFnObject();
    long gcount = 0;
    for (MeanObject mo : mapMeans.values()) {
        gcount = gcount + mo.count;
        if (gcount > 0) {
            double delta = mo.mean - gmean._sum;
            kp.execute2(gmean, delta * mo.count / gcount);
        //_meanFn.execute2(gmean, mo.mean*mo.count, gcount);
        }
    }
    // compute global variance across all map outputs
    // Folds every per-task object into gcm via _varFn, starting from an empty object.
    CM_COV_Object gcm = new CM_COV_Object();
    try {
        for (CM_COV_Object cm : mapVars.values()) gcm = (CM_COV_Object) _varFn.execute(gcm, cm);
    } catch (DMLRuntimeException e) {
        throw new IOException(e);
    }
    // If the column is imputed with a constant, then adjust min and max based on the value of the constant.
    if (isImputed && isBinned && mvConstReplacement != null) {
        double cst = UtilFunctions.parseToDouble(mvConstReplacement);
        if (cst < min)
            min = cst;
        if (cst > max)
            max = cst;
    }
    // write merged metadata
    if (isImputed) {
        // The constant takes precedence; otherwise the global mean (0.0 when nothing was counted).
        String imputedValue = null;
        if (mvConstReplacement != null)
            imputedValue = mvConstReplacement;
        else
            imputedValue = Double.toString(gcount == 0 ? 0.0 : gmean._sum);
        writeTfMtd(colID, imputedValue, outputDir, fs, agents);
    }
    if (isBinned) {
        // NOTE(review): nbins stays 0 if no NBINS record was seen, making binwidth non-finite — confirm upstream always emits it.
        double binwidth = (max - min) / nbins;
        writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
    }
    if (isScaled) {
        try {
            if (totalValidCount != totalRecordCount) {
                // In the presence of missing values, the variance needs to be adjusted.
                // The mean does not need to be adjusted, when mv impute method is global_mean, 
                // since missing values themselves are replaced with gmean.
                long totalMissingCount = (totalRecordCount - totalValidCount);
                int idx = isApplicable(colID);
                // For constant imputation the replacement value is first folded into the mean.
                if (idx != -1 && _mvMethodList[idx] == MVMethod.CONSTANT)
                    _meanFn.execute(gmean, UtilFunctions.parseToDouble(_replacementList[idx]), totalRecordCount);
                // Then the (possibly adjusted) mean is folded into the variance object with totalMissingCount as the third argument.
                _varFn.execute(gcm, gmean._sum, totalMissingCount);
            }
            double mean = (gcount == 0 ? 0.0 : gmean._sum);
            double var = gcm.getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
            // -1.0 is written as the standard deviation when no variance record was received.
            double sdev = (mapVars.size() > 0 ? Math.sqrt(var) : -1.0);
            writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
        } catch (DMLRuntimeException e) {
            throw new IOException(e);
        }
    }
}
Also used : CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object) HashMap(java.util.HashMap) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) KahanObject(org.apache.sysml.runtime.instructions.cp.KahanObject) KahanPlus(org.apache.sysml.runtime.functionobjects.KahanPlus) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator)

Example 3 with CM_COV_Object

use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in project incubator-systemml by apache.

the class CentralMomentSPInstruction method processInstruction.

/**
 * Executes the central-moment instruction on Spark and stores the result
 * as a scalar output.
 *
 * When {@code input3} is absent the instruction is unweighted and the
 * 'order' scalar is read from {@code input2}; otherwise {@code input2}
 * names the weights and the 'order' scalar is read from {@code input3}.
 *
 * @param ec execution context; cast to SparkExecutionContext
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    final boolean weighted = (input3 != null);

    // Resolve the 'order' argument from whichever operand carries it.
    CPOperand orderOperand = weighted ? input3 : input2;
    ScalarObject order = ec.getScalarInput(orderOperand.getName(), orderOperand.getValueType(), orderOperand.isLiteral());

    // Fix the aggregate type from 'order' only if the operator has none yet.
    CMOperator cop = (CMOperator) _optr;
    if (cop.getAggOpType() == AggregateOperationTypes.INVALID) {
        cop.setCMAggOp((int) order.getLongValue());
    }

    JavaPairRDD<MatrixIndexes, MatrixBlock> data = sec.getBinaryBlockRDDHandleForVariable(input1.getName());

    // Map every block to a partial CM object, then fold the partials into one.
    CM_COV_Object cmobj;
    if (weighted) {
        JavaPairRDD<MatrixIndexes, MatrixBlock> weights = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
        cmobj = data.join(weights)
            .values()
            .map(new RDDCMWeightsFunction(cop))
            .fold(new CM_COV_Object(), new RDDCMReduceFunction(cop));
    } else {
        cmobj = data.values()
            .map(new RDDCMFunction(cop))
            .fold(new CM_COV_Object(), new RDDCMReduceFunction(cop));
    }

    // Scalar output (no lineage information required).
    double moment = cmobj.getRequiredResult(_optr);
    ec.setScalarOutput(output.getName(), new DoubleObject(moment));
}
Also used : CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DoubleObject(org.apache.sysml.runtime.instructions.cp.DoubleObject) CPOperand(org.apache.sysml.runtime.instructions.cp.CPOperand) ScalarObject(org.apache.sysml.runtime.instructions.cp.ScalarObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) CMOperator(org.apache.sysml.runtime.matrix.operators.CMOperator)

Example 4 with CM_COV_Object

use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in project incubator-systemml by apache.

the class COV method execute.

/**
 * Folds a single (u, v) observation into a running covariance state;
 * this is the special case where the incoming pair has weight w2==1.
 *
 * @param in1 running state; cast to CM_COV_Object and updated in place
 * @param u first component of the new observation
 * @param v second component of the new observation
 * @return the same object that was passed as {@code in1}
 */
@Override
public Data execute(Data in1, double u, double v) {
    CM_COV_Object state = (CM_COV_Object) in1;

    if (!state.isCOVAllZeros()) {
        // Incremental update: shift both means by delta/weight and add the co-moment increment.
        double newWeight = state.w + 1;
        double deltaU = u - state.mean._sum;
        double deltaV = v - state.mean_v._sum;
        state.mean = (KahanObject) _plus.execute(state.mean, deltaU / newWeight);
        state.mean_v = (KahanObject) _plus.execute(state.mean_v, deltaV / newWeight);
        state.c2 = (KahanObject) _plus.execute(state.c2, state.w / newWeight * deltaU * deltaV);
        state.w = newWeight;
        return state;
    }

    // Empty state: the observation becomes both means, with no co-moment yet.
    state.w = 1L;
    state.mean.set(u, 0);
    state.mean_v.set(v, 0);
    state.c2.set(0, 0);
    return state;
}
Also used : CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object)

Example 5 with CM_COV_Object

use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in project incubator-systemml by apache.

the class CM method execute.

/**
 * Folds a single value into a running central-moment state; this is the
 * special case for weights w2==1.
 *
 * Which fields are maintained depends on {@code _type}: COUNT only
 * increments the weight, MEAN also updates the mean, and CM2/CM3/CM4/VARIANCE
 * additionally update the moment accumulators m2, m3 and m4 as needed.
 *
 * @param in1 running state; cast to CM_COV_Object and updated in place
 * @param in2 the new value
 * @return the same object that was passed as {@code in1}
 * @throws DMLRuntimeException if {@code _type} is not one of the handled cases
 */
@Override
public Data execute(Data in1, double in2) {
    CM_COV_Object cm1 = (CM_COV_Object) in1;
    // Empty state: the value becomes the mean with weight 1, all moments zero.
    // This path is taken regardless of _type.
    if (cm1.isCMAllZeros()) {
        cm1.w = 1;
        cm1.mean.set(in2, 0);
        cm1.m2.set(0, 0);
        cm1.m3.set(0, 0);
        cm1.m4.set(0, 0);
        return cm1;
    }
    // In the cases below: w is the new weight, d the deviation of the value from
    // the current mean, and cm1.w still holds the old weight until the last statement.
    switch(_type) {
        case COUNT:
            {
                // Only the weight is tracked; mean and moments are left untouched.
                cm1.w = cm1.w + 1;
                break;
            }
        case MEAN:
            {
                double w = cm1.w + 1;
                double d = in2 - cm1.mean._sum;
                // mean += d / w
                cm1.mean = (KahanObject) _plus.execute(cm1.mean, d / w);
                cm1.w = w;
                break;
            }
        case CM2:
            {
                double w = cm1.w + 1;
                double d = in2 - cm1.mean._sum;
                cm1.mean = (KahanObject) _plus.execute(cm1.mean, d / w);
                // m2 += (old weight / new weight) * d * d, staged through _buff2.
                double t1 = cm1.w / w * d;
                double lt1 = t1 * d;
                _buff2.set(cm1.m2);
                _buff2 = (KahanObject) _plus.execute(_buff2, lt1);
                cm1.m2.set(_buff2);
                cm1.w = w;
                break;
            }
        case CM3:
            {
                double w = cm1.w + 1;
                double d = in2 - cm1.mean._sum;
                cm1.mean = (KahanObject) _plus.execute(cm1.mean, d / w);
                double t1 = cm1.w / w * d;
                double t2 = -1 / cm1.w;
                double lt1 = t1 * d;
                double lt2 = Math.pow(t1, 3) * (1.0 - Math.pow(t2, 2));
                double f2 = 1.0 / w;
                // The new m2 is staged in _buff2 and only stored into cm1.m2 below, so the
                // m3 increment reads cm1.m2._sum before this step's m2 increment is stored
                // (assuming KahanObject.set copies values rather than aliasing — TODO confirm).
                _buff2.set(cm1.m2);
                _buff2 = (KahanObject) _plus.execute(_buff2, lt1);
                _buff3.set(cm1.m3);
                _buff3 = (KahanObject) _plus.execute(_buff3, lt2 - 3 * cm1.m2._sum * f2 * d);
                cm1.m2.set(_buff2);
                cm1.m3.set(_buff3);
                cm1.w = w;
                break;
            }
        case CM4:
            {
                double w = cm1.w + 1;
                double d = in2 - cm1.mean._sum;
                cm1.mean = (KahanObject) _plus.execute(cm1.mean, d / w);
                double t1 = cm1.w / w * d;
                double t2 = -1 / cm1.w;
                double lt1 = t1 * d;
                double lt2 = Math.pow(t1, 3) * (1.0 - Math.pow(t2, 2));
                double lt3 = Math.pow(t1, 4) * (1.0 - Math.pow(t2, 3));
                double f2 = 1.0 / w;
                // As in CM3, m2 and m3 are staged in buffers; the m4 update is applied to
                // cm1.m4 directly and reads cm1.m2._sum and cm1.m3._sum before the staged
                // values are stored back.
                _buff2.set(cm1.m2);
                _buff2 = (KahanObject) _plus.execute(_buff2, lt1);
                _buff3.set(cm1.m3);
                _buff3 = (KahanObject) _plus.execute(_buff3, lt2 - 3 * cm1.m2._sum * f2 * d);
                cm1.m4 = (KahanObject) _plus.execute(cm1.m4, 6 * cm1.m2._sum * Math.pow(-f2 * d, 2) + lt3 - 4 * cm1.m3._sum * f2 * d);
                cm1.m2.set(_buff2);
                cm1.m3.set(_buff3);
                cm1.w = w;
                break;
            }
        case VARIANCE:
            {
                // Same increment as CM2, but applied to cm1.m2 directly without the buffer.
                double w = cm1.w + 1;
                double d = in2 - cm1.mean._sum;
                cm1.mean = (KahanObject) _plus.execute(cm1.mean, d / w);
                double t1 = cm1.w / w * d;
                double lt1 = t1 * d;
                cm1.m2 = (KahanObject) _plus.execute(cm1.m2, lt1);
                cm1.w = w;
                break;
            }
        default:
            throw new DMLRuntimeException("Unsupported operation type: " + _type);
    }
    return cm1;
}
Also used : CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object) KahanObject(org.apache.sysml.runtime.instructions.cp.KahanObject) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

CM_COV_Object (org.apache.sysml.runtime.instructions.cp.CM_COV_Object)31 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)19 KahanObject (org.apache.sysml.runtime.instructions.cp.KahanObject)15 CM (org.apache.sysml.runtime.functionobjects.CM)12 KahanPlus (org.apache.sysml.runtime.functionobjects.KahanPlus)7 CMOperator (org.apache.sysml.runtime.matrix.operators.CMOperator)7 Builtin (org.apache.sysml.runtime.functionobjects.Builtin)6 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)4 KahanPlusSq (org.apache.sysml.runtime.functionobjects.KahanPlusSq)4 Mean (org.apache.sysml.runtime.functionobjects.Mean)4 ReduceAll (org.apache.sysml.runtime.functionobjects.ReduceAll)4 ReduceCol (org.apache.sysml.runtime.functionobjects.ReduceCol)4 ReduceDiag (org.apache.sysml.runtime.functionobjects.ReduceDiag)4 ReduceRow (org.apache.sysml.runtime.functionobjects.ReduceRow)4 DoubleObject (org.apache.sysml.runtime.instructions.cp.DoubleObject)4 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)4 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)4 WeightedCell (org.apache.sysml.runtime.matrix.data.WeightedCell)4 AggregateOperator (org.apache.sysml.runtime.matrix.operators.AggregateOperator)4 COVOperator (org.apache.sysml.runtime.matrix.operators.COVOperator)4