Use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in the Apache incubator-systemml project.
Class MVImputeAgent, method prepVarOutput.
/**
 * Prepares the variance record emitted for one column by one task.
 *
 * A record is produced when {@code scnomv} is set, or when the column at
 * {@code idx} is flagged in {@code _isMVScaled} and its entry in
 * {@code _mvscMethodList} is {@code MVMethod.GLOBAL_MODE}. The record's word
 * has the form {@code VARIANCE_PREFIX + "_" + taskID + "_" + encodeCMObj(cm)},
 * where {@code cm} is taken from {@code _scnomvVarList} when {@code scnomv}
 * is set and from {@code _varList} otherwise.
 *
 * @param taskID id of the emitting task, embedded in the word
 * @param idx    index into the per-column lists
 * @param sb     scratch builder; it is cleared and overwritten when a record is produced
 * @param scnomv selects the {@code _scnomvVarList} entry and forces a record to be produced
 * @return the record, created with {@code -1L} as its second constructor argument,
 *         or {@code null} when no record applies
 * @throws CharacterCodingException declared by this method; presumably raised by
 *         {@code encodeCMObj} - TODO confirm
 */
private DistinctValue prepVarOutput(int taskID, int idx, StringBuilder sb, boolean scnomv) throws CharacterCodingException {
    // '&&' binds tighter than '||'; the parentheses only make that explicit
    final boolean emitVariance = scnomv
        || (_isMVScaled.get(idx) && _mvscMethodList[idx] == MVMethod.GLOBAL_MODE);
    if (!emitVariance) {
        return null;
    }

    // word layout: <VARIANCE_PREFIX>_<taskID>_<encoded CM object>
    sb.setLength(0);
    sb.append(VARIANCE_PREFIX).append("_").append(taskID).append("_");

    final CM_COV_Object varState;
    if (scnomv) {
        varState = _scnomvVarList[idx];
    } else {
        varState = _varList[idx];
    }
    sb.append(encodeCMObj(varState));

    return new DistinctValue(sb.toString(), -1L);
}
Use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in the Apache incubator-systemml project.
Class MVImputeAgent, method mergeAndOutputTransformationMetadata.
/**
* Merges the map-output transformation metadata of one column and writes the merged result.
*
* Every incoming word is dispatched on its prefix (mean, correction, constant, valid count,
* total count, variance, and the BinAgent min/max/nbins prefixes). Per-task means and
* variance objects are collected in maps keyed by task id, then combined into a global mean
* and a global variance object. Depending on which kinds of entries were seen, imputation,
* binning and/or scaling metadata is written through the writeTfMtd overloads.
*
* @param values    iterator over the map outputs to merge
* @param outputDir passed through to writeTfMtd
* @param colID     column id; passed to writeTfMtd and used to look up the column via isApplicable
* @param fs        passed through to writeTfMtd
* @param agents    passed through to writeTfMtd
* @throws IOException wraps any DMLRuntimeException raised while merging variances or
*                     computing the scaling metadata
* @throws RuntimeException if a word starts with none of the known prefixes
*/
@Override
public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
// running min/max start at the extremes so the first observed value replaces them
double min = Double.MAX_VALUE;
double max = -Double.MAX_VALUE;
int nbins = 0;
double d;
long totalRecordCount = 0, totalValidCount = 0;
String mvConstReplacement = null;
DistinctValue val = new DistinctValue();
String w = null;
// per-task partial mean: mean value, correction term and number of valid values
class MeanObject {
double mean, correction;
long count;
MeanObject() {
}
public String toString() {
return mean + "," + correction + "," + count;
}
}
;
// partial results keyed by task id
HashMap<Integer, MeanObject> mapMeans = new HashMap<Integer, MeanObject>();
HashMap<Integer, CM_COV_Object> mapVars = new HashMap<Integer, CM_COV_Object>();
// which kinds of metadata were encountered; they decide what gets written below
boolean isImputed = false;
boolean isScaled = false;
boolean isBinned = false;
while (values.hasNext()) {
// NOTE(review): reset() acts on the object held from the previous iteration, right before
// val is reassigned - confirm whether this call is still needed
val.reset();
val = values.next();
w = val.getWord();
if (w.startsWith(MEAN_PREFIX)) {
// word is split on "_": parts[1] is the task id, parts[2] is "<mean>,<flag>"
String[] parts = w.split("_");
int taskID = UtilFunctions.parseToInt(parts[1]);
MeanObject mo = mapMeans.get(taskID);
if (mo == null)
mo = new MeanObject();
mo.mean = UtilFunctions.parseToDouble(parts[2].split(",")[0]);
// check if this attribute is scaled
// "scmv" marks the column as scaled and imputed, "scnomv" as scaled only,
// any other flag as imputed only
String s = parts[2].split(",")[1];
if (s.equalsIgnoreCase("scmv"))
isScaled = isImputed = true;
else if (s.equalsIgnoreCase("scnomv"))
isScaled = true;
else
isImputed = true;
mapMeans.put(taskID, mo);
} else if (w.startsWith(CORRECTION_PREFIX)) {
// parts[1] is the task id, parts[2] the correction term of that task's mean
String[] parts = w.split("_");
int taskID = UtilFunctions.parseToInt(parts[1]);
MeanObject mo = mapMeans.get(taskID);
if (mo == null)
mo = new MeanObject();
mo.correction = UtilFunctions.parseToDouble(parts[2]);
mapMeans.put(taskID, mo);
} else if (w.startsWith(CONSTANT_PREFIX)) {
// constant replacement: only the token right after the prefix (parts[1]) is kept
isImputed = true;
String[] parts = w.split("_");
mvConstReplacement = parts[1];
} else if (w.startsWith(COUNT_PREFIX)) {
// per-task count of valid values; also accumulated into the overall valid count
String[] parts = w.split("_");
int taskID = UtilFunctions.parseToInt(parts[1]);
MeanObject mo = mapMeans.get(taskID);
if (mo == null)
mo = new MeanObject();
mo.count = UtilFunctions.parseToLong(parts[2]);
totalValidCount += mo.count;
mapMeans.put(taskID, mo);
} else if (w.startsWith(TOTAL_COUNT_PREFIX)) {
// per-task total record count; the task id is not needed, only the sum
String[] parts = w.split("_");
//int taskID = UtilFunctions.parseToInt(parts[1]);
totalRecordCount += UtilFunctions.parseToLong(parts[2]);
} else if (w.startsWith(VARIANCE_PREFIX)) {
// per-task variance object, decoded from parts[2]
isScaled = true;
String[] parts = w.split("_");
int taskID = UtilFunctions.parseToInt(parts[1]);
CM_COV_Object cm = decodeCMObj(parts[2]);
mapVars.put(taskID, cm);
} else if (w.startsWith(BinAgent.MIN_PREFIX)) {
// binning entries carry their value directly after the prefix (no "_" splitting)
isBinned = true;
d = UtilFunctions.parseToDouble(w.substring(BinAgent.MIN_PREFIX.length()));
if (d < min)
min = d;
} else if (w.startsWith(BinAgent.MAX_PREFIX)) {
isBinned = true;
d = UtilFunctions.parseToDouble(w.substring(BinAgent.MAX_PREFIX.length()));
if (d > max)
max = d;
} else if (w.startsWith(BinAgent.NBINS_PREFIX)) {
isBinned = true;
nbins = (int) UtilFunctions.parseToLong(w.substring(BinAgent.NBINS_PREFIX.length()));
} else
throw new RuntimeException("MVImputeAgent: Invalid prefix while merging map output: " + w);
}
// compute global mean across all map outputs
// incremental weighted update: each task shifts the running mean by
// (taskMean - runningMean) * taskCount / cumulativeCount, accumulated through KahanPlus
KahanObject gmean = new KahanObject(0, 0);
KahanPlus kp = KahanPlus.getKahanPlusFnObject();
long gcount = 0;
for (MeanObject mo : mapMeans.values()) {
gcount = gcount + mo.count;
// guard against division by zero while no valid value has been counted yet
if (gcount > 0) {
double delta = mo.mean - gmean._sum;
kp.execute2(gmean, delta * mo.count / gcount);
//_meanFn.execute2(gmean, mo.mean*mo.count, gcount);
}
}
// compute global variance across all map outputs
// the per-task objects are folded one by one into gcm via _varFn
CM_COV_Object gcm = new CM_COV_Object();
try {
for (CM_COV_Object cm : mapVars.values()) gcm = (CM_COV_Object) _varFn.execute(gcm, cm);
} catch (DMLRuntimeException e) {
throw new IOException(e);
}
// If the column is imputed with a constant, then adjust min and max based the value of the constant.
if (isImputed && isBinned && mvConstReplacement != null) {
double cst = UtilFunctions.parseToDouble(mvConstReplacement);
if (cst < min)
min = cst;
if (cst > max)
max = cst;
}
// write merged metadata
if (isImputed) {
// a constant replacement takes precedence; otherwise the global mean is used (0.0 if no valid values)
String imputedValue = null;
if (mvConstReplacement != null)
imputedValue = mvConstReplacement;
else
imputedValue = Double.toString(gcount == 0 ? 0.0 : gmean._sum);
writeTfMtd(colID, imputedValue, outputDir, fs, agents);
}
if (isBinned) {
// NOTE(review): nbins stays 0 if no NBINS entry was seen, in which case this
// floating-point division yields a non-finite bin width - confirm that cannot happen
double binwidth = (max - min) / nbins;
writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
}
if (isScaled) {
try {
if (totalValidCount != totalRecordCount) {
// In the presence of missing values, the variance needs to be adjusted.
// The mean does not need to be adjusted, when mv impute method is global_mean,
// since missing values themselves are replaced with gmean.
long totalMissingCount = (totalRecordCount - totalValidCount);
int idx = isApplicable(colID);
// for constant imputation the replacement value is additionally fed into the mean
if (idx != -1 && _mvMethodList[idx] == MVMethod.CONSTANT)
_meanFn.execute(gmean, UtilFunctions.parseToDouble(_replacementList[idx]), totalRecordCount);
// the current mean value is passed to the variance function together with the missing count
_varFn.execute(gcm, gmean._sum, totalMissingCount);
}
double mean = (gcount == 0 ? 0.0 : gmean._sum);
double var = gcm.getRequiredResult(new CMOperator(_varFn, AggregateOperationTypes.VARIANCE));
// -1.0 is written as standard deviation when no variance entry was received
double sdev = (mapVars.size() > 0 ? Math.sqrt(var) : -1.0);
writeTfMtd(colID, Double.toString(mean), Double.toString(sdev), outputDir, fs, agents);
} catch (DMLRuntimeException e) {
throw new IOException(e);
}
}
}
Use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in the Apache incubator-systemml project.
Class CentralMomentSPInstruction, method processInstruction.
/**
 * Executes the central-moment instruction on Spark.
 *
 * The 'order' scalar is read from {@code input3} when that operand is present
 * (the weighted form) and from {@code input2} otherwise. If the operator's
 * aggregation type is still {@code INVALID}, it is configured from that order.
 * Each block of {@code input1} (joined with the blocks of {@code input2} in
 * the weighted form) is mapped to a partial {@code CM_COV_Object}, and the
 * partials are folded into one object, whose required result becomes the
 * scalar output.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sparkCtx = (SparkExecutionContext) ec;

    // parse 'order' input argument: it is the last operand of the instruction
    final boolean weighted = (input3 != null);
    CPOperand orderOperand = weighted ? input3 : input2;
    ScalarObject order = ec.getScalarInput(orderOperand.getName(), orderOperand.getValueType(), orderOperand.isLiteral());

    // configure the operator from the order only if it has no valid type yet
    CMOperator cmOp = (CMOperator) _optr;
    if (cmOp.getAggOpType() == AggregateOperationTypes.INVALID) {
        cmOp.setCMAggOp((int) order.getLongValue());
    }

    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> data = sparkCtx.getBinaryBlockRDDHandleForVariable(input1.getName());

    // process central moment instruction
    CM_COV_Object merged;
    if (weighted) {
        // with weights: pair every data block with its weight block
        JavaPairRDD<MatrixIndexes, MatrixBlock> weights = sparkCtx.getBinaryBlockRDDHandleForVariable(input2.getName());
        merged = data.join(weights)
            .values()
            .map(new RDDCMWeightsFunction(cmOp))
            .fold(new CM_COV_Object(), new RDDCMReduceFunction(cmOp));
    } else {
        // w/o weights
        merged = data.values()
            .map(new RDDCMFunction(cmOp))
            .fold(new CM_COV_Object(), new RDDCMReduceFunction(cmOp));
    }

    // create scalar output (no lineage information required)
    double moment = merged.getRequiredResult(_optr);
    ec.setScalarOutput(output.getName(), new DoubleObject(moment));
}
Use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in the Apache incubator-systemml project.
Class COV, method execute.
/**
 * Folds one (u, v) observation into a running covariance state; this is the
 * special case for weights w2==1.
 *
 * If the state reports {@code isCOVAllZeros()}, it is initialised from the
 * observation: weight 1, means u and v, co-moment 0. Otherwise both means are
 * shifted by their deviation divided by the new weight, and the co-moment
 * {@code c2} grows by {@code wOld / wNew * du * dv}.
 *
 * @param in1 running state; cast to {@code CM_COV_Object} and updated in place
 * @param u   observation folded into {@code mean}
 * @param v   observation folded into {@code mean_v}
 * @return the updated object that was passed as {@code in1}
 */
@Override
public Data execute(Data in1, double u, double v) {
    CM_COV_Object acc = (CM_COV_Object) in1;

    if (!acc.isCOVAllZeros()) {
        final double wOld = acc.w;
        final double wNew = wOld + 1;

        // deviations are taken against the means before this observation
        final double devU = u - acc.mean._sum;
        final double devV = v - acc.mean_v._sum;

        acc.mean = (KahanObject) _plus.execute(acc.mean, devU / wNew);
        acc.mean_v = (KahanObject) _plus.execute(acc.mean_v, devV / wNew);
        acc.c2 = (KahanObject) _plus.execute(acc.c2, wOld / wNew * devU * devV);
        acc.w = wNew;
    } else {
        // first observation: it defines the means, and the co-moment starts at zero
        acc.w = 1L;
        acc.mean.set(u, 0);
        acc.mean_v.set(v, 0);
        acc.c2.set(0, 0);
    }
    return acc;
}
Use of org.apache.sysml.runtime.instructions.cp.CM_COV_Object in the Apache incubator-systemml project.
Class CM, method execute.
/**
 * Folds a single value into a running central-moment state; this is the
 * special case for weights w2==1.
 *
 * If the state reports {@code isCMAllZeros()}, it is initialised from the
 * value (weight 1, mean = value, m2 = m3 = m4 = 0) regardless of the
 * operation type. Otherwise only the statistics needed for {@code _type} are
 * advanced: COUNT touches the weight only, MEAN the mean, CM2 and VARIANCE
 * additionally m2, CM3 additionally m3, and CM4 additionally m4.
 *
 * @param in1 running state; cast to {@code CM_COV_Object} and updated in place
 * @param in2 value to fold in
 * @return the updated object that was passed as {@code in1}
 * @throws DMLRuntimeException if {@code _type} is not one of the handled operation types
 */
@Override
public Data execute(Data in1, double in2) {
    CM_COV_Object state = (CM_COV_Object) in1;

    // first value: it becomes the mean and all higher moments start at zero
    if (state.isCMAllZeros()) {
        state.w = 1;
        state.mean.set(in2, 0);
        state.m2.set(0, 0);
        state.m3.set(0, 0);
        state.m4.set(0, 0);
        return state;
    }

    // weight after this value; state.w keeps the old weight until each case has finished
    final double wNew = state.w + 1;

    switch (_type) {
        case COUNT: {
            state.w = wNew;
            break;
        }
        case MEAN: {
            foldIntoMean(state, in2, wNew);
            state.w = wNew;
            break;
        }
        case CM2: {
            final double delta = foldIntoMean(state, in2, wNew);
            final double scaled = state.w / wNew * delta;
            final double m2Inc = scaled * delta;
            // m2 is advanced through the scratch buffer and copied back
            _buff2.set(state.m2);
            _buff2 = (KahanObject) _plus.execute(_buff2, m2Inc);
            state.m2.set(_buff2);
            state.w = wNew;
            break;
        }
        case CM3: {
            final double delta = foldIntoMean(state, in2, wNew);
            final double scaled = state.w / wNew * delta;
            final double negInvWOld = -1 / state.w;
            final double m2Inc = scaled * delta;
            final double m3Lead = Math.pow(scaled, 3) * (1.0 - Math.pow(negInvWOld, 2));
            final double invWNew = 1.0 / wNew;
            // the m3 increment must see the old m2, so both go through buffers
            // and are written back only after all increments are computed
            _buff2.set(state.m2);
            _buff2 = (KahanObject) _plus.execute(_buff2, m2Inc);
            _buff3.set(state.m3);
            _buff3 = (KahanObject) _plus.execute(_buff3, m3Lead - 3 * state.m2._sum * invWNew * delta);
            state.m2.set(_buff2);
            state.m3.set(_buff3);
            state.w = wNew;
            break;
        }
        case CM4: {
            final double delta = foldIntoMean(state, in2, wNew);
            final double scaled = state.w / wNew * delta;
            final double negInvWOld = -1 / state.w;
            final double m2Inc = scaled * delta;
            final double m3Lead = Math.pow(scaled, 3) * (1.0 - Math.pow(negInvWOld, 2));
            final double m4Lead = Math.pow(scaled, 4) * (1.0 - Math.pow(negInvWOld, 3));
            final double invWNew = 1.0 / wNew;
            _buff2.set(state.m2);
            _buff2 = (KahanObject) _plus.execute(_buff2, m2Inc);
            _buff3.set(state.m3);
            _buff3 = (KahanObject) _plus.execute(_buff3, m3Lead - 3 * state.m2._sum * invWNew * delta);
            // the m4 increment reads the old m2 and m3, which are still in state at this point
            state.m4 = (KahanObject) _plus.execute(state.m4, 6 * state.m2._sum * Math.pow(-invWNew * delta, 2) + m4Lead - 4 * state.m3._sum * invWNew * delta);
            state.m2.set(_buff2);
            state.m3.set(_buff3);
            state.w = wNew;
            break;
        }
        case VARIANCE: {
            final double delta = foldIntoMean(state, in2, wNew);
            final double scaled = state.w / wNew * delta;
            final double m2Inc = scaled * delta;
            state.m2 = (KahanObject) _plus.execute(state.m2, m2Inc);
            state.w = wNew;
            break;
        }
        default:
            throw new DMLRuntimeException("Unsupported operation type: " + _type);
    }
    return state;
}

/**
 * Shifts the running mean of {@code state} towards {@code value} by
 * {@code (value - mean) / wNew}.
 *
 * @return the deviation of {@code value} from the mean before the shift
 */
private double foldIntoMean(CM_COV_Object state, double value, double wNew) {
    final double delta = value - state.mean._sum;
    state.mean = (KahanObject) _plus.execute(state.mean, delta / wNew);
    return delta;
}
Aggregations