Use of water.fvec.Chunk in project h2o-2 by h2oai.
The class MRUtils, method div.
public static Frame div(Frame fr, final double d) {
  Frame r = new MRTask2() {
    @Override
    public void map(Chunk[] cs, NewChunk[] ncs) {
      for (int i = 0; i < ncs.length; i++) {
        NewChunk nc = ncs[i];
        Chunk c = cs[i];
        for (int r = 0; r < c._len; r++)
          if (d != 0)
            nc.addNum(c.at0(r) * d);
          else
            nc.addNA();
      }
    }
  }.doAll(fr.numCols(), fr).outputFrame(fr.names(), fr.domains());
  return r;
}
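A minimal caller sketch for the helper above (hypothetical wrapper and names; it relies only on the MRUtils.div signature shown, which as written multiplies every cell by d and emits NAs when d is 0):

// Hypothetical caller of the helper above: scale every cell of a Frame by a
// constant factor. Per the implementation shown, each cell is multiplied by
// the factor, and a factor of 0 yields NAs.
static Frame scaleBy(Frame fr, double factor) {
  return MRUtils.div(fr, factor);
}

The returned Frame keeps the input's column names and domains, as set by the outputFrame call above.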
Use of water.fvec.Chunk in project h2o-3 by h2oai.
The class GBMTest, method highCardinality.
public void highCardinality(int nbins_cats) {
  GBMModel gbm = null;
  GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
  Frame train = null, test = null, train_preds = null, test_preds = null;
  Scope.enter();
  try {
    {
      CreateFrame cf = new CreateFrame();
      cf.rows = 10000;
      cf.cols = 10;
      cf.integer_range = 1000;
      cf.categorical_fraction = 1.0;
      cf.integer_fraction = 0.0;
      cf.binary_fraction = 0.0;
      cf.time_fraction = 0.0;
      cf.string_fraction = 0.0;
      cf.binary_ones_fraction = 0.0;
      cf.missing_fraction = 0.2;
      cf.factors = 3000;
      cf.response_factors = 2;
      cf.positive_response = false;
      cf.has_response = true;
      cf.seed = 1235;
      cf.seed_for_column_types = 1234;
      train = cf.execImpl().get();
    }
    {
      CreateFrame cf = new CreateFrame();
      cf.rows = 10000;
      cf.cols = 10;
      cf.integer_range = 1000;
      cf.categorical_fraction = 1.0;
      cf.integer_fraction = 0.0;
      cf.binary_fraction = 0.0;
      cf.time_fraction = 0.0;
      cf.string_fraction = 0.0;
      cf.binary_ones_fraction = 0.0;
      cf.missing_fraction = 0.2;
      cf.factors = 5000;
      cf.response_factors = 2;
      cf.positive_response = false;
      cf.has_response = true;
      cf.seed = 5321;
      cf.seed_for_column_types = 1234;
      test = cf.execImpl().get();
    }
    parms._train = train._key;
    // Train on the outcome
    parms._response_column = "response";
    // allow it to overfit
    parms._max_depth = 20;
    parms._min_rows = 1;
    parms._ntrees = 1;
    parms._nbins_cats = nbins_cats;
    parms._seed = 0x2834234;
    GBM job = new GBM(parms);
    gbm = job.trainModel().get();
    train_preds = gbm.score(train);
    test_preds = gbm.score(test);
    new MRTask() {
      public void map(Chunk c) {
        for (int i = 0; i < c._len; ++i)
          if (c.isNA(i))
            c.set(i, 0.5);
      }
    }.doAll(train.vec("response"));
    new MRTask() {
      public void map(Chunk c) {
        for (int i = 0; i < c._len; ++i)
          if (c.isNA(i))
            c.set(i, 0.5);
      }
    }.doAll(test.vec("response"));
    Log.info("Train AUC: " + ModelMetricsBinomial.make(train_preds.vec(2), train.vec("response")).auc());
    Log.info("Test AUC: " + ModelMetricsBinomial.make(test_preds.vec(2), test.vec("response")).auc());
    // Build a POJO, validate same results
    Assert.assertTrue(gbm.testJavaScoring(train, train_preds, 1e-15));
    Key old = gbm._key;
    gbm._key = Key.make(gbm._key + "ha");
    Assert.assertTrue(gbm.testJavaScoring(test, test_preds, 1e-15));
    DKV.remove(old);
  } finally {
    if (gbm != null)
      gbm.delete();
    if (train != null)
      train.remove();
    if (test != null)
      test.remove();
    if (train_preds != null)
      train_preds.remove();
    if (test_preds != null)
      test_preds.remove();
    Scope.exit();
  }
}
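The same in-place NA replacement appears twice in the test above; a small helper in the same style could factor it out. This is only a sketch (hypothetical name), using the MRTask, Chunk, and Vec calls already present in the test:

// Hypothetical helper: overwrite every NA in a numeric Vec with a constant,
// exactly as the two anonymous MRTasks above do for the "response" column.
static void fillNAs(Vec v, final double fill) {
  new MRTask() {
    public void map(Chunk c) {
      for (int i = 0; i < c._len; ++i)
        if (c.isNA(i))
          c.set(i, fill);
    }
  }.doAll(v);
}
// e.g. fillNAs(train.vec("response"), 0.5); fillNAs(test.vec("response"), 0.5);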
Use of water.fvec.Chunk in project h2o-3 by h2oai.
The class AstImpute, method apply.
// (h2o.impute data col method combine_method groupby groupByFrame values)
@Override
public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
  // Argument parsing and sanity checking
  // Whole frame being imputed
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  // Column within frame being imputed
  final int col = (int) asts[2].exec(env).getNum();
  if (col >= fr.numCols())
    throw new IllegalArgumentException("Column not -1 or in range 0 to " + fr.numCols());
  final boolean doAllVecs = col == -1;
  final Vec vec = doAllVecs ? null : fr.vec(col);
  // Technique used for imputation
  AstRoot method = null;
  boolean ffill0 = false, bfill0 = false;
  switch (asts[3].exec(env).getStr().toUpperCase()) {
    case "MEAN":
      method = new AstMean();
      break;
    case "MEDIAN":
      method = new AstMedian();
      break;
    case "MODE":
      method = new AstMode();
      break;
    case "FFILL":
      ffill0 = true;
      break;
    case "BFILL":
      bfill0 = true;
      break;
    default:
      throw new IllegalArgumentException("Method must be one of mean, median or mode");
  }
  // Only for median, how is the median computed on even sample sizes?
  QuantileModel.CombineMethod combine = QuantileModel.CombineMethod.valueOf(asts[4].exec(env).getStr().toUpperCase());
  // Group-by columns. Empty is allowed, and perfectly normal.
  AstRoot ast = asts[5];
  AstNumList by2;
  if (ast instanceof AstNumList)
    by2 = (AstNumList) ast;
  else if (ast instanceof AstNum)
    by2 = new AstNumList(((AstNum) ast).getNum());
  else if (ast instanceof AstStrList) {
    String[] names = ((AstStrList) ast)._strs;
    double[] list = new double[names.length];
    int i = 0;
    for (String name : ((AstStrList) ast)._strs) list[i++] = fr.find(name);
    Arrays.sort(list);
    by2 = new AstNumList(list);
  } else
    throw new IllegalArgumentException("Requires a number-list, but found a " + ast.getClass());
  Frame groupByFrame = asts[6].str().equals("_") ? null : stk.track(asts[6].exec(env)).getFrame();
  AstRoot vals = asts[7];
  AstNumList values;
  if (vals instanceof AstNumList)
    values = (AstNumList) vals;
  else if (vals instanceof AstNum)
    values = new AstNumList(((AstNum) vals).getNum());
  else
    values = null;
  boolean doGrpBy = !by2.isEmpty() || groupByFrame != null;
  // Compute the imputed value per-group. Empty groups are allowed and OK.
  IcedHashMap<AstGroup.G, Freezable[]> group_impute_map;
  if (!doGrpBy) {
    // Skip the grouping work
    if (ffill0 || bfill0) {
      // do a forward/backward fill on the NA
      // TODO: requires chk.previousNonNA and chk.nextNonNA style methods (which may go across chk boundaries)
      final boolean ffill = ffill0;
      final boolean bfill = bfill0;
      throw H2O.unimpl("No ffill or bfill imputation supported");
      // new MRTask() {
      //   @Override public void map(Chunk[] cs) {
      //     int len = cs[0]._len;        // end of this chk
      //     long start = cs[0].start();  // absolute beginning of chk s.t. start-1 bleeds into previous chk
      //     long absEnd = start + len;   // absolute end of the chk s.t. absEnd+1 bleeds into next chk
      //     for (int c = 0; c < cs.length; ++c)
      //       for (int r = 0; r < cs[0]._len; ++r) {
      //         if (cs[c].isNA(r)) {
      //           if (r > 0 && r < len - 1) {
      //             cs[c].set(r, ffill?)
      //           }
      //         }
      //       }
      //   }
      // }.doAll(doAllVecs ? fr : new Frame(vec));
      // return new ValNum(Double.NaN);
    } else {
      final double[] res = values == null ? new double[fr.numCols()] : values.expand();
      if (values == null) {
        // fill up res if no values were supplied by the user; common case
        if (doAllVecs) {
          for (int i = 0; i < res.length; ++i)
            if (fr.vec(i).isNumeric() || fr.vec(i).isCategorical())
              res[i] = fr.vec(i).isNumeric() ? fr.vec(i).mean() : ArrayUtils.maxIndex(fr.vec(i).bins());
        } else {
          Arrays.fill(res, Double.NaN);
          if (method instanceof AstMean)
            res[col] = vec.mean();
          if (method instanceof AstMedian)
            res[col] = AstMedian.median(new Frame(vec), combine);
          if (method instanceof AstMode)
            res[col] = AstMode.mode(vec);
        }
      }
      new MRTask() {
        @Override
        public void map(Chunk[] cs) {
          int len = cs[0]._len;
          // run down each chk
          for (int c = 0; c < cs.length; ++c)
            if (!Double.isNaN(res[c]))
              for (int row = 0; row < len; ++row)
                if (cs[c].isNA(row))
                  cs[c].set(row, res[c]);
        }
      }.doAll(fr);
      return new ValNums(res);
    }
  } else {
    if (col >= fr.numCols())
      throw new IllegalArgumentException("Column not -1 or in range 0 to " + fr.numCols());
    Frame imputes = groupByFrame;
    if (imputes == null) {
      // Build and run a GroupBy command
      AstGroup ast_grp = new AstGroup();
      // simple case where user specified a column... col == -1 means do all columns
      if (doAllVecs) {
        AstRoot[] aggs = new AstRoot[(int) (3 + 3 * (fr.numCols() - by2.cnt()))];
        aggs[0] = ast_grp;
        aggs[1] = new AstFrame(fr);
        aggs[2] = by2;
        int c = 3;
        for (int i = 0; i < fr.numCols(); ++i) {
          if (!by2.has(i) && (fr.vec(i).isCategorical() || fr.vec(i).isNumeric())) {
            aggs[c] = fr.vec(i).isNumeric() ? new AstMean() : new AstMode();
            aggs[c + 1] = new AstNumList(i, i + 1);
            aggs[c + 2] = new AstStr("rm");
            c += 3;
          }
        }
        imputes = ast_grp.apply(env, stk, aggs).getFrame();
      } else
        imputes = ast_grp.apply(env, stk, new AstRoot[] { ast_grp, new AstFrame(fr), by2,
            method, new AstNumList(col, col + 1), new AstStr("rm") }).getFrame();
    }
    // >2 makes it ambiguous which columns are groupby cols and which are aggs, throw IAE
    if (by2.isEmpty() && imputes.numCols() > 2)
      throw new IllegalArgumentException("Ambiguous group-by frame. Supply the `by` columns to proceed.");
    final int[] bycols0 = ArrayUtils.seq(0, Math.max((int) by2.cnt(), 1));
    group_impute_map = new Gather(by2.expand4(), bycols0, fr.numCols(), col).doAll(imputes)._group_impute_map;
    // Now walk over the data, replace NAs with the imputed results
    final IcedHashMap<AstGroup.G, Freezable[]> final_group_impute_map = group_impute_map;
    if (by2.isEmpty()) {
      int[] byCols = new int[imputes.numCols() - 1];
      for (int i = 0; i < byCols.length; ++i) byCols[i] = fr.find(imputes.name(i));
      by2 = new AstNumList(byCols);
    }
    final int[] bycols = by2.expand4();
    new MRTask() {
      @Override
      public void map(Chunk[] cs) {
        Set<Integer> _bycolz = new HashSet<>();
        for (int b : bycols) _bycolz.add(b);
        AstGroup.G g = new AstGroup.G(bycols.length, null);
        for (int row = 0; row < cs[0]._len; row++)
          for (int c = 0; c < cs.length; ++c)
            if (!_bycolz.contains(c))
              if (cs[c].isNA(row))
                cs[c].set(row, ((IcedDouble) final_group_impute_map.get(g.fill(row, cs, bycols))[c])._val);
      }
    }.doAll(fr);
    return new ValFrame(imputes);
  }
}
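Stripped to its core, the non-grouped branch above is a single writer MRTask over the frame. A distilled sketch of that fill step (hypothetical helper name; the per-column replacement values are assumed to be precomputed, e.g. column means or medians, with NaN meaning "leave this column alone"):

// Illustrative condensation of the non-grouped imputation above: replace the
// NAs of each column in `fr` with a precomputed per-column value.
static void imputeInPlace(Frame fr, final double[] fill) {
  new MRTask() {
    @Override
    public void map(Chunk[] cs) {
      for (int c = 0; c < cs.length; ++c) {
        if (Double.isNaN(fill[c]))
          continue;                      // column not selected for imputation
        for (int row = 0; row < cs[c]._len; ++row)
          if (cs[c].isNA(row))
            cs[c].set(row, fill[c]);
      }
    }
  }.doAll(fr);
}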
Use of water.fvec.Chunk in project h2o-3 by h2oai.
The class AstTable, method fast_table.
// -------------------------------------------------------------------------
// Fast-path for 1 integer column
private ValFrame fast_table(Vec v1, int ncols, String colname) {
  if (ncols != 1 || !v1.isInt())
    return null;
  long spanl = (long) v1.max() - (long) v1.min() + 1;
  // Cap at decent array size, for performance
  if (spanl > 1000000)
    return null;
  // First fast-pass counting
  AstTable.FastCnt fastCnt = new AstTable.FastCnt((long) v1.min(), (int) spanl).doAll(v1);
  final long[] cnts = fastCnt._cnts;
  final long minVal = fastCnt._min;
  // Second pass to build the result frame, skipping zeros
  Vec dataLayoutVec = Vec.makeCon(0, cnts.length);
  Frame fr = new MRTask() {
    @Override
    public void map(Chunk[] cs, NewChunk nc0, NewChunk nc1) {
      final Chunk c = cs[0];
      for (int i = 0; i < c._len; ++i) {
        int idx = (int) (i + c.start());
        if (cnts[idx] > 0) {
          nc0.addNum(idx + minVal);
          nc1.addNum(cnts[idx]);
        }
      }
    }
  }.doAll(new byte[] { Vec.T_NUM, Vec.T_NUM }, dataLayoutVec).outputFrame(new String[] { colname, "Count" }, new String[][] { v1.domain(), null });
  dataLayoutVec.remove();
  return new ValFrame(fr);
}
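The snippet relies on FastCnt for the first pass but does not show it. For orientation, a counting pass over a single integer Vec can be written as an MRTask like the sketch below; this is an illustration against the Chunk API used above, not the h2o-3 FastCnt source (in particular, NA handling here is an assumption):

// Illustrative first-pass counter: histogram the values of an integer Vec
// into a long[] of size `span`, offset by `min`, merging partial counts in
// reduce(). NAs are skipped in this sketch.
static class CountPass extends MRTask<CountPass> {
  final long _min;
  final int _span;
  long[] _cnts;                       // output counts, one slot per value
  CountPass(long min, int span) { _min = min; _span = span; }
  @Override
  public void map(Chunk c) {
    _cnts = new long[_span];
    for (int i = 0; i < c._len; ++i)
      if (!c.isNA(i))
        _cnts[(int) (c.at8(i) - _min)]++;
  }
  @Override
  public void reduce(CountPass other) {
    if (other._cnts == null) return;
    if (_cnts == null) { _cnts = other._cnts; return; }
    for (int i = 0; i < _cnts.length; ++i)
      _cnts[i] += other._cnts[i];
  }
}

A counts array laid out this way matches how the second pass above reads cnts[idx] and reports the value as idx + minVal.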
Use of water.fvec.Chunk in project h2o-3 by h2oai.
The class DataInfoTestAdapt, method checkFrame.
private void checkFrame(final DataInfo di, final Frame gold, final boolean skipMissing) {
  try {
    Vec[] vecs = new Vec[di._adaptedFrame.numCols() + gold.numCols()];
    System.arraycopy(di._adaptedFrame.vecs(), 0, vecs, 0, di._adaptedFrame.numCols());
    System.arraycopy(gold.vecs(), 0, vecs, di._adaptedFrame.numCols(), gold.numCols());
    new MRTask() {
      @Override
      public void map(Chunk[] cs) {
        int off = di._adaptedFrame.numCols();
        DataInfo.Row r = di.newDenseRow();
        // DataInfo.Row rows[] = di.extractSparseRows(cs);
        for (int i = 0; i < cs[0]._len; ++i) {
          // DataInfo.Row r = rows[i];
          di.extractDenseRow(cs, i, r);
          if (skipMissing && r.isBad())
            continue;
          for (int j = 0; j < di.fullN(); ++j) {
            double goldValue = cs[off + j].atd(i);
            // - (di._normSub[j - di.numStart()] * di._normMul[j - di.numStart()]);
            double thisValue = r.get(j);
            double diff = Math.abs(goldValue - thisValue);
            if (diff > 1e-12) {
              if (!skipMissing && diff < 10)
                System.out.println("row mismatch: " + i + " column= " + j + "; diff= " + diff + " but not skipping missing, so due to discrepancies in taking mean on split frames");
              else
                throw new RuntimeException("bonk");
            }
          }
        }
      }
    }.doAll(vecs);
  } finally {
    di.dropInteractions();
    di.remove();
  }
}
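The commented-out lines in the test hint at a sparse-row variant of the same check. A sketch of how that comparison might look, based only on the extractSparseRows call and the Row accessors already visible above (treat the exact semantics of sparse rows as an assumption; the helper name is hypothetical):

// Hypothetical sparse-row variant of the comparison in map() above; assumes
// di.extractSparseRows(cs) yields one DataInfo.Row per chunk row, as the
// commented-out code in the original test suggests.
static void checkSparse(DataInfo di, Chunk[] cs, int off, boolean skipMissing) {
  DataInfo.Row[] rows = di.extractSparseRows(cs);
  for (int i = 0; i < cs[0]._len; ++i) {
    DataInfo.Row r = rows[i];
    if (skipMissing && r.isBad())
      continue;
    for (int j = 0; j < di.fullN(); ++j)
      if (Math.abs(cs[off + j].atd(i) - r.get(j)) > 1e-12)
        throw new RuntimeException("row " + i + ", column " + j + " mismatch");
  }
}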