use of water.fvec.Chunk in project h2o-3 by h2oai.
the class ShuffleSplitFrameTest method testShuffleSplitWithMultipleColumns.
/**
 * This test makes sure that the rows of the split frames are preserved (including UUID).
 *
 * <p>A four-column frame (string, number, in-chunk row index, UUID) is derived from a
 * single string column, and the checking task passed to {@code testScenario} verifies
 * that the derived columns of every row still agree with the row's string value.
 */
@Test
public void testShuffleSplitWithMultipleColumns() {
  final long[] layout = ar(2L, 2L, 3L);
  final String[][] rows = ar(ar("1", "2"), ar(null, "3"), ar("4", "5", "6"));
  final Frame source = createFrame("test1.hex", layout, rows);
  Frame expanded = null;
  try {
    // Expand the single string column into [str, num, in-chunk index, uuid(index, num)].
    expanded = new MRTask() {
      @Override
      public void map(Chunk[] in, NewChunk[] out) {
        final int len = in[0]._len;
        for (int r = 0; r < len; r++) {
          final BufferedString str = in[0].atStr(new BufferedString(), r);
          final int parsed;
          if (str == null) {
            parsed = 0; // a missing string is encoded as 0 in the numeric columns
          } else {
            parsed = Integer.parseInt(str.toString());
          }
          out[0].addStr(str);
          out[1].addNum(parsed);
          out[2].addNum(r);
          out[3].addUUID(r, parsed);
        }
      }
    }.doAll(new byte[] { Vec.T_STR, Vec.T_NUM, Vec.T_NUM, Vec.T_UUID }, source).outputFrame();
  } finally {
    // The single-column source frame is only needed to build the expanded frame.
    source.delete();
  }
  final MRTask rowChecker = new MRTask() {
    @Override
    public void map(Chunk[] cols) {
      final int len = cols[0]._len;
      for (int r = 0; r < len; r++) {
        final BufferedString str = cols[0].atStr(new BufferedString(), r);
        final int wantVal;
        if (str == null) {
          wantVal = 0;
        } else {
          wantVal = Integer.parseInt(str.toString());
        }
        final int wantIndex = (int) cols[2].atd(r);
        // numeric column must match the parsed string
        Assert.assertEquals((double) wantVal, cols[1].atd(r), 0.00001);
        // UUID low word carries the original index, high word the parsed value
        Assert.assertEquals(wantIndex, (int) cols[3].at16l(r));
        Assert.assertEquals(wantVal, (int) cols[3].at16h(r));
      }
    }
  };
  testScenario(expanded, flat(rows), rowChecker);
}
use of water.fvec.Chunk in project h2o-3 by h2oai.
the class AstUniOp method exec.
/**
 * Applies this unary operator to the argument at {@code args[1]}.
 *
 * <ul>
 *   <li>Number: returns a {@code ValNum} holding {@code op(value)}.</li>
 *   <li>Frame: every column must be numeric; returns a new frame with {@code op}
 *       applied element-wise and each column renamed to {@code str() + "(" + name + ")"}.</li>
 *   <li>Row: returns a new {@code ValRow} with {@code op} applied to each entry and a
 *       copy of the original names.</li>
 * </ul>
 *
 * @param args call arguments; the operand is read from index 1
 * @return the transformed value, of the same kind as the operand
 * @throws IllegalArgumentException if a frame operand contains a non-numeric column
 */
@Override
public Val exec(Val... args) {
  final Val operand = args[1];
  switch (operand.type()) {
    case Val.NUM: {
      return new ValNum(op(operand.getNum()));
    }
    case Val.FRM: {
      final Frame frame = operand.getFrame();
      final int ncols = frame.numCols();
      // Reject the whole frame as soon as one column is not numeric.
      for (int c = 0; c < ncols; c++) {
        if (!frame.vec(c).isNumeric()) {
          throw new IllegalArgumentException("Operator " + str() + "() cannot be applied to non-numeric column " + frame.name(c));
        }
      }
      // Output columns are named `op(colName)`. For example, a column named "income"
      // that had a log transformation becomes `log(income)`.
      final String[] renamed = new String[ncols];
      for (int c = 0; c < renamed.length; c++) {
        renamed[c] = str() + "(" + frame.name(c) + ")";
      }
      final Frame result = new MRTask() {
        @Override
        public void map(Chunk[] in, NewChunk[] out) {
          for (int c = 0; c < in.length; c++) {
            final Chunk src = in[c];
            final NewChunk dst = out[c];
            for (int r = 0; r < src._len; r++) {
              dst.addNum(op(src.atd(r)));
            }
          }
        }
      }.doAll(ncols, Vec.T_NUM, frame).outputFrame(renamed, null);
      return new ValFrame(result);
    }
    case Val.ROW: {
      final double[] input = operand.getRow();
      final double[] output = new double[input.length];
      for (int k = 0; k < output.length; ++k) {
        output[k] = op(input[k]);
      }
      final String[] rowNames = ((ValRow) operand).getNames().clone();
      return new ValRow(output, rowNames);
    }
    default:
      throw H2O.unimpl("unop unimpl: " + operand.getClass());
  }
}
use of water.fvec.Chunk in project h2o-3 by h2oai.
the class AstHist method apply.
/**
 * Builds a histogram frame for a single, non-constant numeric column.
 *
 * <p>{@code asts[1]} evaluates to the input frame and {@code asts[2]} selects the breaks:
 * <ul>
 *   <li>a string names a bin-count rule ({@code "sturges"}, {@code "rice"}, {@code "sqrt"},
 *       {@code "doane"}, {@code "scott"}, {@code "fd"}; anything else falls back to sturges);</li>
 *   <li>a number list supplies the break points explicitly;</li>
 *   <li>a number supplies the number of breaks.</li>
 * </ul>
 *
 * <p>The result has four numeric columns, {@code breaks}, {@code counts}, {@code mids_true}
 * and {@code mids}, with one row per break. The first row carries only the break value; the
 * remaining three columns are NA there.
 *
 * @param env  evaluation environment
 * @param stk  stack helper used to track the evaluated input frame
 * @param asts call arguments: {@code asts[1]} is the frame, {@code asts[2]} the breaks spec
 * @return the histogram frame wrapped in a {@code ValFrame}
 * @throws IllegalArgumentException if the input is not exactly one non-constant numeric column
 */
@Override
public ValFrame apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
  // stack is [ ..., ary, breaks]
  Frame f = stk.track(asts[1].exec(env)).getFrame();
  if (f.numCols() != 1)
    throw new IllegalArgumentException("Hist only applies to single numeric columns.");
  Vec vec = f.anyVec();
  if (!vec.isNumeric())
    throw new IllegalArgumentException("Hist only applies to single numeric columns.");
  //TODO Add case when vec is a constant numeric
  if (vec.isConst())
    throw new IllegalArgumentException("Hist does not apply to constant numeric columns.");

  // handle the breaks
  AstRoot a = asts[2];
  String algo = null;
  int numBreaks = -1;
  double[] breaks = null;
  if (a instanceof AstStr)
    // Locale.ROOT keeps rule-name matching independent of the JVM default locale
    // (e.g. "RICE" lower-cases to a dotless-i form under a Turkish locale).
    algo = a.str().toLowerCase(java.util.Locale.ROOT);
  else if (a instanceof AstNumList)
    breaks = ((AstNumList) a).expand();
  else if (a instanceof AstNum)
    numBreaks = (int) a.exec(env).getNum();

  AstHist.HistTask t;
  double h; // bin width
  double x1 = vec.max();
  double x0 = vec.min();
  if (breaks != null) {
    // explicit breaks: bin width and origin are passed as -1
    t = new AstHist.HistTask(breaks, -1, -1).doAll(vec);
  } else if (algo != null) {
    switch (algo) {
      case "sturges":
        numBreaks = sturges(vec);
        h = (x1 - x0) / numBreaks;
        break;
      case "rice":
        numBreaks = rice(vec);
        h = (x1 - x0) / numBreaks;
        break;
      case "sqrt":
        numBreaks = sqrt(vec);
        h = (x1 - x0) / numBreaks;
        break;
      case "doane":
        numBreaks = doane(vec);
        h = (x1 - x0) / numBreaks;
        break;
      case "scott":
        // special bin width computation
        h = scotts_h(vec);
        numBreaks = scott(vec, h);
        break;
      case "fd":
        // special bin width computation
        h = fds_h(vec);
        numBreaks = fd(vec, h);
        break;
      default:
        // just do sturges even if junk passed in
        numBreaks = sturges(vec);
        h = (x1 - x0) / numBreaks;
    }
    t = new AstHist.HistTask(computeCuts(vec, numBreaks), h, x0).doAll(vec);
  } else {
    // numeric break count (or -1 if asts[2] was none of the recognised kinds)
    h = (x1 - x0) / numBreaks;
    t = new AstHist.HistTask(computeCuts(vec, numBreaks), h, x0).doAll(vec);
  }

  // wanna make a new frame here [breaks,counts,mids_true,mids]
  final double[] brks = t._breaks;
  final long[] cnts = t._counts;
  final double[] mids_true = t._mids;
  // mids[i-1] is the arithmetic midpoint of the bin between breaks i-1 and i
  final double[] mids = new double[t._breaks.length - 1];
  for (int i = 1; i < brks.length; ++i)
    mids[i - 1] = .5 * (t._breaks[i - 1] + t._breaks[i]);

  // Temporary vec that only provides the row layout (one row per break) for the task below.
  Vec layoutVec = Vec.makeZero(brks.length);
  Frame fr2;
  try {
    fr2 = new MRTask() {
      @Override
      public void map(Chunk[] c, NewChunk[] nc) {
        int start = (int) c[0].start();
        for (int i = 0; i < c[0]._len; ++i) {
          nc[0].addNum(brks[i + start]);
          if (i == 0) {
            // first row of a chunk has no bin attached to it
            nc[1].addNA();
            nc[2].addNA();
            nc[3].addNA();
          } else {
            nc[1].addNum(cnts[(i - 1) + start]);
            nc[2].addNum(mids_true[(i - 1) + start]);
            nc[3].addNum(mids[(i - 1) + start]);
          }
        }
      }
    }.doAll(4, Vec.T_NUM, new Frame(layoutVec)).outputFrame(null, new String[] { "breaks", "counts", "mids_true", "mids" }, null);
  } finally {
    // Always release the layout vec, even when the task fails.
    layoutVec.remove();
  }
  return new ValFrame(fr2);
}
use of water.fvec.Chunk in project h2o-3 by h2oai.
the class AstKFold method stratifiedKFoldColumn.
/**
 * Creates a fold-assignment column for stratified k-fold cross validation.
 *
 * <p>Each class of the response gets its own seed (derived from {@code seed}), and every row
 * is assigned the fold obtained from an RNG seeded with its absolute row number plus the seed
 * of its class. Rows with a missing response are assigned {@code absoluteRow % nfolds}.
 *
 * @param y      response column; must be categorical or integer-valued numeric
 * @param nfolds number of folds
 * @param seed   base seed from which the per-class seeds are derived
 * @return a new Vec aligned with {@code y} holding the fold id of every row
 * @throws IllegalArgumentException if {@code y} is neither categorical nor integer numeric
 */
public static Vec stratifiedKFoldColumn(Vec y, final int nfolds, final long seed) {
  if (!(y.isCategorical() || (y.isNumeric() && y.isInt())))
    throw new IllegalArgumentException("stratification only applies to integer and categorical columns. Got: " + y.get_type_str());
  final long[] classes = new VecUtils.CollectDomain().doAll(y).domain();
  // NOTE(review): for categorical columns nClass comes from y.domain() while `classes` comes
  // from CollectDomain; confirm `classes` always has at least nClass entries.
  final int nClass = y.isNumeric() ? classes.length : y.domain().length;
  // seed for each regular fold column (one per class), to be used by the map call
  final long[] seeds = new long[nClass];
  for (int i = 0; i < nClass; ++i)
    seeds[i] = getRNG(seed + i).nextLong();
  return new MRTask() {
    /** Fold id in [0, nfolds) for a row, drawn from an RNG seeded with row + class seed. */
    private int getFoldId(long absoluteRow, long seed) {
      // Take the remainder BEFORE the absolute value: Math.abs(Integer.MIN_VALUE) is still
      // negative, which would yield a negative fold id that never matches any test fold.
      // For every other draw this is identical to Math.abs(x) % nfolds.
      return Math.abs(getRNG(absoluteRow + seed).nextInt() % nfolds);
    }

    // dress up the foldColumn (y[1]) as follows:
    //   1. For each testFold and each classLabel loop over the response column (y[0])
    //   2. If the classLabel is the current response and the testFold is the foldId
    //      for the current row and classLabel, then set the foldColumn to testFold
    //
    // How this balances labels per fold:
    //   Imagine that a KFold column was generated for each class. Observe that this
    //   makes the outer loop a way of selecting only the test rows from each fold
    //   (i.e., the holdout rows). Each fold is balanced sequentially in this way
    //   since y[1] is only updated if the current row happens to be a holdout row
    //   for the given classLabel.
    //
    //   Next observe that looping over each classLabel filters down each KFold
    //   so that it contains labels for just THAT class. This is how the balancing
    //   can be made so that it is independent of the chunk distribution and the
    //   per chunk class distribution.
    //
    // Downside is this performs nfolds*nClass passes over each Chunk. For
    // "reasonable" classification problems, this could be 100 passes per Chunk.
    @Override
    public void map(Chunk[] y) {
      long start = y[0].start();
      for (int testFold = 0; testFold < nfolds; ++testFold) {
        for (int classLabel = 0; classLabel < nClass; ++classLabel) {
          for (int row = 0; row < y[0]._len; ++row) {
            if (y[0].isNA(row)) {
              // missing response gets spread around
              if ((start + row) % nfolds == testFold)
                y[1].set(row, testFold);
            } else {
              if (y[0].at8(row) == (classes == null ? classLabel : classes[classLabel])) {
                if (testFold == getFoldId(start + row, seeds[classLabel]))
                  y[1].set(row, testFold);
              }
            }
          }
        }
      }
    }
  }.doAll(new Frame(y, y.makeZero()))._fr.vec(1);
}
use of water.fvec.Chunk in project h2o-3 by h2oai.
the class AstUnique method apply.
/**
 * Returns the unique values of a single-column frame as a new single-column frame.
 *
 * <p>For a categorical column the result is the sequence {@code 0 .. domain.length - 1}
 * carrying the same domain. For any other column the distinct values are collected with
 * {@code UniqTask} and written into a freshly allocated Vec of the same type.
 *
 * @param env  evaluation environment
 * @param stk  stack helper used to track the evaluated input frame
 * @param asts call arguments; {@code asts[1]} evaluates to the input frame
 * @return a one-column frame of unique values wrapped in a {@code ValFrame}
 * @throws IllegalArgumentException if the input frame does not have exactly one column
 */
@Override
public ValFrame apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
  Frame fr = stk.track(asts[1].exec(env)).getFrame();
  // Validate the shape before touching column 0, so a frame without columns gets the
  // intended error message instead of failing on the column access.
  if (fr.numCols() != 1)
    throw new IllegalArgumentException("Unique applies to a single column only.");
  Vec vec0 = fr.vec(0);
  Vec v;
  if (vec0.isCategorical()) {
    // every level index of the domain, with the domain attached
    v = Vec.makeSeq(0, (long) vec0.domain().length, true);
    v.setDomain(vec0.domain());
    DKV.put(v);
  } else {
    UniqTask t = new UniqTask().doAll(fr);
    int nUniq = t._uniq.size();
    final AstGroup.G[] uniq = t._uniq.keySet().toArray(new AstGroup.G[nUniq]);
    v = Vec.makeZero(nUniq, vec0.get_type());
    // Fill row i of the output with the first group value of the i-th unique key.
    new MRTask() {
      @Override
      public void map(Chunk c) {
        int start = (int) c.start();
        for (int i = 0; i < c._len; ++i)
          c.set(i, uniq[i + start]._gs[0]);
      }
    }.doAll(v);
  }
  return new ValFrame(new Frame(v));
}
Aggregations