Use of water.fvec.Chunk in project h2o-2 by h2oai.
The class GBM, method buildNextKTrees.
// --------------------------------------------------------------------------
// Build the next k-trees, which try to correct the residual error from
// the prior trees. From ESL2, page 387. Step 2b ii, iii.
private DTree[] buildNextKTrees(Frame fr) {
  // We're going to build K (nclass) trees - each focused on correcting
  // errors for a single class.
  final DTree[] ktrees = new DTree[_nclass];
  // Initial set of histograms. All trees; one leaf per tree (the root
  // leaf); all columns.
  DHistogram[][][] hcs = new DHistogram[_nclass][1][_ncols];
  // Adjust nbins for the top levels: 1 << (10 - 0) == 1024, so the root
  // level gets at least 1024 histogram bins.
  int adj_nbins = Math.max((1 << (10 - 0)), nbins);
  for (int k = 0; k < _nclass; k++) {
    // Initially set up as-if an empty split had just happened
    if (_distribution == null || _distribution[k] != 0) {
      // For 2-class problems only one tree is needed per round: the class-1
      // probability is the complement of class 0's, so skip the 2nd tree.
      if (k == 1 && _nclass == 2) continue;
      ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
      // The "root" node
      new GBMUndecidedNode(ktrees[k], -1, DHistogram.initialHist(fr, _ncols, adj_nbins, hcs[k][0], min_rows, group_split, false));
    }
  }
  // Define a "working set" of leaf splits, from here to tree._len
  int[] leafs = new int[_nclass];

  // ----
  // ESL2, page 387. Step 2b ii.
  // One Big Loop till the ktrees are of proper depth.
  // Adds a layer to the trees each pass.
  int depth = 0;
  for (; depth < max_depth; depth++) {
    if (!Job.isRunning(self())) return null;
    hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);
    // If we did not make any new splits, then the tree is split-to-death
    if (hcs == null) break;
  }

  // Each tree is now grown; add one more level of LeafNodes to hold the
  // predictions.
  for (int k = 0; k < _nclass; k++) {
    DTree tree = ktrees[k];
    if (tree == null) continue;
    int leaf = leafs[k] = tree.len();
    for (int nid = 0; nid < leaf; nid++) {
      if (tree.node(nid) instanceof DecidedNode) {
        DecidedNode dn = tree.decided(nid);
        for (int i = 0; i < dn._nids.length; i++) {
          int cnid = dn._nids[i];
          if (cnid == -1 ||                                // Bottomed out (predictors or responses known constant)
              tree.node(cnid) instanceof UndecidedNode ||  // Or chopped off for depth
              (tree.node(cnid) instanceof DecidedNode &&   // Or not possible to split
               ((DecidedNode) tree.node(cnid))._split.col() == -1))
            dn._nids[i] = new GBMLeafNode(tree, nid).nid(); // Mark a leaf here
        }
        // Handle the trivial non-splitting tree
        if (nid == 0 && dn._split.col() == -1)
          new GBMLeafNode(tree, -1, 0);
      }
    }
  } // -- k-trees are done

  // ----
  // ESL2, page 387. Step 2b iii. Compute the gammas, and store them back
  // into the tree leaves. Includes learn_rate.
  // For classification (bernoulli):
  //   gamma_i = sum res_i / sum p_i*(1 - p_i)  where p_i = y_i - res_i
  // For classification (multinomial):
  //   gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))
  // For regression (gaussian):
  //   gamma_i = sum res_i / count(res_i)
  GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
  // (K-1)/K for multinomial
  double m1class = _nclass > 1 && family != Family.bernoulli ? (double) (_nclass - 1) / _nclass : 1.0;
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k];
    if (tree == null) continue;
    for (int i = 0; i < tree._len - leafs[k]; i++) {
      double g = gp._gss[k][i] == 0          // Constant response?
        ? (gp._rss[k][i] == 0 ? 0 : 1000)    // Cap (exponential) learn, instead of dealing with Inf
        : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
      assert !Double.isNaN(g);
      ((LeafNode) tree.node(leafs[k] + i))._pred = g;
    }
  }

  // ----
  // ESL2, page 387. Step 2b iv. Cache the sum of all the trees, plus the
  // new tree, in the 'tree' columns. Also, zap the NIDs for the next pass.
  // Tree <== f(Tree)
  // Nids <== 0
  new MRTask2() {
    @Override public void map(Chunk[] chks) {
      // For all trees/klasses
      for (int k = 0; k < _nclass; k++) {
        final DTree tree = ktrees[k];
        if (tree == null) continue;
        final Chunk nids = chk_nids(chks, k);
        final Chunk ct = chk_tree(chks, k);
        for (int row = 0; row < nids._len; row++) {
          int nid = (int) nids.at80(row);
          if (nid < 0) continue;
          // Prediction stored in Leaf is cut to float to be deterministic in
          // reconstructing <tree_klazz> fields from tree prediction
          ct.set0(row, (float) (ct.at0(row) + (float) ((LeafNode) tree.node(nid))._pred));
          nids.set0(row, 0);
        }
      }
    }
  }.doAll(fr);

  // Collect leaf stats
  for (int i = 0; i < ktrees.length; i++)
    if (ktrees[i] != null)
      ktrees[i].leaves = ktrees[i].len() - leafs[i];
  return ktrees;
}
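The gamma update in Step 2b iii is easier to see outside the MRTask plumbing. Below is a minimal sketch that restates the same arithmetic as a free-standing method, assuming the per-leaf sums that GammaPass produces (_rss = sum of residuals, _gss = sum of the denominator terms); the names leafGamma, residualSum, and denomSum are illustrative, not part of the h2o-2 API.

  // Sketch of the leaf-gamma computation above. Hypothetical names.
  static double leafGamma(double residualSum, double denomSum,
                          int nclass, boolean bernoulli, double learnRate) {
    if (denomSum == 0)                      // constant response in this leaf
      return residualSum == 0 ? 0 : 1000;   // cap instead of producing Inf
    double m1class = (nclass > 1 && !bernoulli)
        ? (double) (nclass - 1) / nclass    // (K-1)/K multinomial correction
        : 1.0;
    return learnRate * m1class * residualSum / denomSum;
  }

For example, with K = 3, residualSum = 1.2, denomSum = 0.8, and learn_rate = 0.1, this yields 0.1 * (2/3) * 1.5 = 0.1 as the leaf prediction.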
Use of water.fvec.Chunk in project h2o-2 by h2oai.
The class GBM, method initWorkFrame.
@Override
protected void initWorkFrame(GBMModel initialModel, Frame fr) {
  // Tag out rows missing the response column
  new ExcludeNAResponse().doAll(fr);
  // Initial value is mean(y)
  final double mean = (float) fr.vec(initialModel.responseName()).mean();
  // Initialize working response based on the given loss function
  if (_nclass == 1) { /* regression */
    // Regression initially predicts the response mean
    initialModel.initialPrediction = mean;
    new MRTask2() {
      @Override public void map(Chunk[] chks) {
        Chunk tr = chk_tree(chks, 0); // there is only one tree for regression
        for (int i = 0; i < tr._len; i++) tr.set0(i, mean);
      }
    }.doAll(fr);
  } else if (family == Family.bernoulli) {
    // Initial value is log( mean(y)/(1-mean(y)) )
    final float init = (float) Math.log(mean / (1.0f - mean));
    initialModel.initialPrediction = init;
    new MRTask2() {
      @Override public void map(Chunk[] chks) {
        Chunk tr = chk_tree(chks, 0); // only the tree for y = 0 is used
        for (int i = 0; i < tr._len; i++) tr.set0(i, init);
      }
    }.doAll(fr);
  } else { /* multinomial */
    /* Preserve 0s in working columns */
  }
  // Update tree fields based on checkpoint
  if (checkpoint != null) {
    Timer t = new Timer();
    new ResidualsCollector(_ncols, _nclass, initialModel.treeKeys).doAll(fr);
    Log.info(logTag(), "Reconstructing tree residuals stats from checkpointed model took " + t);
  }
}
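The three initial-prediction rules are easiest to check with concrete numbers. This is a toy illustration of the formulas above; the values are made up, not from any h2o run.

  // Toy illustration of the initial predictions set by initWorkFrame.
  double mean = 0.25;                                    // mean(y) of a 0/1 response
  double gaussianInit  = mean;                           // regression: predict mean(y)
  double bernoulliInit = Math.log(mean / (1.0 - mean));  // log-odds: log(0.25/0.75) ~= -1.0986
  // multinomial: working columns stay 0, so no initial constant is written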
Use of water.fvec.Chunk in project h2o-2 by h2oai.
The class ResidualsCollector, method map.
@Override
public void map(Chunk[] chks) {
  double[] data = new double[_ncols];
  float[] preds = new float[_nclass + 1];
  int ntrees = _trees.length;
  Chunk cys = chk_resp(chks);
  for (int tidx = 0; tidx < ntrees; tidx++) { // per tree
    for (int row = 0; row < cys._len; row++) {
      // Make a prediction
      for (int i = 0; i < _ncols; i++) data[i] = chks[i].at0(row);
      Arrays.fill(preds, 0);
      score0(data, preds, _trees[tidx]);
      // regression shortcut
      if (_nclass == 1) preds[1] = preds[0];
      // Write tree predictions
      for (int c = 0; c < _nclass; c++) { // over all classes
        if (preds[1 + c] != 0) {
          Chunk ctree = chk_tree(chks, c);
          ctree.set0(row, (float) (ctree.at0(row) + preds[1 + c]));
        }
      }
    }
  }
}
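Stripped of the Chunk plumbing, the loop above is just a running per-row sum of every checkpointed tree's prediction. A minimal sketch, with predictions[tree][row] standing in for the score0() call (hypothetical names):

  // Accumulate each tree's per-row prediction into a working sum, mirroring
  // the ctree.set0(row, ctree.at0(row) + preds[1 + c]) writes above.
  static double[] sumTreePredictions(double[][] predictions, int nrows) {
    double[] treeSum = new double[nrows];
    for (double[] treePreds : predictions)     // one pass per checkpointed tree
      for (int row = 0; row < nrows; row++)
        treeSum[row] += treePreds[row];
    return treeSum;
  }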
Use of water.fvec.Chunk in project h2o-3 by h2oai.
The class DeepLearningModel, method scoreDeepFeatures.
public Frame scoreDeepFeatures(Frame frame, final int layer, final Job job) {
  if (layer < 0 || layer >= model_info().get_params()._hidden.length)
    throw new H2OIllegalArgumentException("hidden layer (index) to extract must be between " + 0 + " and " + (model_info().get_params()._hidden.length - 1), "");
  final int len = _output.nfeatures();
  if (isSupervised()) {
    int ridx = frame.find(_output.responseName());
    if (ridx != -1) { // drop the response for scoring!
      frame = new Frame(frame);
      frame.remove(ridx);
    }
  }
  Frame adaptFrm = new Frame(frame);
  // create new features, will be dense
  final int features = model_info().get_params()._hidden[layer];
  Vec v = adaptFrm.anyVec();
  Vec[] vecs = v != null ? v.makeZeros(features) : null;
  if (vecs == null)
    throw new IllegalArgumentException("Cannot create deep features from a frame with no columns.");
  Scope.enter();
  adaptTestForTrain(adaptFrm, true, false);
  for (int j = 0; j < features; ++j)
    adaptFrm.add("DF.L" + (layer + 1) + ".C" + (j + 1), vecs[j]);
  final int mb = 0;
  final int n = 1;
  new MRTask() {
    @Override public void map(Chunk[] chks) {
      if (isCancelled() || job != null && job.stop_requested()) return;
      double[] tmp = new double[len];
      final Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(model_info);
      for (int row = 0; row < chks[0]._len; row++) {
        for (int i = 0; i < len; i++) tmp[i] = chks[i].atd(row);
        // FIXME: No weights yet
        ((Neurons.Input) neurons[0]).setInput(-1, tmp, mb);
        DeepLearningTask.fpropMiniBatch(-1, neurons, model_info, null, false, null, null /*no offset*/, n);
        // extract the layer-th hidden feature
        double[] out = neurons[layer + 1]._a[mb].raw();
        for (int c = 0; c < features; c++)
          chks[_output._names.length + c].set(row, out[c]);
      }
      if (job != null) job.update(1);
    }
  }.doAll(adaptFrm);
  // Return just the output columns
  int x = _output._names.length, y = adaptFrm.numCols();
  Frame ret = adaptFrm.extractFrame(x, y);
  Scope.exit();
  return ret;
}
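A hypothetical call site, assuming a trained DeepLearningModel dlModel, a test Frame test, and a Job handle job (none of which appear in this snippet). Extracting hidden layer index 1 yields columns named DF.L2.C1, DF.L2.C2, ... per the naming scheme in the method above.

  // Hypothetical usage: extract the activations of the 2nd hidden layer.
  Frame deepFeatures = dlModel.scoreDeepFeatures(test, 1, job);
  System.out.println(deepFeatures.numCols() + " deep-feature columns extracted");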
Use of water.fvec.Chunk in project h2o-3 by h2oai.
The class Score, method map.
@Override
public void map(Chunk[] chks) {
  Chunk ys = _bldr.chk_resp(chks); // Response
  Model m = _bldr._model;
  Chunk weightsChunk = m._output.hasWeights() ? chks[m._output.weightsIdx()] : null;
  Chunk offsetChunk = m._output.hasOffset() ? chks[m._output.offsetIdx()] : null;
  final int nclass = _bldr.nclasses();
  // Because of adaption, the validation set has at least as many classes as
  // the training set (it may have more). The Confusion Matrix needs to be
  // at least as big as the training set domain.
  String[] domain = _kresp.get().domain();
  // If this is a score-on-train AND DRF, then oobColIdx makes sense,
  // otherwise this field is unused.
  final int oobColIdx = _bldr.idx_oobt();
  _mb = m.makeMetricBuilder(domain);
  // _gainsLiftBuilder = _bldr._model._output.nclasses()==2 ? new GainsLift.GainsLiftBuilder(_fr.vec(_bldr.idx_tree(0)).pctiles()) : null;
  // Temp working array for class distributions
  final double[] cdists = _mb._work;
  // If working a validation set, need to push thru official model scoring
  // logic which requires a temp array to hold the features.
  final double[] tmp = _is_train && _bldr._ntrees > 0 ? null : new double[_bldr._ncols];
  // final double[] tmp = new double[_bldr._ncols];
  // Score all rows
  float[] val = new float[1];
  for (int row = 0; row < ys._len; row++) {
    // Ignore rows with a missing (actual NA) response
    if (ys.isNA(row)) continue;
    // Ignore out-of-bag rows
    if (_oob && chks[oobColIdx].atd(row) == 0) continue;
    double weight = weightsChunk != null ? weightsChunk.atd(row) : 1;
    // Ignore holdout rows (zero weight)
    if (weight == 0) continue;
    double offset = offsetChunk != null ? offsetChunk.atd(row) : 0;
    if (_is_train) // Passed in the model-specific columns
      _bldr.score2(chks, weight, offset, cdists, row); // Use the training data directly (per-row predictions already made)
    else
      m.score0(chks, weight, offset, row, tmp, cdists); // Must score "the hard way"
    // Fill tmp with training data for the null model - to get proper tie breaking
    if (_is_train && _bldr._ntrees == 0)
      for (int i = 0; i < tmp.length; i++) tmp[i] = chks[i].atd(row);
    // Fill in prediction
    if (nclass > 1)
      cdists[0] = GenModel.getPrediction(cdists, m._output._priorClassDist, tmp, m.defaultThreshold());
    val[0] = (float) ys.atd(row);
    _mb.perRow(cdists, val, weight, offset, m);
  }
}
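The per-row skip logic above reduces to three checks; a row contributes to the metrics only if all of them pass. A minimal sketch with illustrative names:

  // Returns true if the row should be scored, mirroring the three
  // 'continue' guards in map() above. Parameter names are hypothetical.
  static boolean scoreThisRow(boolean responseIsNA, boolean oob,
                              double oobColValue, double weight) {
    if (responseIsNA) return false;             // missing response
    if (oob && oobColValue == 0) return false;  // in-bag row during OOB scoring
    return weight != 0;                         // zero weight marks a holdout row
  }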