
Example 46 with Chunk

use of water.fvec.Chunk in project h2o-2 by h2oai.

In the class GBM, the method buildNextKTrees.

// --------------------------------------------------------------------------
// Build the next k-trees, which is trying to correct the residual error from
// the prior trees.  From ESL2, page 387.  Step 2b ii, iii.
private DTree[] buildNextKTrees(Frame fr) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];
    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram[][][] hcs = new DHistogram[_nclass][1][_ncols];
    // Adjust nbins for the top-levels
    int adj_nbins = Math.max((1 << (10 - 0)), nbins);
    for (int k = 0; k < _nclass; k++) {
        // Initially setup as-if an empty-split had just happened
        if (_distribution == null || _distribution[k] != 0) {
            // DRF picks a random different set of columns for the 2nd tree.
            if (k == 1 && _nclass == 2)
                continue;
            ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
            // The "root" node
            new GBMUndecidedNode(ktrees[k], -1, DHistogram.initialHist(fr, _ncols, adj_nbins, hcs[k][0], min_rows, group_split, false));
        }
    }
    // Define a "working set" of leaf splits, from here to tree._len
    int[] leafs = new int[_nclass];
    // ----
    // ESL2, page 387.  Step 2b ii.
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    int depth = 0;
    for (; depth < max_depth; depth++) {
        if (!Job.isRunning(self()))
            return null;
        hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);
        // If we did not make any new splits, then the tree is split-to-death
        if (hcs == null)
            break;
    }
    // LeafNodes to hold predictions.
    for (int k = 0; k < _nclass; k++) {
        DTree tree = ktrees[k];
        if (tree == null)
            continue;
        int leaf = leafs[k] = tree.len();
        for (int nid = 0; nid < leaf; nid++) {
            if (tree.node(nid) instanceof DecidedNode) {
                DecidedNode dn = tree.decided(nid);
                for (int i = 0; i < dn._nids.length; i++) {
                    int cnid = dn._nids[i];
                    if (cnid == -1 ||                                // Bottomed out (predictors or responses known constant)
                        tree.node(cnid) instanceof UndecidedNode ||  // Or chopped off for depth
                        (tree.node(cnid) instanceof DecidedNode &&   // Or not possible to split
                         ((DecidedNode) tree.node(cnid))._split.col() == -1))
                        dn._nids[i] = new GBMLeafNode(tree, nid).nid();  // Mark a leaf here
                }
                // Handle the trivial non-splitting tree
                if (nid == 0 && dn._split.col() == -1)
                    new GBMLeafNode(tree, -1, 0);
            }
        }
    }
    // -- k-trees are done
    // ----
    // ESL2, page 387.  Step 2b iii.  Compute the gammas, and store them back
    // into the tree leaves.  Includes learn_rate.
    // For classification (bernoulli):
    //    gamma_i = sum res_i / sum p_i*(1 - p_i) where p_i = y_i - res_i
    // For classification (multinomial):
    //    gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))
    // For regression (gaussian):
    //    gamma_i = sum res_i / count(res_i)
    GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
    // K-1/K for multinomial
    double m1class = _nclass > 1 && family != Family.bernoulli ? (double) (_nclass - 1) / _nclass : 1.0;
    for (int k = 0; k < _nclass; k++) {
        final DTree tree = ktrees[k];
        if (tree == null)
            continue;
        for (int i = 0; i < tree._len - leafs[k]; i++) {
            double g = gp._gss[k][i] == 0          // Constant response?
                ? (gp._rss[k][i] == 0 ? 0 : 1000)  // Cap (exponential) learn, instead of dealing with Inf
                : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
            assert !Double.isNaN(g);
            ((LeafNode) tree.node(leafs[k] + i))._pred = g;
        }
    }
    // ----
    // ESL2, page 387.  Step 2b iv.  Cache the sum of all the trees, plus the
    // new tree, in the 'tree' columns.  Also, zap the NIDs for next pass.
    // Tree <== f(Tree)
    // Nids <== 0
    new MRTask2() {

        @Override
        public void map(Chunk[] chks) {
            // For all tree/klasses
            for (int k = 0; k < _nclass; k++) {
                final DTree tree = ktrees[k];
                if (tree == null)
                    continue;
                final Chunk nids = chk_nids(chks, k);
                final Chunk ct = chk_tree(chks, k);
                for (int row = 0; row < nids._len; row++) {
                    int nid = (int) nids.at80(row);
                    if (nid < 0)
                        continue;
                    // Prediction stored in Leaf is cut to float to be deterministic in reconstructing
                    // <tree_klazz> fields from tree prediction
                    ct.set0(row, (float) (ct.at0(row) + (float) ((LeafNode) tree.node(nid))._pred));
                    nids.set0(row, 0);
                }
            }
        }
    }.doAll(fr);
    // Collect leaves stats
    for (int i = 0; i < ktrees.length; i++) if (ktrees[i] != null)
        ktrees[i].leaves = ktrees[i].len() - leafs[i];
    return ktrees;
}
Also used : UndecidedNode(hex.gbm.DTree.UndecidedNode) DecidedNode(hex.gbm.DTree.DecidedNode) Chunk(water.fvec.Chunk) LeafNode(hex.gbm.DTree.LeafNode)
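
The anonymous MRTask2 at the end of this method is the basic h2o-2 Chunk read/modify/write idiom: read a row with at0, write it back with set0, and let doAll distribute map over every chunk of the frame. A minimal sketch of just that idiom, assuming the same MRTask2/Chunk API shown above (the class name, the constant, and column index 0 are placeholders, not part of the GBM code):

import water.MRTask2;
import water.fvec.Chunk;

// Sketch: add a constant to every value of column 0, chunk by chunk.
class AddConstant extends MRTask2<AddConstant> {
    final double _delta;
    AddConstant(double delta) { _delta = delta; }

    @Override
    public void map(Chunk[] chks) {
        Chunk c = chks[0];                      // first column of the mapped frame
        for (int row = 0; row < c._len; row++)  // rows local to this chunk
            c.set0(row, c.at0(row) + _delta);   // read with at0, write with set0
    }
}
// Usage (hypothetical frame fr):  new AddConstant(1.0).doAll(fr);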

Example 47 with Chunk

use of water.fvec.Chunk in project h2o-2 by h2oai.

In the class GBM, the method initWorkFrame.

@Override
protected void initWorkFrame(GBMModel initialModel, Frame fr) {
    // Tag out rows missing the response column
    new ExcludeNAResponse().doAll(fr);
    // Initial value is mean(y)
    final double mean = (float) fr.vec(initialModel.responseName()).mean();
    // Initialize working response based on given loss function
    if (_nclass == 1) {
        /* regression */
        // Regression initially predicts the response mean
        initialModel.initialPrediction = mean;
        new MRTask2() {

            @Override
            public void map(Chunk[] chks) {
                // there is only one tree for regression
                Chunk tr = chk_tree(chks, 0);
                for (int i = 0; i < tr._len; i++) tr.set0(i, mean);
            }
        }.doAll(fr);
    } else if (family == Family.bernoulli) {
        // Initial value is log( mean(y)/(1-mean(y)) )
        final float init = (float) Math.log(mean / (1.0f - mean));
        initialModel.initialPrediction = init;
        new MRTask2() {

            @Override
            public void map(Chunk[] chks) {
                // only the tree for y = 0 is used
                Chunk tr = chk_tree(chks, 0);
                for (int i = 0; i < tr._len; i++) tr.set0(i, init);
            }
        }.doAll(fr);
    } else {
    /* multinomial */
    /* Preserve 0s in working columns */
    }
    // Update tree fields based on checkpoint
    if (checkpoint != null) {
        Timer t = new Timer();
        new ResidualsCollector(_ncols, _nclass, initialModel.treeKeys).doAll(fr);
        Log.info(logTag(), "Reconstructing tree residuals stats from checkpointed model took " + t);
    }
}
Also used : Chunk(water.fvec.Chunk)
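
For the bernoulli branch the initial prediction is the log-odds of the response mean: init = log(mean / (1 - mean)), the constant score whose sigmoid recovers the mean. A standalone arithmetic check of that relationship (plain Java, no H2O API; the mean value 0.25 is only illustrative):

// Sketch: the bernoulli initial prediction is the log-odds of the response mean,
// and applying the sigmoid to it recovers the mean.
public class LogOddsInit {
    public static void main(String[] args) {
        double mean = 0.25;                                  // illustrative response mean
        float init = (float) Math.log(mean / (1.0 - mean));  // same formula as above, ~ -1.0986
        double back = 1.0 / (1.0 + Math.exp(-init));         // sigmoid(init) ~ 0.25
        System.out.println("init=" + init + "  sigmoid(init)=" + back);
    }
}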

Example 48 with Chunk

use of water.fvec.Chunk in project h2o-2 by h2oai.

In the class ResidualsCollector, the method map.

@Override
public void map(Chunk[] chks) {
    double[] data = new double[_ncols];
    float[] preds = new float[_nclass + 1];
    int ntrees = _trees.length;
    Chunk cys = chk_resp(chks);
    for (int tidx = 0; tidx < ntrees; tidx++) {
        // tree
        for (int row = 0; row < cys._len; row++) {
            // Make a prediction
            for (int i = 0; i < _ncols; i++) data[i] = chks[i].at0(row);
            Arrays.fill(preds, 0);
            score0(data, preds, _trees[tidx]);
            // regression shortcut
            if (_nclass == 1)
                preds[1] = preds[0];
            // Write tree predictions
            for (int c = 0; c < _nclass; c++) {
                // over all class
                if (preds[1 + c] != 0) {
                    Chunk ctree = chk_tree(chks, c);
                    ctree.set0(row, (float) (ctree.at0(row) + preds[1 + c]));
                }
            }
        }
    }
}
Also used : Chunk(water.fvec.Chunk)
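
The inner loop relies on a common idiom: copy one row of the predictor columns out of the chunks into a dense double[] before handing it to score0. A minimal sketch of just that step, assuming the h2o-2 at0 accessor used above (the class name and the ncols parameter are stand-ins for the real _ncols field):

import water.fvec.Chunk;

// Sketch: gather one row of the predictor columns into a dense array for scoring.
final class RowExtract {
    static double[] extractRow(Chunk[] chks, int row, int ncols) {
        double[] data = new double[ncols];
        for (int i = 0; i < ncols; i++)
            data[i] = chks[i].at0(row);  // h2o-2 accessor; h2o-3 uses atd(row)
        return data;
    }
}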

Example 49 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

In the class DeepLearningModel, the method scoreDeepFeatures.

public Frame scoreDeepFeatures(Frame frame, final int layer, final Job job) {
    if (layer < 0 || layer >= model_info().get_params()._hidden.length)
        throw new H2OIllegalArgumentException("hidden layer (index) to extract must be between " + 0 + " and " + (model_info().get_params()._hidden.length - 1), "");
    final int len = _output.nfeatures();
    if (isSupervised()) {
        int ridx = frame.find(_output.responseName());
        if (ridx != -1) {
            // drop the response for scoring!
            frame = new Frame(frame);
            frame.remove(ridx);
        }
    }
    Frame adaptFrm = new Frame(frame);
    //create new features, will be dense
    final int features = model_info().get_params()._hidden[layer];
    Vec v = adaptFrm.anyVec();
    Vec[] vecs = v != null ? v.makeZeros(features) : null;
    if (vecs == null)
        throw new IllegalArgumentException("Cannot create deep features from a frame with no columns.");
    Scope.enter();
    adaptTestForTrain(adaptFrm, true, false);
    for (int j = 0; j < features; ++j) {
        adaptFrm.add("DF.L" + (layer + 1) + ".C" + (j + 1), vecs[j]);
    }
    final int mb = 0;
    final int n = 1;
    new MRTask() {

        @Override
        public void map(Chunk[] chks) {
            if (isCancelled() || job != null && job.stop_requested())
                return;
            double[] tmp = new double[len];
            final Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(model_info);
            for (int row = 0; row < chks[0]._len; row++) {
                for (int i = 0; i < len; i++) tmp[i] = chks[i].atd(row);
                //FIXME: No weights yet
                ((Neurons.Input) neurons[0]).setInput(-1, tmp, mb);
                DeepLearningTask.fpropMiniBatch(-1, neurons, model_info, null, false, null, null /*no offset*/, n);
                //extract the layer-th hidden feature
                double[] out = neurons[layer + 1]._a[mb].raw();
                for (int c = 0; c < features; c++) chks[_output._names.length + c].set(row, out[c]);
            }
            if (job != null)
                job.update(1);
        }
    }.doAll(adaptFrm);
    // Return just the output columns
    int x = _output._names.length, y = adaptFrm.numCols();
    Frame ret = adaptFrm.extractFrame(x, y);
    Scope.exit();
    return ret;
}
Also used : Frame(water.fvec.Frame) Vec(water.fvec.Vec) H2OIllegalArgumentException(water.exceptions.H2OIllegalArgumentException) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk)
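
The surrounding method follows an append/fill/extract pattern: make zero-filled Vecs with the same layout as the input, add them to a copy of the frame, fill them row by row inside map, then slice them back out with extractFrame. A minimal sketch of that pattern with a single output column, assuming the h2o-3 calls used above (the class name, the column label "DF.out", and the copy-column-0 "feature" are placeholders; Scope/cleanup handling is omitted):

import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;

// Sketch: append one zero column, fill it per row, return just that column.
final class OneOutputColumn {
    static Frame compute(Frame frame) {
        Frame work = new Frame(frame);            // don't mutate the caller's frame
        Vec out = work.anyVec().makeZeros(1)[0];  // new column with the same row layout
        work.add("DF.out", out);
        new MRTask() {
            @Override
            public void map(Chunk[] chks) {
                int outIdx = chks.length - 1;     // the column appended above
                for (int row = 0; row < chks[0]._len; row++)
                    chks[outIdx].set(row, chks[0].atd(row));  // placeholder computation
            }
        }.doAll(work);
        return work.extractFrame(work.numCols() - 1, work.numCols());  // just the new column
    }
}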

Example 50 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

In the class Score, the method map.

@Override
public void map(Chunk[] chks) {
    // Response
    Chunk ys = _bldr.chk_resp(chks);
    Model m = _bldr._model;
    Chunk weightsChunk = m._output.hasWeights() ? chks[m._output.weightsIdx()] : null;
    Chunk offsetChunk = m._output.hasOffset() ? chks[m._output.offsetIdx()] : null;
    final int nclass = _bldr.nclasses();
    // Because of adaption - the validation training set has at least as many
    // classes as the training set (it may have more).  The Confusion Matrix
    // needs to be at least as big as the training set domain.
    String[] domain = _kresp.get().domain();
    // If this is a score-on-train AND DRF, then oobColIdx makes sense,
    // otherwise this field is unused.
    final int oobColIdx = _bldr.idx_oobt();
    _mb = m.makeMetricBuilder(domain);
    //    _gainsLiftBuilder = _bldr._model._output.nclasses()==2 ? new GainsLift.GainsLiftBuilder(_fr.vec(_bldr.idx_tree(0)).pctiles()) : null;
    // Temp working array for class distributions
    final double[] cdists = _mb._work;
    // If working a validation set, need to push thru official model scoring
    // logic which requires a temp array to hold the features.
    final double[] tmp = _is_train && _bldr._ntrees > 0 ? null : new double[_bldr._ncols];
    //    final double[] tmp = new double[_bldr._ncols];
    // Score all Rows
    float[] val = new float[1];
    for (int row = 0; row < ys._len; row++) {
        // Ignore missing response vars only if it was actual NA
        if (ys.isNA(row))
            continue;
        // Ignore out-of-bag rows
        if (_oob && chks[oobColIdx].atd(row) == 0)
            continue;
        double weight = weightsChunk != null ? weightsChunk.atd(row) : 1;
        //ignore holdout rows
        if (weight == 0)
            continue;
        double offset = offsetChunk != null ? offsetChunk.atd(row) : 0;
        if (_is_train)  // Passed in the model-specific columns
            // Use the training data directly (per-row predictions already made)
            _bldr.score2(chks, weight, offset, cdists, row);
        else
            // Must score "the hard way"
            m.score0(chks, weight, offset, row, tmp, cdists);
        // fill tmp with training data for null model - to have proper tie breaking
        if (_is_train && _bldr._ntrees == 0)
            for (int i = 0; i < tmp.length; i++) tmp[i] = chks[i].atd(row);
        // Fill in prediction
        if (nclass > 1)
            cdists[0] = GenModel.getPrediction(cdists, m._output._priorClassDist, tmp, m.defaultThreshold());
        val[0] = (float) ys.atd(row);
        _mb.perRow(cdists, val, weight, offset, m);
    }
}
Also used : GenModel(hex.genmodel.GenModel) Chunk(water.fvec.Chunk)
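
The row filters at the top of the loop (skip NA responses, skip zero-weight holdout rows, default a missing weights column to weight 1) are a reusable pattern on their own. A minimal sketch of that filtering inside an h2o-3 MRTask, assuming the same Chunk accessors used above (the class name and the column layout, response in column 0 and optional weights in column 1, are placeholders for the builder's real indices):

import water.MRTask;
import water.fvec.Chunk;

// Sketch: weighted sum of the response, skipping NA and zero-weight rows.
class WeightedSum extends MRTask<WeightedSum> {
    double _sum;                                      // combined across chunks in reduce

    @Override
    public void map(Chunk[] chks) {
        Chunk ys = chks[0];                           // response (placeholder index)
        Chunk ws = chks.length > 1 ? chks[1] : null;  // optional weights column
        for (int row = 0; row < ys._len; row++) {
            if (ys.isNA(row)) continue;               // ignore missing responses
            double w = ws != null ? ws.atd(row) : 1;
            if (w == 0) continue;                     // ignore holdout rows
            _sum += w * ys.atd(row);
        }
    }

    @Override
    public void reduce(WeightedSum other) { _sum += other._sum; }
}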

Aggregations

Chunk (water.fvec.Chunk) 74
Frame (water.fvec.Frame) 50
NewChunk (water.fvec.NewChunk) 36
MRTask (water.MRTask) 33
Vec (water.fvec.Vec) 30
ValFrame (water.rapids.vals.ValFrame) 26
C0DChunk (water.fvec.C0DChunk) 7
BufferedString (water.parser.BufferedString) 7
Random (java.util.Random) 6
Test (org.junit.Test) 5
MRTask2 (water.MRTask2) 4
Val (water.rapids.Val) 4
Key (water.Key) 3
H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException) 3
AstRoot (water.rapids.ast.AstRoot) 3
AstNumList (water.rapids.ast.params.AstNumList) 3
File (java.io.File) 2
IOException (java.io.IOException) 2
ValNum (water.rapids.vals.ValNum) 2
PrettyPrint (water.util.PrettyPrint) 2