Search in sources :

Example 1 with UndecidedNode

use of hex.gbm.DTree.UndecidedNode in project h2o-2 by h2oai.

The method buildNextKTrees of the class DRF.

// --------------------------------------------------------------------------
/**
 * Builds the next K random trees (one per class) that together represent one
 * tree of the forest.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>create a tree and its root undecided node for every class that is
 *       present in {@code _distribution} (or for every class when
 *       {@code _distribution} is null);</li>
 *   <li>sample rows per tree and wait for all sampling tasks to finish;</li>
 *   <li>grow all trees one layer per pass, up to {@code max_depth} passes or
 *       until {@code buildLayer} returns null;</li>
 *   <li>attach leaf nodes carrying predictions below the last decided nodes;</li>
 *   <li>run {@code CollectPreds} over the frame and, when {@code importance}
 *       is set, append its per-tree measures to {@code _treeMeasuresOnOOB};</li>
 *   <li>record the number of leaves on each tree.</li>
 * </ol>
 *
 * @param fr          frame the trees are built on
 * @param mtrys       forwarded to every {@code DRFTree} constructor
 * @param sample_rate forwarded to every {@code Sample} task
 * @param rand        supplies the single seed shared by all K trees
 * @param tid         index of the tree being built; not read by this method body
 * @return the K trees (entries for skipped classes stay null), or null when
 *         the job is no longer running
 */
private DTree[] buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random rand, int tid) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];
    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram[][][] hcs = new DHistogram[_nclass][1][_ncols];
    // Adjust nbins for the top-levels: use at least 1024 bins for the root.
    int adj_nbins = Math.max((1 << (10 - 0)), nbins);
    // Use for all k-trees the same seed. NOTE: this is only to make a fair
    // view for all k-trees
    long rseed = rand.nextLong();
    // Initially setup as-if an empty-split had just happened
    for (int k = 0; k < _nclass; k++) {
        assert (_distribution != null && classification) || (_distribution == null && !classification);
        if (_distribution == null || _distribution[k] != 0) {
            // Classes with a zero count in the distribution are ignored (tree stays null).
            // The Boolean Optimization cannot be applied here for RF !
            // This optimization assumes the 2nd tree of a 2-class system is the
            // inverse of the first.  This is false for DRF (and true for GBM) -
            // DRF picks a random different set of columns for the 2nd tree.
            ktrees[k] = new DRFTree(fr, _ncols, (char) nbins, (char) _nclass, min_rows, mtrys, rseed);
            boolean isBinom = classification;
            // The "root" node; the constructor is invoked for its effect on ktrees[k].
            new DRFUndecidedNode(ktrees[k], -1, DHistogram.initialHist(fr, _ncols, adj_nbins, hcs[k][0], min_rows, do_grpsplit, isBinom));
        }
    }
    // Sample - mark the lines by putting 'OUT_OF_BAG' into nid(<klass>) vector.
    // All sampling tasks are forked first and only then joined.
    Timer t_1 = new Timer();
    Sample[] ss = new Sample[_nclass];
    for (int k = 0; k < _nclass; k++) {
        if (ktrees[k] != null) {
            ss[k] = new Sample((DRFTree) ktrees[k], sample_rate).dfork(0, new Frame(vec_nids(fr, k), vec_resp(fr, k)), build_tree_one_node);
        }
    }
    for (int k = 0; k < _nclass; k++) {
        if (ss[k] != null) {
            ss[k].getResult();
        }
    }
    // Fixed: the message used to contain a stray "+ " inside the string literal.
    Log.debug(Sys.DRF__, "Sampling took: " + t_1);
    // Define a "working set" of leaf splits, from leafs[i] to tree._len for each tree i
    int[] leafs = new int[_nclass];
    // ----
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    Timer t_2 = new Timer();
    for (int depth = 0; depth < max_depth; depth++) {
        if (!Job.isRunning(self())) {
            return null;
        }
        hcs = buildLayer(fr, ktrees, leafs, hcs, true, build_tree_one_node);
        // If we did not make any new splits, then the tree is split-to-death
        if (hcs == null) {
            break;
        }
    }
    Log.debug(Sys.DRF__, "Tree build took: " + t_2);
    // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
    // LeafNodes to hold predictions.
    Timer t_3 = new Timer();
    for (int k = 0; k < _nclass; k++) {
        DTree tree = ktrees[k];
        if (tree == null) {
            continue;
        }
        // Remember where the leaves start: every node created below gets an
        // index >= leafs[k].
        int leaf = leafs[k] = tree.len();
        for (int nid = 0; nid < leaf; nid++) {
            if (tree.node(nid) instanceof DecidedNode) {
                DecidedNode dn = tree.decided(nid);
                for (int i = 0; i < dn._nids.length; i++) {
                    int cnid = dn._nids[i];
                    // A child slot is turned into a leaf when it
                    //  - bottomed out (predictors or responses known constant), or
                    //  - was chopped off for depth (still undecided), or
                    //  - is a decided node that was not possible to split.
                    if (cnid == -1
                            || tree.node(cnid) instanceof UndecidedNode
                            || (tree.node(cnid) instanceof DecidedNode && ((DecidedNode) tree.node(cnid))._split.col() == -1)) {
                        LeafNode ln = new DRFLeafNode(tree, nid);
                        // Set prediction into the leaf
                        ln._pred = dn.pred(i);
                        // Mark a leaf here
                        dn._nids[i] = ln.nid();
                    }
                }
                // Handle the trivial non-splitting tree
                if (nid == 0 && dn._split.col() == -1) {
                    new DRFLeafNode(tree, -1, 0);
                }
            }
        }
    }
    // -- k-trees are done
    Log.debug(Sys.DRF__, "Nodes propagation: " + t_3);
    // ----
    // Move rows into the final leaf rows
    Timer t_4 = new Timer();
    CollectPreds cp = new CollectPreds(ktrees, leafs).doAll(fr, build_tree_one_node);
    if (importance) {
        if (classification) {
            // Track right votes over OOB rows for this tree
            asVotes(_treeMeasuresOnOOB).append(cp.rightVotes, cp.allRows);
        } else {
            // Regression: track the squared error instead of votes
            asSSE(_treeMeasuresOnOOB).append(cp.sse, cp.allRows);
        }
    }
    Log.debug(Sys.DRF__, "CollectPreds done: " + t_4);
    // Collect leaves stats: everything created at or after leafs[i] is a leaf
    for (int i = 0; i < ktrees.length; i++) {
        if (ktrees[i] != null) {
            ktrees[i].leaves = ktrees[i].len() - leafs[i];
        }
    }
    return ktrees;
}
Also used : Frame(water.fvec.Frame) UndecidedNode(hex.gbm.DTree.UndecidedNode) DTree(hex.gbm.DTree) DecidedNode(hex.gbm.DTree.DecidedNode) DHistogram(hex.gbm.DHistogram) LeafNode(hex.gbm.DTree.LeafNode)

Example 2 with UndecidedNode

use of hex.gbm.DTree.UndecidedNode in project h2o-2 by h2oai.

The method buildNextKTrees of the class GBM.

// --------------------------------------------------------------------------
/**
 * Build the next k-trees, which is trying to correct the residual error from
 * the prior trees.  From ESL2, page 387.  Step 2b ii, iii.
 *
 * <p>Steps, in order: create a tree and root node per class, grow all trees
 * one layer per pass (at most {@code max_depth} passes), append leaf nodes,
 * compute the leaf predictions (gammas) with a {@code GammaPass}, add the new
 * leaf predictions into the per-class 'tree' columns while resetting the node
 * ids to 0, and finally record the leaf count on each tree.
 *
 * @param fr frame the trees are built on; its 'tree' and node-id chunks are
 *           updated by the final pass
 * @return the k trees (entries for skipped classes stay null), or null when
 *         the job is no longer running
 */
private DTree[] buildNextKTrees(Frame fr) {
    // We're going to build K (nclass) trees - each focused on correcting
    // errors for a single class.
    final DTree[] ktrees = new DTree[_nclass];
    // Initial set of histograms.  All trees; one leaf per tree (the root
    // leaf); all columns
    DHistogram[][][] hcs = new DHistogram[_nclass][1][_ncols];
    // Adjust nbins for the top-levels: use at least 1024 bins for the root.
    int adj_nbins = Math.max((1 << (10 - 0)), nbins);
    for (int k = 0; k < _nclass; k++) {
        // Initially setup as-if an empty-split had just happened.
        // Classes with a zero count in the distribution get no tree.
        if (_distribution == null || _distribution[k] != 0) {
            // Boolean optimization: in a 2-class system the tree for class 1 is
            // not built at all (its slot stays null), on the assumption that it
            // would be the inverse of the class-0 tree.
            if (k == 1 && _nclass == 2)
                continue;
            ktrees[k] = new DTree(fr._names, _ncols, (char) nbins, (char) _nclass, min_rows);
            // The "root" node; the constructor is invoked for its effect on ktrees[k].
            new GBMUndecidedNode(ktrees[k], -1, DHistogram.initialHist(fr, _ncols, adj_nbins, hcs[k][0], min_rows, group_split, false));
        }
    }
    // Define a "working set" of leaf splits, from here to tree._len
    int[] leafs = new int[_nclass];
    // ----
    // ESL2, page 387.  Step 2b ii.
    // One Big Loop till the ktrees are of proper depth.
    // Adds a layer to the trees each pass.
    int depth = 0;
    for (; depth < max_depth; depth++) {
        // Abort (returning null) as soon as the job is no longer running.
        if (!Job.isRunning(self()))
            return null;
        hcs = buildLayer(fr, ktrees, leafs, hcs, false, false);
        // If we did not make any new splits, then the tree is split-to-death
        if (hcs == null)
            break;
    }
    // Each tree bottomed-out in a DecidedNode; go 1 more level and insert
    // LeafNodes to hold predictions.
    for (int k = 0; k < _nclass; k++) {
        DTree tree = ktrees[k];
        if (tree == null)
            continue;
        // Remember where the leaves start: the gamma loop below reads
        // tree.node(leafs[k] + i) as a LeafNode.
        int leaf = leafs[k] = tree.len();
        for (int nid = 0; nid < leaf; nid++) {
            if (tree.node(nid) instanceof DecidedNode) {
                DecidedNode dn = tree.decided(nid);
                for (int i = 0; i < dn._nids.length; i++) {
                    int cnid = dn._nids[i];
                    // A child slot is replaced by a new leaf in any of the three
                    // cases named inside the condition.
                    if (// Bottomed out (predictors or responses known constant)
                    cnid == -1 || // Or chopped off for depth
                    tree.node(cnid) instanceof UndecidedNode || (// Or not possible to split
                    tree.node(cnid) instanceof DecidedNode && ((DecidedNode) tree.node(cnid))._split.col() == -1))
                        // Mark a leaf here
                        dn._nids[i] = new GBMLeafNode(tree, nid).nid();
                }
                // Handle the trivial non-splitting tree
                if (nid == 0 && dn._split.col() == -1)
                    new GBMLeafNode(tree, -1, 0);
            }
        }
    }
    // -- k-trees are done
    // ----
    // ESL2, page 387.  Step 2b iii.  Compute the gammas, and store them back
    // into the tree leaves.  Includes learn_rate.
    // For classification (bernoulli):
    //    gamma_i = sum res_i / sum p_i*(1 - p_i) where p_i = y_i - res_i
    // For classification (multinomial):
    //    gamma_i_k = (nclass-1)/nclass * (sum res_i / sum (|res_i|*(1-|res_i|)))
    // For regression (gaussian):
    //    gamma_i = sum res_i / count(res_i)
    GammaPass gp = new GammaPass(ktrees, leafs).doAll(fr);
    // K-1/K for multinomial; 1.0 for a single class or the bernoulli family
    double m1class = _nclass > 1 && family != Family.bernoulli ? (double) (_nclass - 1) / _nclass : 1.0;
    for (int k = 0; k < _nclass; k++) {
        final DTree tree = ktrees[k];
        if (tree == null)
            continue;
        // i walks over the leaves only: nodes leafs[k] .. tree._len-1
        for (int i = 0; i < tree._len - leafs[k]; i++) {
            // When the denominator _gss is 0 the ratio is not computed:
            // g is 0 if the numerator _rss is also 0, otherwise a fixed 1000.
            double g = // Constant response?
            gp._gss[k][i] == 0 ? // Cap (exponential) learn, instead of dealing with Inf
            (gp._rss[k][i] == 0 ? 0 : 1000) : learn_rate * m1class * gp._rss[k][i] / gp._gss[k][i];
            assert !Double.isNaN(g);
            ((LeafNode) tree.node(leafs[k] + i))._pred = g;
        }
    }
    // ----
    // ESL2, page 387.  Step 2b iv.  Cache the sum of all the trees, plus the
    // new tree, in the 'tree' columns.  Also, zap the NIDs for next pass.
    // Tree <== f(Tree)
    // Nids <== 0
    new MRTask2() {

        @Override
        public void map(Chunk[] chks) {
            // For all tree/klasses
            for (int k = 0; k < _nclass; k++) {
                final DTree tree = ktrees[k];
                if (tree == null)
                    continue;
                final Chunk nids = chk_nids(chks, k);
                final Chunk ct = chk_tree(chks, k);
                for (int row = 0; row < nids._len; row++) {
                    int nid = (int) nids.at80(row);
                    // Rows with a negative node id are left untouched
                    // (NOTE(review): presumably rows not assigned to a leaf - confirm).
                    if (nid < 0)
                        continue;
                    // Prediction stored in Leaf is cut to float to be deterministic in reconstructing
                    // <tree_klazz> fields from tree prediction
                    ct.set0(row, (float) (ct.at0(row) + (float) ((LeafNode) tree.node(nid))._pred));
                    nids.set0(row, 0);
                }
            }
        }
    }.doAll(fr);
    // Collect leaves stats: everything created at or after leafs[i] is a leaf
    for (int i = 0; i < ktrees.length; i++) if (ktrees[i] != null)
        ktrees[i].leaves = ktrees[i].len() - leafs[i];
    return ktrees;
}
Also used : UndecidedNode(hex.gbm.DTree.UndecidedNode) DecidedNode(hex.gbm.DTree.DecidedNode) Chunk(water.fvec.Chunk) LeafNode(hex.gbm.DTree.LeafNode)

Aggregations

DecidedNode (hex.gbm.DTree.DecidedNode): 2, LeafNode (hex.gbm.DTree.LeafNode): 2, UndecidedNode (hex.gbm.DTree.UndecidedNode): 2, DHistogram (hex.gbm.DHistogram): 1, DTree (hex.gbm.DTree): 1, Chunk (water.fvec.Chunk): 1, Frame (water.fvec.Frame): 1