Search in sources :

Example 6 with SplitEntry

use of com.tencent.angel.ml.GBDT.algo.tree.SplitEntry in project angel by Tencent.

the class GradHistHelper method findSplitOfFeature.

/**
 * Finds the best split of one feature from its gradient histogram held in a server row; used by
 * the PS.
 *
 * <p>Bin {@code i} of the feature reads its gradient sum at {@code startIdx + i} and its hessian
 * sum at {@code startIdx + param.numSplit + i}. A candidate split is only considered when both
 * the left and the right hessian sums reach {@code param.minChildWeight}.
 *
 * @param fid feature id stored in the returned entry
 * @param row server row holding the gradient histogram
 * @param startIdx index in {@code row} of the feature's first gradient bin
 * @param param provides {@code numSplit}, {@code minChildWeight} and the gain calculation
 * @return the best split found for this feature; when the feature's slice does not fit in the
 *     row, an error is logged and the entry is returned with only the feature id set
 */
public static SplitEntry findSplitOfFeature(int fid, ServerIntDoubleRow row, int startIdx, GBDTParam param) {
    LOG.debug(String.format("Find best split for fid[%d] in histogram size[%d], startIdx[%d]", fid, row.size(), startIdx));
    SplitEntry splitEntry = new SplitEntry();
    // 1. set the feature id
    splitEntry.setFid(fid);
    // 2. validate the slice BEFORE reading anything from it; the root stats below are computed
    // from the same slice, so they must not be aggregated ahead of this check
    if (startIdx + 2 * param.numSplit > row.getEndCol()) {
        LOG.error("index out of grad histogram size.");
        return splitEntry;
    }
    // 3. stats of the whole node and its gain
    // NOTE(review): calGradStats is defined outside this view; presumably it sums the feature's
    // grad/hess bins starting at startIdx - confirm
    GradStats rootStats = calGradStats(row, startIdx, param.numSplit);
    float rootGain = rootStats.calcGain(param);
    // 4. best left/right stats seen so far, plus the running left/right stats
    GradStats bestLeftStat = new GradStats();
    GradStats bestRightStat = new GradStats();
    GradStats leftStats = new GradStats();
    GradStats rightStats = new GradStats();
    // 5. loop over all the bins of this feature
    for (int histIdx = startIdx; histIdx < startIdx + param.numSplit; histIdx++) {
        // 5.1. accumulate the grad and hess of the current bin into the left side
        float grad = (float) row.get(histIdx);
        float hess = (float) row.get(param.numSplit + histIdx);
        leftStats.add(grad, hess);
        // 5.2. the left child must carry enough hessian weight
        if (leftStats.sumHess >= param.minChildWeight) {
            // right = root - left
            rightStats.setSubstract(rootStats, leftStats);
            // 5.3. the right child must carry enough hessian weight as well
            if (rightStats.sumHess >= param.minChildWeight) {
                // 5.4. loss change of splitting after this bin
                float lossChg = leftStats.calcGain(param) + rightStats.calcGain(param) - rootGain;
                // 5.5. split rule: value <= split; the task uses the bin index to find fvalue
                int splitIdx = histIdx - startIdx;
                if (splitEntry.update(lossChg, fid, splitIdx)) {
                    // 5.6. the entry accepted this candidate, so remember its grad stats
                    bestLeftStat.update(leftStats.sumGrad, leftStats.sumHess);
                    bestRightStat.update(rightStats.sumGrad, rightStats.sumHess);
                }
            }
        }
    }
    // 6. attach the best left and right grad stats
    splitEntry.leftGradStat = bestLeftStat;
    splitEntry.rightGradStat = bestRightStat;
    return splitEntry;
}
Also used : SplitEntry(com.tencent.angel.ml.GBDT.algo.tree.SplitEntry)

Example 7 with SplitEntry

use of com.tencent.angel.ml.GBDT.algo.tree.SplitEntry in project angel by Tencent.

the class GBDTGradHistGetRowFunc method merge.

/**
 * Merges the best split reported by every partition into a single best split.
 *
 * <p>Each partition result is read as a {@code ServerIntDoubleRow}. Counted from the row's start
 * column, the values read are: feature id, split index, loss gain, left grad sum, left hess sum,
 * right grad sum, right hess sum. Rows whose feature id is {@code -1} are skipped.
 *
 * @param partResults results returned by the partitions
 * @return a {@code GBDTGradHistGetRowResult} with {@code ResponseType.SUCCESS} wrapping the
 *     merged split entry
 */
@Override
public GetResult merge(List<PartitionGetResult> partResults) {
    SplitEntry splitEntry = new SplitEntry();
    for (PartitionGetResult partResult : partResults) {
        ServerIntDoubleRow row = (ServerIntDoubleRow) ((PartitionGetRowResult) partResult).getRowSplit();
        // all fields are addressed relative to the first column of this row
        int base = (int) row.getStartCol();
        int fid = (int) row.get(base);
        if (fid == -1) {
            // this partition reported no split
            continue;
        }
        int splitIndex = (int) row.get(base + 1);
        float lossGain = (float) row.get(base + 2);
        float leftSumGrad = (float) row.get(base + 3);
        float leftSumHess = (float) row.get(base + 4);
        float rightSumGrad = (float) row.get(base + 5);
        float rightSumHess = (float) row.get(base + 6);
        LOG.info(String.format("psFunc: the best split after looping a split: fid[%d], fvalue[%d], loss gain[%f]" + ", leftSumGrad[%f], leftSumHess[%f], rightSumGrad[%f], rightSumHess[%f]", fid, splitIndex, lossGain, leftSumGrad, leftSumHess, rightSumGrad, rightSumHess));
        SplitEntry curSplitEntry = new SplitEntry(fid, splitIndex, lossGain);
        curSplitEntry.leftGradStat = new GradStats(leftSumGrad, leftSumHess);
        curSplitEntry.rightGradStat = new GradStats(rightSumGrad, rightSumHess);
        // the merged entry decides whether this candidate replaces the current one
        splitEntry.update(curSplitEntry);
    }
    return new GBDTGradHistGetRowResult(ResponseType.SUCCESS, splitEntry);
}
Also used : ServerIntDoubleRow(com.tencent.angel.ps.storage.vector.ServerIntDoubleRow) SplitEntry(com.tencent.angel.ml.GBDT.algo.tree.SplitEntry) ArrayList(java.util.ArrayList) GradStats(com.tencent.angel.ml.GBDT.algo.RegTree.GradStats) ServerRow(com.tencent.angel.ps.storage.vector.ServerRow)

Example 8 with SplitEntry

use of com.tencent.angel.ml.GBDT.algo.tree.SplitEntry in project angel by Tencent.

the class AfterSplitThread method run.

/**
 * Applies the split decision of tree node {@code nid} to the current tree.
 *
 * <p>When a split feature was chosen, the node gets two children (ids {@code 2 * nid + 1} and
 * {@code 2 * nid + 2}), their stats are filled from {@code nodeGradStatsVec} (grad sums at the
 * node id, hess sums at node id + {@code maxNodeNum}), instance positions are updated, and the
 * children either become active nodes or leaves depending on the current depth. When no split
 * feature was chosen ({@code -1}), the node itself becomes a leaf. In both cases the node is
 * deactivated at the end.
 */
@Override
public void run() {
    final int splitFeature = splitFeatureVec.get(nid);
    final float splitValue = (float) splitValueVec.get(nid);
    final float splitGain = (float) splitGainVec.get(nid);
    final float nodeSumGrad = (float) nodeGradStatsVec.get(nid);
    final float nodeSumHess = (float) nodeGradStatsVec.get(nid + controller.maxNodeNum);
    LOG.info(String.format("Active node[%d]: split feature[%d] value[%f], lossChg[%f], sumGrad[%f], sumHess[%f]", nid, splitFeature, splitValue, splitGain, nodeSumGrad, nodeSumHess));
    if (splitFeature == -1) {
        // no split was found: this node ends up as a leaf
        controller.setNodeToLeaf(nid, controller.param.calcWeight(nodeSumGrad, nodeSumHess));
        LOG.debug(String.format("Set node %d to leaf", nid));
    } else {
        final int leftId = 2 * nid + 1;
        final int rightId = 2 * nid + 2;
        // link the children to this node
        controller.forest[controller.currentTree].nodes.get(nid).setLeftChild(leftId);
        controller.forest[controller.currentTree].nodes.get(nid).setRightChild(rightId);
        // record the split and the grad stats on this node
        SplitEntry nodeSplit = new SplitEntry(splitFeature, splitValue, splitGain);
        controller.forest[controller.currentTree].stats.get(nid).setSplitEntry(nodeSplit);
        controller.forest[controller.currentTree].stats.get(nid).lossChg = splitGain;
        controller.forest[controller.currentTree].stats.get(nid).setStats(nodeSumGrad, nodeSumHess);
        // materialize both children in the tree
        TNode leftNode = new TNode(leftId, nid, -1, -1);
        TNode rightNode = new TNode(rightId, nid, -1, -1);
        controller.forest[controller.currentTree].nodes.set(leftId, leftNode);
        controller.forest[controller.currentTree].nodes.set(rightId, rightNode);
        LOG.debug(String.format("Add children nodes: %d and %d", leftId, rightId));
        // build the children's stats from the pulled grad/hess sums
        RegTNodeStat leftStat = new RegTNodeStat(controller.param);
        RegTNodeStat rightStat = new RegTNodeStat(controller.param);
        float leftGrad = (float) nodeGradStatsVec.get(leftId);
        float rightGrad = (float) nodeGradStatsVec.get(rightId);
        float leftHess = (float) nodeGradStatsVec.get(leftId + controller.maxNodeNum);
        float rightHess = (float) nodeGradStatsVec.get(rightId + controller.maxNodeNum);
        leftStat.setStats(leftGrad, leftHess);
        rightStat.setStats(rightGrad, rightHess);
        controller.forest[controller.currentTree].stats.set(leftId, leftStat);
        controller.forest[controller.currentTree].stats.set(rightId, rightStat);
        // move the training instances of this node according to the split
        controller.updateTrainInsPos(nid, splitFeature, splitValue);
        // children stay active while the depth limit allows it, otherwise they become leaves
        boolean canGrow = controller.currentDepth < controller.param.maxDepth - 1;
        if (canGrow) {
            LOG.debug(String.format("Add children nodes of node[%d]:[%d][%d] to active nodes", nid, leftId, rightId));
            controller.addActiveNode(leftId);
            controller.addActiveNode(rightId);
        } else {
            LOG.debug(String.format("Set children nodes of node[%d]:[%d][%d] to leaf nodes", nid, leftId, rightId));
            controller.setNodeToLeaf(leftId, leftStat.baseWeight);
            controller.setNodeToLeaf(rightId, rightStat.baseWeight);
        }
    }
    // this node has been handled either way, so deactivate it
    controller.resetActiveTNodes(nid);
}
Also used : TNode(com.tencent.angel.ml.GBDT.algo.tree.TNode) SplitEntry(com.tencent.angel.ml.GBDT.algo.tree.SplitEntry) RegTNodeStat(com.tencent.angel.ml.GBDT.algo.RegTree.RegTNodeStat)

Example 9 with SplitEntry

use of com.tencent.angel.ml.GBDT.algo.tree.SplitEntry in project angel by Tencent.

the class GBDTController method findSplit.

/**
 * Finds the best split of every active tree node this task is responsible for and pushes the
 * results (split feature, split value, split gain) to the PS.
 *
 * <p>Active nodes are assigned to tasks round-robin. For each responsible node the gradient
 * histogram is either reduced to a split on the server (server-split mode) or pulled and
 * searched locally. A single flag, read once from the task configuration, selects the mode for
 * both the pull and the processing step so the two can never disagree.
 *
 * @throws Exception declared by the original signature; not narrowed here because the throwing
 *     calls are defined outside this view
 */
public void findSplit() throws Exception {
    LOG.info("------Find split------");
    long startTime = System.currentTimeMillis();
    // 1. find responsible tree node, using RR scheme
    List<Integer> responsibleTNode = new ArrayList<>();
    int activeTNodeNum = 0;
    for (int nid = 0; nid < this.activeNode.length; nid++) {
        int isActive = this.activeNode[nid];
        if (isActive == 1) {
            if (this.taskContext.getTaskIndex() == activeTNodeNum) {
                responsibleTNode.add(nid);
            }
            // wrap the counter so active nodes are dealt out to tasks in turn
            if (++activeTNodeNum >= taskContext.getTotalTaskNum()) {
                activeTNodeNum = 0;
            }
        }
    }
    int[] tNodeId = Maths.intList2Arr(responsibleTNode);
    LOG.info(String.format("Task[%d] responsible tree node: %s", this.taskContext.getTaskId().getIndex(), responsibleTNode.toString()));
    // 2. pull gradient histogram
    // the updated indices of the parameter on PS
    int[] updatedIndices = new int[tNodeId.length];
    // the updated split features
    int[] updatedSplitFid = new int[tNodeId.length];
    // the updated split value
    double[] updatedSplitFvalue = new double[tNodeId.length];
    // the updated split gain
    double[] updatedSplitGain = new double[tNodeId.length];
    // the one and only mode switch: it decides what is pulled AND how the result is processed
    boolean isServerSplit = taskContext.getConf().getBoolean(MLConf.ML_GBDT_SERVER_SPLIT(), MLConf.DEFAULT_ML_GBDT_SERVER_SPLIT());
    for (int i = 0; i < tNodeId.length; i++) {
        int nid = tNodeId[i];
        LOG.debug(String.format("Task[%d] find best split of tree node: %d", this.taskContext.getTaskIndex(), nid));
        // 2.1. get the name of this node's gradient histogram on PS
        String gradHistName = this.param.gradHistNamePrefix + nid;
        // 2.2. pull the histogram (or, in server-split mode, the split computed on the PS)
        long pullStartTime = System.currentTimeMillis();
        PSModel histMat = model.getPSModel(gradHistName);
        IntDoubleVector histogram = null;
        SplitEntry splitEntry = null;
        if (isServerSplit) {
            int matrixId = histMat.getMatrixId();
            GBDTGradHistGetRowFunc func = new GBDTGradHistGetRowFunc(new HistAggrParam(matrixId, 0, param.numSplit, param.minChildWeight, param.regAlpha, param.regLambda));
            splitEntry = ((GBDTGradHistGetRowResult) histMat.get(func)).getSplitEntry();
        } else {
            histogram = (IntDoubleVector) histMat.getRow(0);
            LOG.debug("Get grad histogram without server split mode, histogram size" + histogram.getDim());
        }
        LOG.info(String.format("Pull histogram from PS cost %d ms", System.currentTimeMillis() - pullStartTime));
        GradHistHelper histHelper = new GradHistHelper(this, nid);
        // 2.3. find best split result of this tree node
        if (isServerSplit) {
            // 2.3.1 using server split: the entry carries a sampled-feature index and a bin index,
            // which are translated to the true feature id and split value here
            if (splitEntry.getFid() != -1) {
                int trueSplitFid = this.fSet[splitEntry.getFid()];
                int splitIdx = (int) splitEntry.getFvalue();
                float trueSplitValue = this.sketches[trueSplitFid * this.param.numSplit + splitIdx];
                LOG.info(String.format("Best split of node[%d]: feature[%d], value[%f], " + "true feature[%d], true value[%f], losschg[%f]", nid, splitEntry.getFid(), splitEntry.getFvalue(), trueSplitFid, trueSplitValue, splitEntry.getLossChg()));
                splitEntry.setFid(trueSplitFid);
                splitEntry.setFvalue(trueSplitValue);
            }
            // update the grad stats of the root node on PS, only called once by leader worker
            if (nid == 0) {
                // root = left + right
                GradStats rootStats = new GradStats(splitEntry.leftGradStat);
                rootStats.add(splitEntry.rightGradStat);
                this.updateNodeGradStats(nid, rootStats);
            }
            // update the grad stats of children node
            if (splitEntry.fid != -1) {
                // update the left child
                this.updateNodeGradStats(2 * nid + 1, splitEntry.leftGradStat);
                // update the right child
                this.updateNodeGradStats(2 * nid + 2, splitEntry.rightGradStat);
            }
        } else {
            // 2.3.2 otherwise, the returned histogram contains the gradient info
            splitEntry = histHelper.findBestSplit(histogram);
            LOG.info(String.format("Best split of node[%d]: feature[%d], value[%f], losschg[%f]", nid, splitEntry.getFid(), splitEntry.getFvalue(), splitEntry.getLossChg()));
        }
        // 2.3.3 the updated split result (tree node/feature/value/gain) on PS, same for both modes
        updatedIndices[i] = nid;
        updatedSplitFid[i] = splitEntry.fid;
        updatedSplitFvalue[i] = splitEntry.fvalue;
        updatedSplitGain[i] = splitEntry.lossChg;
        // 2.3.4 reset this tree node's gradient histogram to 0
        histMat.zero();
    }
    // 3. push split feature to PS
    IntIntVector splitFeatureVector = new IntIntVector(this.activeNode.length, new IntIntDenseVectorStorage(this.activeNode.length));
    // 4. push split value to PS
    IntDoubleVector splitValueVector = new IntDoubleVector(this.activeNode.length, new IntDoubleDenseVectorStorage(this.activeNode.length));
    // 5. push split gain to PS
    IntDoubleVector splitGainVector = new IntDoubleVector(this.activeNode.length, new IntDoubleDenseVectorStorage(this.activeNode.length));
    for (int i = 0; i < updatedIndices.length; i++) {
        splitFeatureVector.set(updatedIndices[i], updatedSplitFid[i]);
        splitValueVector.set(updatedIndices[i], updatedSplitFvalue[i]);
        splitGainVector.set(updatedIndices[i], updatedSplitGain[i]);
    }
    PSModel splitFeat = model.getPSModel(this.param.splitFeaturesName);
    splitFeat.increment(this.currentTree, splitFeatureVector);
    PSModel splitValue = model.getPSModel(this.param.splitValuesName);
    splitValue.increment(this.currentTree, splitValueVector);
    PSModel splitGain = model.getPSModel(this.param.splitGainsName);
    splitGain.increment(this.currentTree, splitGainVector);
    // 6. set phase to AFTER_SPLIT
    // this.phase = GBDTPhase.AFTER_SPLIT;
    LOG.info(String.format("Find split cost: %d ms", System.currentTimeMillis() - startTime));
    // clock
    Set<String> needFlushMatrixSet = new HashSet<String>(3);
    needFlushMatrixSet.add(this.param.splitFeaturesName);
    needFlushMatrixSet.add(this.param.splitValuesName);
    needFlushMatrixSet.add(this.param.splitGainsName);
    needFlushMatrixSet.add(this.param.nodeGradStatsName);
    clockAllMatrix(needFlushMatrixSet, true);
}
Also used : PSModel(com.tencent.angel.ml.model.PSModel) SplitEntry(com.tencent.angel.ml.GBDT.algo.tree.SplitEntry) HistAggrParam(com.tencent.angel.ml.GBDT.psf.HistAggrParam) GBDTGradHistGetRowFunc(com.tencent.angel.ml.GBDT.psf.GBDTGradHistGetRowFunc) IntIntVector(com.tencent.angel.ml.math2.vector.IntIntVector) IntDoubleVector(com.tencent.angel.ml.math2.vector.IntDoubleVector) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IntDoubleDenseVectorStorage(com.tencent.angel.ml.math2.storage.IntDoubleDenseVectorStorage) IntIntDenseVectorStorage(com.tencent.angel.ml.math2.storage.IntIntDenseVectorStorage)

Example 10 with SplitEntry

use of com.tencent.angel.ml.GBDT.algo.tree.SplitEntry in project angel by Tencent.

the class GradHistHelper method findBestSplitHelper.

/**
 * Searches the gradient histogram of a tree node for its best split.
 *
 * <p>The histogram is expected to hold {@code 2 * splitNum} values per feature, with the feature
 * count and split count both read from the worker configuration. When the histogram dimension
 * does not match that layout, the freshly created (default) split entry is returned unchanged.
 *
 * @param histogram gradient histogram of one tree node
 * @return the best split over all features, or a default entry on a size mismatch
 * @throws InterruptedException declared by the original signature
 */
public static SplitEntry findBestSplitHelper(IntDoubleVector histogram) throws InterruptedException {
    LOG.debug(String.format("------To find the best split of histogram size[%d]------", histogram.getDim()));
    SplitEntry best = new SplitEntry();
    LOG.debug(String.format("The best split before looping the histogram: fid[%d], fvalue[%f]", best.fid, best.fvalue));
    int numFeatures = WorkerContext.get().getConf().getInt(MLConf.ML_FEATURE_INDEX_RANGE(), MLConf.DEFAULT_ML_FEATURE_INDEX_RANGE());
    int binsPerFeature = WorkerContext.get().getConf().getInt(MLConf.ML_GBDT_SPLIT_NUM(), MLConf.DEFAULT_ML_GBDT_SPLIT_NUM());
    // every feature owns a contiguous slice of this many values
    int slice = 2 * binsPerFeature;
    if (histogram.getDim() == numFeatures * slice) {
        int sliceStart = 0;
        for (int feature = 0; feature < numFeatures; feature++) {
            // keep the candidate of this feature only if it beats the current best
            best.update(findBestSplitOfOneFeatureHelper(feature, histogram, sliceStart));
            sliceStart += slice;
        }
        LOG.debug(String.format("The best split after looping the histogram: fid[%d], fvalue[%f], loss gain[%f]", best.fid, best.fvalue, best.lossChg));
    } else {
        LOG.debug("The size of histogram is not equal to 2 * featureNum*splitNum.");
    }
    return best;
}
Also used : SplitEntry(com.tencent.angel.ml.GBDT.algo.tree.SplitEntry)

Aggregations

SplitEntry (com.tencent.angel.ml.GBDT.algo.tree.SplitEntry)19 GradStats (com.tencent.angel.ml.GBDT.algo.RegTree.GradStats)2 RegTNodeStat (com.tencent.angel.ml.GBDT.algo.RegTree.RegTNodeStat)2 TNode (com.tencent.angel.ml.GBDT.algo.tree.TNode)2 GBDTParam (com.tencent.angel.ml.GBDT.param.GBDTParam)2 ServerIntDoubleRow (com.tencent.angel.ps.storage.vector.ServerIntDoubleRow)2 GBDTGradHistGetRowFunc (com.tencent.angel.ml.GBDT.psf.GBDTGradHistGetRowFunc)1 HistAggrParam (com.tencent.angel.ml.GBDT.psf.HistAggrParam)1 IntDoubleDenseVectorStorage (com.tencent.angel.ml.math2.storage.IntDoubleDenseVectorStorage)1 IntIntDenseVectorStorage (com.tencent.angel.ml.math2.storage.IntIntDenseVectorStorage)1 IntDoubleVector (com.tencent.angel.ml.math2.vector.IntDoubleVector)1 IntIntVector (com.tencent.angel.ml.math2.vector.IntIntVector)1 PartitionGetRowResult (com.tencent.angel.ml.matrix.psf.get.getrow.PartitionGetRowResult)1 PSModel (com.tencent.angel.ml.model.PSModel)1 GBDTParam (com.tencent.angel.ml.param.GBDTParam)1 ServerRow (com.tencent.angel.ps.storage.vector.ServerRow)1 ArrayList (java.util.ArrayList)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1