Search in sources :

Example 1 with GBDTParam

use of com.tencent.angel.ml.GBDT.param.GBDTParam in project angel by Tencent.

The method findBestSplitOfOneFeatureHelper of the class GradHistHelper.

/**
 * Finds the best split of one feature from its slice of a gradient histogram.
 *
 * <p>The slice starts at {@code startIdx}. For bin {@code i} the gradient sum is read from
 * {@code histogram.get(i)} and the hessian sum from {@code histogram.get(splitNum + i)},
 * where {@code splitNum} comes from the worker configuration
 * ({@code MLConf.ML_GBDT_SPLIT_NUM}). Only the first {@code splitNum - 1} bins of the slice
 * are tried as split points. Gains are computed with a freshly constructed {@link GBDTParam}.
 *
 * <p>The slice must satisfy {@code startIdx + 2 * splitNum <= histogram.getDim()}. This is
 * checked before the histogram is read; if it does not hold, an error is logged and the
 * untouched {@link SplitEntry} is returned without any grad stats assigned to it.
 *
 * @param fid       feature id passed to {@code SplitEntry.update} for every candidate split
 * @param histogram gradient histogram holding the feature's gradient and hessian bins
 * @param startIdx  index of the feature's first gradient bin in {@code histogram}
 * @return the split entry holding the best candidate accepted by {@code SplitEntry.update},
 *         together with the left/right grad stats recorded for it
 */
public static SplitEntry findBestSplitOfOneFeatureHelper(int fid, IntDoubleVector histogram, int startIdx) {
    LOG.debug(String.format("Find best split for fid[%d] in histogram size[%d], startIdx[%d]", fid, histogram.getDim(), startIdx));
    int splitNum = WorkerContext.get().getConf().getInt(MLConf.ML_GBDT_SPLIT_NUM(), MLConf.DEFAULT_ML_GBDT_SPLIT_NUM());
    SplitEntry splitEntry = new SplitEntry();
    // Reject an out-of-range slice BEFORE anything reads the histogram: the root stats
    // below are computed from this same slice, so they must not be touched first.
    if (startIdx + 2 * splitNum > histogram.getDim()) {
        LOG.error("index out of grad histogram size.");
        return splitEntry;
    }
    // 1. stats and gain of the un-split (root) node for this feature
    GBDTParam param = new GBDTParam();
    GradStats rootStats = calGradStats(histogram, startIdx, splitNum);
    float rootGain = rootStats.calcGain(param);
    LOG.debug(String.format("Feature[%d]: sumGrad[%f], sumHess[%f], gain[%f]", fid, rootStats.sumGrad, rootStats.sumHess, rootGain));
    // 2. best-so-far stats, plus running accumulators for the scan
    GradStats bestLeftStat = new GradStats();
    GradStats bestRightStat = new GradStats();
    GradStats leftStats = new GradStats();
    GradStats rightStats = new GradStats();
    // 3. scan the bins left to right; the last bin of the slice is never a split point
    int lastCandidateExclusive = startIdx + splitNum - 1;
    for (int histIdx = startIdx; histIdx < lastCandidateExclusive; histIdx++) {
        // 3.1. fold the current bin into the left side
        float grad = (float) histogram.get(histIdx);
        float hess = (float) histogram.get(splitNum + histIdx);
        leftStats.add(grad, hess);
        // 3.2. the left child needs enough hessian weight to be a valid child
        if (leftStats.sumHess >= param.minChildWeight) {
            // right = root - left
            rightStats.setSubstract(rootStats, leftStats);
            // 3.3. the right child needs enough hessian weight as well
            if (rightStats.sumHess >= param.minChildWeight) {
                // 3.4. gain of splitting here relative to not splitting
                float lossChg = leftStats.calcGain(param) + rightStats.calcGain(param) - rootGain;
                // split index is relative to the slice and 1-based
                int splitIdx = histIdx - startIdx + 1;
                // 3.5. keep the stats of the candidate only if the entry accepted it
                if (splitEntry.update(lossChg, fid, splitIdx)) {
                    bestLeftStat.update(leftStats.sumGrad, leftStats.sumHess);
                    bestRightStat.update(rightStats.sumGrad, rightStats.sumHess);
                }
            }
        }
    }
    // 4. attach the best left and right grad stats to the result
    splitEntry.leftGradStat = bestLeftStat;
    splitEntry.rightGradStat = bestRightStat;
    LOG.debug(String.format("Find best split for fid[%d], split feature[%d]: split index[%f], lossChg[%f]", fid, splitEntry.fid, splitEntry.fvalue, splitEntry.lossChg));
    return splitEntry;
}
Also used : SplitEntry(com.tencent.angel.ml.GBDT.algo.tree.SplitEntry) GBDTParam(com.tencent.angel.ml.GBDT.param.GBDTParam)

Example 2 with GBDTParam

use of com.tencent.angel.ml.GBDT.param.GBDTParam in project angel by Tencent.

The method partitionGet of the class GBDTGradHistGetRowFunc.

/**
 * Finds the best split for one server-side gradient-histogram row and returns it packed
 * into a small row of seven doubles.
 *
 * <p>The seven values, in order, are: feature id, split index, loss gain, left sum of
 * gradients, left sum of hessians, right sum of gradients, right sum of hessians. They are
 * written at columns {@code [startFid * 7, startFid * 7 + 7)}, where {@code startFid} is the
 * histogram row's start column divided by {@code 2 * numSplit}.
 *
 * @param partParam partition request; cast to {@code HistAggrParam.HistPartitionAggrParam}
 * @return a {@link PartitionGetRowResult} wrapping the row that carries the split result
 */
@Override
public PartitionGetResult partitionGet(PartitionGetParam partParam) {
    HistAggrParam.HistPartitionAggrParam aggrParam = (HistAggrParam.HistPartitionAggrParam) partParam;
    LOG.info("For the gradient histogram of GBDT, we use PS to find the optimal split");

    // Copy the tree hyper-parameters carried by the request.
    GBDTParam treeParam = new GBDTParam();
    treeParam.numSplit = aggrParam.getSplitNum();
    treeParam.minChildWeight = aggrParam.getMinChildWeight();
    treeParam.regAlpha = aggrParam.getRegAlpha();
    treeParam.regLambda = aggrParam.getRegLambda();

    // Fetch the histogram row of this partition and search it for the best split.
    ServerIntDoubleRow histRow = (ServerIntDoubleRow) psContext.getMatrixStorageManager()
        .getRow(aggrParam.getMatrixId(), aggrParam.getRowId(), aggrParam.getPartKey().getPartitionId());
    SplitEntry best = GradHistHelper.findSplitOfServerRow(histRow, treeParam);

    int fid = best.getFid();
    int splitIndex = (int) best.getFvalue();
    double lossGain = best.getLossChg();
    GradStats left = best.leftGradStat;
    GradStats right = best.rightGradStat;
    // Result layout: each split is described by these 7 doubles, in this order.
    double[] fields = {fid, splitIndex, lossGain, left.sumGrad, left.sumHess, right.sumGrad, right.sumHess};

    LOG.info(String.format("split of matrix[%d] part[%d] row[%d]: fid[%d], split index[%d], loss gain[%f], " + "left sumGrad[%f], left sum hess[%f], right sumGrad[%f], right sum hess[%f]", aggrParam.getMatrixId(), aggrParam.getPartKey().getPartitionId(), aggrParam.getRowId(), fid, splitIndex, fields[2], fields[3], fields[4], fields[5], fields[6]));

    // Index of the first feature held by this histogram row (2 * numSplit columns per feature).
    int startFid = ((int) histRow.getStartCol()) / (2 * treeParam.numSplit);
    int sendStartCol = startFid * fields.length;
    int sendEndCol = sendStartCol + fields.length;
    ServerIntDoubleRow sendRow = new ServerIntDoubleRow(aggrParam.getRowId(), RowType.T_DOUBLE_DENSE, sendStartCol, sendEndCol, fields.length, RouterType.RANGE);
    LOG.info(String.format("Create server row of split result: row id[%d], start col[%d], end col[%d]", aggrParam.getRowId(), sendStartCol, sendEndCol));

    for (int offset = 0; offset < fields.length; offset++) {
        sendRow.set(offset + sendStartCol, fields[offset]);
    }
    return new PartitionGetRowResult(sendRow);
}
Also used : ServerIntDoubleRow(com.tencent.angel.ps.storage.vector.ServerIntDoubleRow) SplitEntry(com.tencent.angel.ml.GBDT.algo.tree.SplitEntry) GradStats(com.tencent.angel.ml.GBDT.algo.RegTree.GradStats) PartitionGetRowResult(com.tencent.angel.ml.matrix.psf.get.getrow.PartitionGetRowResult) GBDTParam(com.tencent.angel.ml.GBDT.param.GBDTParam)

Aggregations

SplitEntry (com.tencent.angel.ml.GBDT.algo.tree.SplitEntry): 2, GBDTParam (com.tencent.angel.ml.GBDT.param.GBDTParam): 2, GradStats (com.tencent.angel.ml.GBDT.algo.RegTree.GradStats): 1, PartitionGetRowResult (com.tencent.angel.ml.matrix.psf.get.getrow.PartitionGetRowResult): 1, ServerIntDoubleRow (com.tencent.angel.ps.storage.vector.ServerIntDoubleRow): 1