Search in sources:

Example 1 with ConfusionMatrixObject

use of ml.shifu.shifu.container.ConfusionMatrixObject in project shifu by ShifuML.

the class PerformanceEvaluator method review.

/**
 * Loads the evaluation confusion-matrix file and passes the parsed rows on to
 * {@code review(List, int)}.
 *
 * <p>Each line of the file is split on {@code '|'}; the first nine fields are parsed as
 * doubles in the order tp, fp, fn, tn, weighted tp, weighted fp, weighted fn, weighted tn,
 * score.
 *
 * @throws IOException if the file cannot be read or closed
 * @throws ShifuException with {@code ERROR_EVALCONFMTR} if the file contains no lines
 */
public void review() throws IOException {
    PathFinder pathFinder = new PathFinder(modelConfig);
    log.info("Loading confusion matrix in {}", pathFinder.getEvalMatrixPath(evalConfig, evalConfig.getDataSet().getSource()));
    BufferedReader reader = ShifuFileUtils.getReader(pathFinder.getEvalMatrixPath(evalConfig, evalConfig.getDataSet().getSource()), evalConfig.getDataSet().getSource());
    List<ConfusionMatrixObject> matrixList = new ArrayList<ConfusionMatrixObject>();
    int cnt = 0;
    try {
        String line = null;
        while ((line = reader.readLine()) != null) {
            cnt++;
            String[] raw = line.split("\\|");
            ConfusionMatrixObject matrix = new ConfusionMatrixObject();
            matrix.setTp(Double.parseDouble(raw[0]));
            matrix.setFp(Double.parseDouble(raw[1]));
            matrix.setFn(Double.parseDouble(raw[2]));
            matrix.setTn(Double.parseDouble(raw[3]));
            matrix.setWeightedTp(Double.parseDouble(raw[4]));
            matrix.setWeightedFp(Double.parseDouble(raw[5]));
            matrix.setWeightedFn(Double.parseDouble(raw[6]));
            matrix.setWeightedTn(Double.parseDouble(raw[7]));
            matrix.setScore(Double.parseDouble(raw[8]));
            matrixList.add(matrix);
        }
    } finally {
        // Always release the reader, including when a malformed line makes parsing throw.
        reader.close();
    }
    if (0 == cnt) {
        log.info("No result read, please check EvalConfusionMatrix file");
        throw new ShifuException(ShifuErrorCode.ERROR_EVALCONFMTR);
    }
    review(matrixList, cnt);
}
Also used : BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) PathFinder(ml.shifu.shifu.fs.PathFinder) ConfusionMatrixObject(ml.shifu.shifu.container.ConfusionMatrixObject) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 2 with ConfusionMatrixObject

use of ml.shifu.shifu.container.ConfusionMatrixObject in project shifu by ShifuML.

the class ConfusionMatrixCalculator method calculate.

/**
 * Streams the cumulative confusion matrices for {@code moList} to {@code writer}.
 *
 * <p>An initial matrix is written first, in which every positive counts as a false negative
 * and every negative as a true negative, with score 1000. Then, for each result in
 * {@code moList} order, one instance is moved from FN to TP (positive tag) or from TN to FP
 * (negative tag), and the updated matrix is written with that result's score.
 *
 * @param writer destination handed to {@code saveConfusionMaxtrixWithWriter}
 */
public void calculate(BufferedWriter writer) {
    // Primitive accumulators: boxed Double would be unboxed and re-boxed on every +=.
    double sumPos = 0.0, sumNeg = 0.0, sumWeightedPos = 0.0, sumWeightedNeg = 0.0;
    for (ModelResultObject mo : moList) {
        if (posTags.contains(mo.getTag())) {
            // Positive
            sumPos += posScaleFactor;
            sumWeightedPos += mo.getWeight() * posScaleFactor;
        } else {
            // Negative
            sumNeg += negScaleFactor;
            sumWeightedNeg += mo.getWeight() * negScaleFactor;
        }
    }
    // Starting point: nothing has been predicted positive yet.
    ConfusionMatrixObject prevCmo = new ConfusionMatrixObject();
    prevCmo.setTp(0.0);
    prevCmo.setFp(0.0);
    prevCmo.setFn(sumPos);
    prevCmo.setTn(sumNeg);
    prevCmo.setWeightedTp(0.0);
    prevCmo.setWeightedFp(0.0);
    prevCmo.setWeightedFn(sumWeightedPos);
    prevCmo.setWeightedTn(sumWeightedNeg);
    prevCmo.setScore(1000);
    saveConfusionMaxtrixWithWriter(writer, prevCmo);
    for (ModelResultObject mo : moList) {
        // Each matrix is the previous one plus the effect of the current record.
        ConfusionMatrixObject cmo = new ConfusionMatrixObject(prevCmo);
        if (posTags.contains(mo.getTag())) {
            // Positive Instance
            cmo.setTp(cmo.getTp() + posScaleFactor);
            cmo.setFn(cmo.getFn() - posScaleFactor);
            cmo.setWeightedTp(cmo.getWeightedTp() + mo.getWeight() * posScaleFactor);
            cmo.setWeightedFn(cmo.getWeightedFn() - mo.getWeight() * posScaleFactor);
        } else {
            // Negative Instance
            cmo.setFp(cmo.getFp() + negScaleFactor);
            cmo.setTn(cmo.getTn() - negScaleFactor);
            cmo.setWeightedFp(cmo.getWeightedFp() + mo.getWeight() * negScaleFactor);
            cmo.setWeightedTn(cmo.getWeightedTn() - mo.getWeight() * negScaleFactor);
        }
        cmo.setScore(mo.getScore());
        saveConfusionMaxtrixWithWriter(writer, cmo);
        prevCmo = cmo;
    }
}
Also used : ModelResultObject(ml.shifu.shifu.container.ModelResultObject) ConfusionMatrixObject(ml.shifu.shifu.container.ConfusionMatrixObject)

Example 3 with ConfusionMatrixObject

use of ml.shifu.shifu.container.ConfusionMatrixObject in project shifu by ShifuML.

the class ConfusionMatrixCalculator method calculate.

/**
 * Builds the cumulative confusion matrices for {@code moList} in memory.
 *
 * <p>The first element is the initial matrix, in which every positive counts as a false
 * negative and every negative as a true negative, carrying the score of the first result.
 * Each following element moves one instance from FN to TP (positive tag) or from TN to FP
 * (negative tag) and carries that result's score.
 *
 * @return one matrix more than there are results in {@code moList}, or an empty list when
 *         {@code moList} is empty
 */
public List<ConfusionMatrixObject> calculate() {
    List<ConfusionMatrixObject> cmoList = new ArrayList<ConfusionMatrixObject>();
    // Without any result there is no score to seed the initial matrix with; previously
    // moList.get(0) failed here with IndexOutOfBoundsException.
    if (moList.isEmpty()) {
        return cmoList;
    }
    // Calculate the sum (primitive accumulators avoid boxing on every +=)
    double sumPos = 0.0, sumNeg = 0.0, sumWeightedPos = 0.0, sumWeightedNeg = 0.0;
    for (ModelResultObject mo : moList) {
        if (posTags.contains(mo.getTag())) {
            // Positive
            sumPos += posScaleFactor;
            sumWeightedPos += mo.getWeight() * posScaleFactor;
        } else {
            // Negative
            sumNeg += negScaleFactor;
            sumWeightedNeg += mo.getWeight() * negScaleFactor;
        }
    }
    // init ConfusionMatrix
    ConfusionMatrixObject initCmo = new ConfusionMatrixObject();
    initCmo.setTp(0.0);
    initCmo.setFp(0.0);
    initCmo.setFn(sumPos);
    initCmo.setTn(sumNeg);
    initCmo.setWeightedTp(0.0);
    initCmo.setWeightedFp(0.0);
    initCmo.setWeightedFn(sumWeightedPos);
    initCmo.setWeightedTn(sumWeightedNeg);
    initCmo.setScore(moList.get(0).getScore());
    cmoList.add(initCmo);
    // Calculate the rest
    ConfusionMatrixObject prevCmo = initCmo;
    for (ModelResultObject mo : moList) {
        // Each matrix is the previous one plus the effect of the current record.
        ConfusionMatrixObject cmo = new ConfusionMatrixObject(prevCmo);
        if (posTags.contains(mo.getTag())) {
            // Positive Instance
            cmo.setTp(cmo.getTp() + posScaleFactor);
            cmo.setFn(cmo.getFn() - posScaleFactor);
            cmo.setWeightedTp(cmo.getWeightedTp() + mo.getWeight() * posScaleFactor);
            cmo.setWeightedFn(cmo.getWeightedFn() - mo.getWeight() * posScaleFactor);
        } else {
            // Negative Instance
            cmo.setFp(cmo.getFp() + negScaleFactor);
            cmo.setTn(cmo.getTn() - negScaleFactor);
            cmo.setWeightedFp(cmo.getWeightedFp() + mo.getWeight() * negScaleFactor);
            cmo.setWeightedTn(cmo.getWeightedTn() - mo.getWeight() * negScaleFactor);
        }
        cmo.setScore(mo.getScore());
        cmoList.add(cmo);
        prevCmo = cmo;
    }
    return cmoList;
}
Also used : ModelResultObject(ml.shifu.shifu.container.ModelResultObject) ArrayList(java.util.ArrayList) ConfusionMatrixObject(ml.shifu.shifu.container.ConfusionMatrixObject)

Example 4 with ConfusionMatrixObject

use of ml.shifu.shifu.container.ConfusionMatrixObject in project shifu by ShifuML.

the class ConfusionMatrix method buildInitalCmo.

/**
 * Creates the starting confusion matrix: true/false positive counts are zero, the supplied
 * positive totals are stored as false negatives and the supplied negative totals as true
 * negatives.
 *
 * @param pigPosTags       value stored as the false-negative count
 * @param pigNegTags       value stored as the true-negative count
 * @param pigPosWeightTags value stored as the weighted false-negative count
 * @param pigNegWeightTags value stored as the weighted true-negative count
 * @param maxScore         score assigned to the matrix
 * @return the newly created matrix
 */
private ConfusionMatrixObject buildInitalCmo(long pigPosTags, long pigNegTags, double pigPosWeightTags, double pigNegWeightTags, double maxScore) {
    final ConfusionMatrixObject initial = new ConfusionMatrixObject();

    // Unweighted counts.
    initial.setTp(0.0);
    initial.setFp(0.0);
    initial.setFn(pigPosTags);
    initial.setTn(pigNegTags);

    // Weighted counts.
    initial.setWeightedTp(0.0);
    initial.setWeightedFp(0.0);
    initial.setWeightedFn(pigPosWeightTags);
    initial.setWeightedTn(pigNegWeightTags);

    initial.setScore(maxScore);
    return initial;
}
Also used : ConfusionMatrixObject(ml.shifu.shifu.container.ConfusionMatrixObject)

Example 5 with ConfusionMatrixObject

use of ml.shifu.shifu.container.ConfusionMatrixObject in project shifu by ShifuML.

the class ConfusionMatrix method bufferedComputeConfusionMatrixAndPerformance.

/**
 * Reads the score files row by row, maintains a running confusion matrix and collects
 * bucketed performance points (FPR, catch rate, gain, model score, plus the weighted
 * variants of the first three). The result is written to {@code evalPerformancePath} and
 * optionally logged and charted.
 *
 * @param pigPosTags        positive total; seeds the initial false-negative count and is part
 *                          of the gain denominator
 * @param pigNegTags        negative total; seeds the initial true-negative count and is part
 *                          of the gain denominator
 * @param pigPosWeightTags  weighted positive total; seeds the initial weighted FN count
 * @param pigNegWeightTags  weighted negative total; seeds the initial weighted TN count
 * @param records           not read by this method
 * @param maxPScore         maximum raw score; used for rescaling and, when
 *                          {@code isUseMaxMinScore} applies, as the upper score bound
 * @param minPScore         minimum raw score; used the same way as the lower bound
 * @param scoreDataPath     location the score scanners are opened on
 * @param evalPerformancePath path passed to {@code writePerResult2File}
 * @param isPrint           whether to log the bucketed lists and AUC result
 * @param isGenerateChart   whether to call {@code generateChartAndJsonPerfFiles}
 * @param targetColumnIndex index of the tag column in a split row
 * @param scoreColumnIndex  index of the score column in a split row
 * @param weightColumnIndex index of the weight column; only read when greater than 0
 * @param isUseMaxMinScore  use {@code maxPScore}/{@code minPScore} as score bounds unless the
 *                          GBT score conversion is active
 * @return the assembled performance result
 * @throws IOException propagated from the file helpers
 * @throws ShifuException with {@code ERROR_EVALSCORE} when no line was read at all
 */
public PerformanceResult bufferedComputeConfusionMatrixAndPerformance(long pigPosTags, long pigNegTags, double pigPosWeightTags, double pigNegWeightTags, long records, double maxPScore, double minPScore, String scoreDataPath, String evalPerformancePath, boolean isPrint, boolean isGenerateChart, int targetColumnIndex, int scoreColumnIndex, int weightColumnIndex, boolean isUseMaxMinScore) throws IOException {
    // 1. compute maxScore and minScore in case some scores are not in [0, 1]
    double maxScore = 1d * scoreScale, minScore = 0d;
    if (isGBTNeedConvertScore()) {
    // if need convert to [0, 1], just keep max score to 1 and min score to 0 without doing anything
    } else {
        if (isUseMaxMinScore) {
            // TODO some cases maxPScore is already scaled, how to fix that issue
            maxScore = maxPScore;
            minScore = minPScore;
        } else {
        // otherwise, keep [0, 1]
        }
    }
    LOG.info("{} Transformed (scale included) max score is {}, transformed min score is {}", evalConfig.getGbtScoreConvertStrategy(), maxScore, minScore);
    SourceType sourceType = evalConfig.getDataSet().getSource();
    List<Scanner> scanners = ShifuFileUtils.getDataScanners(scoreDataPath, sourceType);
    LOG.info("Number of score files is {} in eval {}.", scanners.size(), evalConfig.getName());
    int numBucket = evalConfig.getPerformanceBucketNum();
    boolean hasWeight = StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName());
    boolean isDir = ShifuFileUtils.isDir(pathFinder.getEvalScorePath(evalConfig, sourceType), sourceType);
    List<PerformanceObject> FPRList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> catchRateList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> gainList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> modelScoreList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> FPRWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> catchRateWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> gainWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    // binScore: score width of one bucket; binCapacity: rate width of one bucket.
    double binScore = (maxScore - minScore) * 1d / numBucket, binCapacity = 1.0 / numBucket, scoreBinCount = 0, scoreBinWeigthedCount = 0;
    // Next bucket number to be emitted for each of the seven lists.
    int fpBin = 1, tpBin = 1, gainBin = 1, fpWeightBin = 1, tpWeightBin = 1, gainWeightBin = 1, modelScoreBin = 1;
    long index = 0, cnt = 0, invalidTargetCnt = 0, invalidWgtCnt = 0;
    ConfusionMatrixObject prevCmo = buildInitalCmo(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore);
    // Every list starts with the same first point.
    PerformanceObject po = buildFirstPO(prevCmo);
    FPRList.add(po);
    catchRateList.add(po);
    gainList.add(po);
    FPRWeightList.add(po);
    catchRateWeightList.add(po);
    gainWeightList.add(po);
    modelScoreList.add(po);
    boolean isGBTScoreHalfCutoffStreategy = isGBTScoreHalfCutoffStreategy();
    boolean isGBTScoreMaxMinScaleStreategy = isGBTScoreMaxMinScaleStreategy();
    Splitter splitter = Splitter.on(delimiter).trimResults();
    try {
        for (Scanner scanner : scanners) {
            while (scanner.hasNext()) {
                if ((++cnt) % 100000L == 0L) {
                    LOG.info("Loaded {} records.", cnt);
                }
                if ((!isDir) && cnt == 1) {
                    // If the evaluation score file is a single local file, skip its first line
                    // (presumably a header line added when the file was written). The line must
                    // be consumed here, otherwise it is parsed as a data row on the next pass.
                    scanner.nextLine();
                    continue;
                }
                // score is separated by default delimiter in our pig output format
                String[] raw = Lists.newArrayList(splitter.split(scanner.nextLine())).toArray(new String[0]);
                // tag check
                String tag = raw[targetColumnIndex];
                if (StringUtils.isBlank(tag) || (!posTags.contains(tag) && !negTags.contains(tag))) {
                    invalidTargetCnt += 1;
                    continue;
                }
                double weight = 1d;
                // if has weight
                if (weightColumnIndex > 0) {
                    try {
                        weight = Double.parseDouble(raw[weightColumnIndex]);
                    } catch (NumberFormatException e) {
                        // Unparseable weight: count it and keep the default weight of 1.
                        invalidWgtCnt += 1;
                    }
                    if (weight < 0d) {
                        invalidWgtCnt += 1;
                        weight = 1d;
                    }
                }
                double score = 0.0;
                try {
                    score = Double.parseDouble(raw[scoreColumnIndex]);
                } catch (NumberFormatException e) {
                    // user set the score column wrong ?
                    // Only a random ~5% of such rows are logged to keep the log readable.
                    if (Math.random() < 0.05) {
                        LOG.warn("The score column - {} is not number. Is score column set correctly?", raw[scoreColumnIndex]);
                    }
                    continue;
                }
                scoreBinCount += 1;
                scoreBinWeigthedCount += weight;
                // Each matrix is the previous one plus the effect of the current record.
                ConfusionMatrixObject cmo = new ConfusionMatrixObject(prevCmo);
                if (posTags.contains(tag)) {
                    // Positive Instance
                    cmo.setTp(cmo.getTp() + 1);
                    cmo.setFn(cmo.getFn() - 1);
                    cmo.setWeightedTp(cmo.getWeightedTp() + weight * 1.0);
                    cmo.setWeightedFn(cmo.getWeightedFn() - weight * 1.0);
                } else {
                    // Negative Instance
                    cmo.setFp(cmo.getFp() + 1);
                    cmo.setTn(cmo.getTn() - 1);
                    cmo.setWeightedFp(cmo.getWeightedFp() + weight * 1.0);
                    cmo.setWeightedTn(cmo.getWeightedTn() - weight * 1.0);
                }
                if (isGBTScoreHalfCutoffStreategy) {
                    // clamp negatives to 0, then scale by the max score and scoreScale
                    if (score < 0d) {
                        score = 0d;
                    }
                    score = ((score - 0) * scoreScale) / (maxPScore - 0);
                } else if (isGBTScoreMaxMinScaleStreategy) {
                    // use max-min scaling, and don't forget to multiply by scoreScale
                    score = ((score - minPScore) * scoreScale) / (maxPScore - minPScore);
                } else {
                // do nothing, use current score
                }
                cmo.setScore(Double.parseDouble(SCORE_FORMAT.format(score)));
                ConfusionMatrixObject object = cmo;
                po = PerformanceEvaluator.setPerformanceObject(object);
                // NOTE(review): the same po instance can be added to several lists below and
                // its binNum is overwritten each time - confirm this is intended.
                if (po.fpr >= fpBin * binCapacity) {
                    po.binNum = fpBin++;
                    FPRList.add(po);
                }
                if (po.recall >= tpBin * binCapacity) {
                    po.binNum = tpBin++;
                    catchRateList.add(po);
                }
                // prevent 99%
                double validRecordCnt = (double) (index + 1);
                if (validRecordCnt / (pigPosTags + pigNegTags) >= gainBin * binCapacity) {
                    po.binNum = gainBin++;
                    gainList.add(po);
                }
                if (po.weightedFpr >= fpWeightBin * binCapacity) {
                    po.binNum = fpWeightBin++;
                    FPRWeightList.add(po);
                }
                if (po.weightedRecall >= tpWeightBin * binCapacity) {
                    po.binNum = tpWeightBin++;
                    catchRateWeightList.add(po);
                }
                if ((object.getWeightedTp() + object.getWeightedFp()) / object.getWeightedTotal() >= gainWeightBin * binCapacity) {
                    po.binNum = gainWeightBin++;
                    gainWeightList.add(po);
                }
                if ((maxScore - (modelScoreBin * binScore)) >= score) {
                    po.binNum = modelScoreBin++;
                    po.scoreCount = scoreBinCount;
                    po.scoreWgtCount = scoreBinWeigthedCount;
                    // reset to 0 for next bin score cnt stats
                    scoreBinCount = scoreBinWeigthedCount = 0;
                    modelScoreList.add(po);
                }
                index += 1;
                prevCmo = cmo;
            }
            scanner.close();
        }
    } finally {
        // Release every scanner even if a row fails to parse; closing an already closed
        // Scanner has no effect.
        for (Scanner scanner : scanners) {
            scanner.close();
        }
    }
    LOG.info("Totally loading {} records with invalid target records {} and invalid weight records {} in eval {}.", cnt, invalidTargetCnt, invalidWgtCnt, evalConfig.getName());
    PerformanceResult result = buildPerfResult(FPRList, catchRateList, gainList, modelScoreList, FPRWeightList, catchRateWeightList, gainWeightList);
    synchronized (this.lock) {
        if (isPrint) {
            PerformanceEvaluator.logResult(FPRList, "Bucketing False Positive Rate");
            if (hasWeight) {
                PerformanceEvaluator.logResult(FPRWeightList, "Bucketing Weighted False Positive Rate");
            }
            PerformanceEvaluator.logResult(catchRateList, "Bucketing Catch Rate");
            if (hasWeight) {
                PerformanceEvaluator.logResult(catchRateWeightList, "Bucketing Weighted Catch Rate");
            }
            PerformanceEvaluator.logResult(gainList, "Bucketing Action Rate");
            if (hasWeight) {
                PerformanceEvaluator.logResult(gainWeightList, "Bucketing Weighted Action Rate");
            }
            PerformanceEvaluator.logAucResult(result, hasWeight);
        }
        writePerResult2File(evalPerformancePath, result);
        if (isGenerateChart) {
            generateChartAndJsonPerfFiles(hasWeight, result);
        }
    }
    // NOTE(review): this check runs after the result file has already been written - confirm
    // whether an empty input should still produce output files.
    if (cnt == 0) {
        LOG.error("No score read, the EvalScore did not genernate or is null file");
        throw new ShifuException(ShifuErrorCode.ERROR_EVALSCORE);
    }
    return result;
}
Also used : Scanner(java.util.Scanner) Splitter(com.google.common.base.Splitter) PerformanceObject(ml.shifu.shifu.container.PerformanceObject) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) ArrayList(java.util.ArrayList) ConfusionMatrixObject(ml.shifu.shifu.container.ConfusionMatrixObject) PerformanceResult(ml.shifu.shifu.container.obj.PerformanceResult) ShifuException(ml.shifu.shifu.exception.ShifuException)

Aggregations

ConfusionMatrixObject (ml.shifu.shifu.container.ConfusionMatrixObject): 6, ArrayList (java.util.ArrayList): 4, ModelResultObject (ml.shifu.shifu.container.ModelResultObject): 2, PerformanceObject (ml.shifu.shifu.container.PerformanceObject): 2, PerformanceResult (ml.shifu.shifu.container.obj.PerformanceResult): 2, ShifuException (ml.shifu.shifu.exception.ShifuException): 2, Splitter (com.google.common.base.Splitter): 1, BufferedReader (java.io.BufferedReader): 1, Scanner (java.util.Scanner): 1, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 1, PathFinder (ml.shifu.shifu.fs.PathFinder): 1