Search in sources :

Example 36 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class MapReducerStatsWorker method doStats.

/**
 * Runs the stats Pig job and writes the resulting statistics back into the column configuration.
 * <p>
 * Steps, in order: delete the previous pre-training stats output, derive the reducer parallelism from the
 * number of columns (raised to the data-volume based value when that is larger, then capped at 999), run the
 * stats Pig script, update and persist the column configuration, and finally run PSI when it is enabled and a
 * PSI column name is configured.
 *
 * @return always {@code true} when the method completes without throwing
 * @throws Exception
 *             declared by the signature; an {@link IOException} from the Pig run is rethrown as a
 *             {@code ShifuException}, any other throwable from it as a {@link RuntimeException}
 */
@Override
public boolean doStats() throws Exception {
    log.info("delete historical pre-train data");
    ShifuFileUtils.deleteFile(pathFinder.getPreTrainingStatsPath(), modelConfig.getDataSet().getSource());

    Map<String, String> pigParams = new HashMap<String, String>();
    pigParams.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));

    // Reducer count by number of columns: the more columns, the fewer reducers per column.
    // <=100: x2, <=500: x1, <=1000: /2, <=2000: /4, <=3000: /6, <=4000: /8, otherwise: /10
    final int columnCount = columnConfigList.size();
    int reducers;
    if (columnCount <= 100) {
        reducers = columnCount * 2;
    } else if (columnCount <= 500) {
        reducers = columnCount;
    } else if (columnCount <= 1000) {
        reducers = columnCount / 2;
    } else if (columnCount <= 2000) {
        reducers = columnCount / 4;
    } else if (columnCount <= 3000) {
        reducers = columnCount / 6;
    } else if (columnCount <= 4000) {
        reducers = columnCount / 8;
    } else {
        reducers = columnCount / 10;
    }

    // the data-volume based parallelism wins when it is larger than the column based one
    final int reducersByVolume = getParallelNumByDataVolume();
    if (reducersByVolume > reducers) {
        reducers = reducersByVolume;
        log.info("Adjust parallel number to {} according data volume", reducers);
    }

    // never ask for more than 999 reducers
    reducers = Math.min(reducers, 999);

    pigParams.put("column_parallel", Integer.toString(reducers));
    pigParams.put("histo_scale_factor", Environment.getProperty("shifu.stats.histo.scale.factor", "100"));

    try {
        runStatsPig(pigParams);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }

    log.info("Updating ColumnConfig with stats...");
    // pull the job output into the column config, then warn on suspicious numerical/categorical columns
    updateColumnConfigWithPreTrainingStats();
    checkNumericalAndCategoricalColumns();

    // persist locally and sync to the configured source
    processor.saveColumnConfigList();
    processor.syncDataToHdfs(modelConfig.getDataSet().getSource());

    // PSI runs together with stats unless switched off, and only when a PSI column is configured
    if (Environment.getBoolean("shifu.stats.psi.together", true)
            && StringUtils.isNotEmpty(modelConfig.getPsiColumnName())) {
        runPSI();
        processor.saveColumnConfigList();
        processor.syncDataToHdfs(modelConfig.getDataSet().getSource());
    }
    return true;
}
Also used : HashMap(java.util.HashMap) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 37 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class ModelSpecLoaderUtils method loadModel.

/**
 * Loads the model stored at {@code modelPath}, choosing the loader from the file name suffix.
 * <ul>
 * <li>LR suffix: the first line of the file is parsed as an LR model; if that fails for any reason the file is
 * reopened and read through Encog persistence instead.</li>
 * <li>RF or GBT suffix: the stream is handed to {@code TreeModel.loadFromStream}.</li>
 * <li>anything else: a gzip stream is read as an {@code NNModel}, a non-gzip stream through Encog
 * persistence.</li>
 * </ul>
 * Any exception raised while opening or parsing the file is wrapped in a {@code ShifuException} with code
 * {@code ERROR_FAIL_TO_LOAD_MODEL_FILE}. Both the stream and the reader are closed quietly before returning.
 *
 * @param modelConfig
 *            model config (not read by this method)
 * @param modelPath
 *            the path the model is stored at
 * @param fs
 *            file system used to store the model
 * @param gbtConvertToProb
 *            convert gbt score to prob or not
 * @param gbtScoreConvertStrategy
 *            specify how to convert gbt raw score
 * @return the loaded model, or {@code null} when nothing exists at {@code modelPath}
 * @throws IOException
 *             if checking the existence of {@code modelPath} fails
 */
public static BasicML loadModel(ModelConfig modelConfig, Path modelPath, FileSystem fs, boolean gbtConvertToProb, String gbtScoreConvertStrategy) throws IOException {
    if (!fs.exists(modelPath)) {
        // nothing to load
        return null;
    }

    // PersistBasicFloatNetwork has to be registered before Encog can read such models
    PersistorRegistry.getInstance().add(new PersistBasicFloatNetwork());

    FSDataInputStream in = null;
    BufferedReader lineReader = null;
    try {
        in = fs.open(modelPath);
        final String fileName = modelPath.getName();

        if (fileName.endsWith(LogisticRegressionContants.LR_ALG_NAME.toLowerCase())) {
            lineReader = new BufferedReader(new InputStreamReader(in));
            try {
                return LR.loadFromString(lineReader.readLine());
            } catch (Exception e) {
                // not parseable as an LR string (local LR model?): start over on a fresh stream
                IOUtils.closeQuietly(lineReader);
                in = fs.open(modelPath);
                return BasicML.class.cast(EncogDirectoryPersistence.loadObject(in));
            }
        }

        if (fileName.endsWith(CommonConstants.RF_ALG_NAME.toLowerCase())
                || fileName.endsWith(CommonConstants.GBT_ALG_NAME.toLowerCase())) {
            return TreeModel.loadFromStream(in, gbtConvertToProb, gbtScoreConvertStrategy);
        }

        GzipStreamPair streamPair = GzipStreamPair.isGZipFormat(in);
        Object loaded = streamPair.isGzip() ? NNModel.loadFromStream(streamPair.getInput())
                : EncogDirectoryPersistence.loadObject(streamPair.getInput());
        return BasicML.class.cast(loaded);
    } catch (Exception e) {
        throw new ShifuException(ShifuErrorCode.ERROR_FAIL_TO_LOAD_MODEL_FILE, e, "the expecting model file is: " + modelPath);
    } finally {
        IOUtils.closeQuietly(lineReader);
        IOUtils.closeQuietly(in);
    }
}
Also used : FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) BasicML(org.encog.ml.BasicML) ShifuException(ml.shifu.shifu.exception.ShifuException) PersistBasicFloatNetwork(ml.shifu.shifu.core.dtrain.dataset.PersistBasicFloatNetwork) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 38 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class HDFSUtils method renewFS.

/**
 * Creates a new {@link FileSystem} from the shared configuration and publishes it as the class-level
 * {@code hdfs} instance.
 * <p>
 * Sometimes the FileSystem gets closed in the NodeManager for no known reason so far, so this renew method
 * exists to obtain a new FileSystem instance. It should be package level, but ShifuFileUtils (which uses it
 * from {@code getReader}) is not in the same package.
 *
 * @return the file system instance obtained by this call
 * @throws ShifuException
 *             with code {@code ERROR_GET_HDFS_SYSTEM} and the original {@link IOException} as cause when the
 *             file system cannot be created
 */
public static FileSystem renewFS() {
    synchronized (HDFSUtils.class) {
        try {
            // Assign to the shared hdfs field only after the temporary instance is fully initialized, so
            // that no caller can observe a half-configured file system.
            FileSystem tmpHdfs = FileSystem.get(conf);
            tmpHdfs.setVerifyChecksum(false);
            hdfs = tmpHdfs;
            // return the local reference rather than re-reading the shared field outside the lock
            return tmpHdfs;
        } catch (IOException e) {
            LOG.error("Error on creating hdfs FileSystem object.", e);
            // keep the cause so callers see why the file system could not be created
            throw new ShifuException(ShifuErrorCode.ERROR_GET_HDFS_SYSTEM, e);
        }
    }
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 39 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class ConfusionMatrix method bufferedComputeConfusionMatrixAndPerformance.

/**
 * Streams the score files once, updating a running confusion matrix per record, and buckets the resulting
 * performance points by false positive rate, catch rate (recall), action rate and model score, each in an
 * unweighted and (except model score) a weighted variant.
 * <p>
 * NOTE(review): the running matrix only makes sense if records arrive ordered by score, highest first;
 * nothing in this method sorts them, so that is presumably guaranteed by the job that wrote the score files -
 * confirm against the caller.
 *
 * @param pigPosTags
 *            positive record count used to seed the initial confusion matrix
 * @param pigNegTags
 *            negative record count used to seed the initial confusion matrix
 * @param pigPosWeightTags
 *            weighted positive total used to seed the initial confusion matrix
 * @param pigNegWeightTags
 *            weighted negative total used to seed the initial confusion matrix
 * @param records
 *            not read by this method
 * @param maxPScore
 *            maximum score; used as upper bucket bound when {@code isUseMaxMinScore} applies and for rescaling
 *            GBT scores
 * @param minPScore
 *            minimum score; used as lower bucket bound when {@code isUseMaxMinScore} applies and for rescaling
 *            GBT scores
 * @param scoreDataPath
 *            location the score scanners are opened on
 * @param evalPerformancePath
 *            location the performance result is written to
 * @param isPrint
 *            whether to log the bucketed results
 * @param isGenerateChart
 *            whether to generate chart and json performance files
 * @param targetColumnIndex
 *            index of the tag column in a split score line
 * @param scoreColumnIndex
 *            index of the score column in a split score line
 * @param weightColumnIndex
 *            index of the weight column in a split score line; a weight is only read when this is greater
 *            than 0
 * @param isUseMaxMinScore
 *            use {@code maxPScore}/{@code minPScore} as bucket bounds (ignored when GBT scores are converted)
 * @return the assembled performance result
 * @throws IOException
 *             if reading the score data or writing the result fails
 * @throws ShifuException
 *             with code {@code ERROR_EVALSCORE} when no line at all was read
 */
public PerformanceResult bufferedComputeConfusionMatrixAndPerformance(long pigPosTags, long pigNegTags, double pigPosWeightTags, double pigNegWeightTags, long records, double maxPScore, double minPScore, String scoreDataPath, String evalPerformancePath, boolean isPrint, boolean isGenerateChart, int targetColumnIndex, int scoreColumnIndex, int weightColumnIndex, boolean isUseMaxMinScore) throws IOException {
    // 1. compute maxScore and minScore in case some cases score are not in [0, 1]
    // When GBT scores are converted, or max/min are not requested, the default [0, scoreScale] range is kept.
    double maxScore = 1d * scoreScale, minScore = 0d;
    if (!isGBTNeedConvertScore() && isUseMaxMinScore) {
        // TODO some cases maxPScore is already scaled, how to fix that issue
        maxScore = maxPScore;
        minScore = minPScore;
    }
    LOG.info("{} Transformed (scale included) max score is {}, transformed min score is {}", evalConfig.getGbtScoreConvertStrategy(), maxScore, minScore);
    SourceType sourceType = evalConfig.getDataSet().getSource();
    List<Scanner> scanners = ShifuFileUtils.getDataScanners(scoreDataPath, sourceType);
    LOG.info("Number of score files is {} in eval {}.", scanners.size(), evalConfig.getName());
    int numBucket = evalConfig.getPerformanceBucketNum();
    boolean hasWeight = StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName());
    boolean isDir = ShifuFileUtils.isDir(pathFinder.getEvalScorePath(evalConfig, sourceType), sourceType);
    List<PerformanceObject> FPRList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> catchRateList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> gainList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> modelScoreList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> FPRWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> catchRateWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> gainWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    // binScore: width of one model-score bucket; binCapacity: width of one rate bucket
    double binScore = (maxScore - minScore) * 1d / numBucket, binCapacity = 1.0 / numBucket, scoreBinCount = 0, scoreBinWeigthedCount = 0;
    // next bucket number to emit for each of the seven curves
    int fpBin = 1, tpBin = 1, gainBin = 1, fpWeightBin = 1, tpWeightBin = 1, gainWeightBin = 1, modelScoreBin = 1;
    long index = 0, cnt = 0, invalidTargetCnt = 0, invalidWgtCnt = 0;
    ConfusionMatrixObject prevCmo = buildInitalCmo(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore);
    // every curve starts from the same initial point
    PerformanceObject po = buildFirstPO(prevCmo);
    FPRList.add(po);
    catchRateList.add(po);
    gainList.add(po);
    FPRWeightList.add(po);
    catchRateWeightList.add(po);
    gainWeightList.add(po);
    modelScoreList.add(po);
    boolean isGBTScoreHalfCutoffStreategy = isGBTScoreHalfCutoffStreategy();
    boolean isGBTScoreMaxMinScaleStreategy = isGBTScoreMaxMinScaleStreategy();
    Splitter splitter = Splitter.on(delimiter).trimResults();
    try {
        for (Scanner scanner : scanners) {
            while (scanner.hasNext()) {
                if ((++cnt) % 100000L == 0L) {
                    LOG.info("Loaded {} records.", cnt);
                }
                if ((!isDir) && cnt == 1) {
                    // The score data is a single file rather than a directory: its first line is one we
                    // added ourselves (presumably a header), so consume it here. Without reading the line,
                    // the 'continue' would leave it in the scanner to be parsed as data on the next pass.
                    scanner.nextLine();
                    continue;
                }
                // score is separated by default delimiter in our pig output format
                String[] raw = Lists.newArrayList(splitter.split(scanner.nextLine())).toArray(new String[0]);
                // tag check
                String tag = raw[targetColumnIndex];
                if (StringUtils.isBlank(tag) || (!posTags.contains(tag) && !negTags.contains(tag))) {
                    invalidTargetCnt += 1;
                    continue;
                }
                double weight = 1d;
                // if has weight
                if (weightColumnIndex > 0) {
                    try {
                        weight = Double.parseDouble(raw[weightColumnIndex]);
                    } catch (NumberFormatException e) {
                        // unparsable weight: count it and keep the default weight of 1
                        invalidWgtCnt += 1;
                    }
                    if (weight < 0d) {
                        // negative weight: count it and fall back to 1
                        invalidWgtCnt += 1;
                        weight = 1d;
                    }
                }
                double score = 0.0;
                try {
                    score = Double.parseDouble(raw[scoreColumnIndex]);
                } catch (NumberFormatException e) {
                    // user set the score column wrong ?
                    // only log roughly 5% of such lines to avoid flooding the log
                    if (Math.random() < 0.05) {
                        LOG.warn("The score column - {} is not number. Is score column set correctly?", raw[scoreColumnIndex]);
                    }
                    continue;
                }
                scoreBinCount += 1;
                scoreBinWeigthedCount += weight;
                // move this record from the "not yet seen" side to the "seen" side of the matrix
                ConfusionMatrixObject cmo = new ConfusionMatrixObject(prevCmo);
                if (posTags.contains(tag)) {
                    // Positive Instance
                    cmo.setTp(cmo.getTp() + 1);
                    cmo.setFn(cmo.getFn() - 1);
                    cmo.setWeightedTp(cmo.getWeightedTp() + weight * 1.0);
                    cmo.setWeightedFn(cmo.getWeightedFn() - weight * 1.0);
                } else {
                    // Negative Instance
                    cmo.setFp(cmo.getFp() + 1);
                    cmo.setTn(cmo.getTn() - 1);
                    cmo.setWeightedFp(cmo.getWeightedFp() + weight * 1.0);
                    cmo.setWeightedTn(cmo.getWeightedTn() - weight * 1.0);
                }
                if (isGBTScoreHalfCutoffStreategy) {
                    // clamp negative scores to 0, then rescale by maxPScore and scoreScale
                    if (score < 0d) {
                        score = 0d;
                    }
                    score = ((score - 0) * scoreScale) / (maxPScore - 0);
                } else if (isGBTScoreMaxMinScaleStreategy) {
                    // max-min scaling of the score, multiplied by scoreScale
                    score = ((score - minPScore) * scoreScale) / (maxPScore - minPScore);
                }
                // otherwise the score is used as is
                cmo.setScore(Double.parseDouble(SCORE_FORMAT.format(score)));
                ConfusionMatrixObject object = cmo;
                po = PerformanceEvaluator.setPerformanceObject(object);
                // NOTE(review): the same 'po' instance can be added to several lists below and each branch
                // overwrites binNum, so the last matching branch decides the value seen through every list.
                if (po.fpr >= fpBin * binCapacity) {
                    po.binNum = fpBin++;
                    FPRList.add(po);
                }
                if (po.recall >= tpBin * binCapacity) {
                    po.binNum = tpBin++;
                    catchRateList.add(po);
                }
                // prevent 99%
                double validRecordCnt = (double) (index + 1);
                if (validRecordCnt / (pigPosTags + pigNegTags) >= gainBin * binCapacity) {
                    po.binNum = gainBin++;
                    gainList.add(po);
                }
                if (po.weightedFpr >= fpWeightBin * binCapacity) {
                    po.binNum = fpWeightBin++;
                    FPRWeightList.add(po);
                }
                if (po.weightedRecall >= tpWeightBin * binCapacity) {
                    po.binNum = tpWeightBin++;
                    catchRateWeightList.add(po);
                }
                if ((object.getWeightedTp() + object.getWeightedFp()) / object.getWeightedTotal() >= gainWeightBin * binCapacity) {
                    po.binNum = gainWeightBin++;
                    gainWeightList.add(po);
                }
                if ((maxScore - (modelScoreBin * binScore)) >= score) {
                    po.binNum = modelScoreBin++;
                    po.scoreCount = scoreBinCount;
                    po.scoreWgtCount = scoreBinWeigthedCount;
                    // reset to 0 for next bin score cnt stats
                    scoreBinCount = scoreBinWeigthedCount = 0;
                    modelScoreList.add(po);
                }
                index += 1;
                prevCmo = cmo;
            }
            // release each file as soon as it has been read completely
            scanner.close();
        }
    } finally {
        // Make sure no scanner stays open when a line fails to parse; closing an already closed Scanner
        // has no effect.
        for (Scanner scanner : scanners) {
            scanner.close();
        }
    }
    LOG.info("Totally loading {} records with invalid target records {} and invalid weight records {} in eval {}.", cnt, invalidTargetCnt, invalidWgtCnt, evalConfig.getName());
    PerformanceResult result = buildPerfResult(FPRList, catchRateList, gainList, modelScoreList, FPRWeightList, catchRateWeightList, gainWeightList);
    synchronized (this.lock) {
        if (isPrint) {
            PerformanceEvaluator.logResult(FPRList, "Bucketing False Positive Rate");
            if (hasWeight) {
                PerformanceEvaluator.logResult(FPRWeightList, "Bucketing Weighted False Positive Rate");
            }
            PerformanceEvaluator.logResult(catchRateList, "Bucketing Catch Rate");
            if (hasWeight) {
                PerformanceEvaluator.logResult(catchRateWeightList, "Bucketing Weighted Catch Rate");
            }
            PerformanceEvaluator.logResult(gainList, "Bucketing Action Rate");
            if (hasWeight) {
                PerformanceEvaluator.logResult(gainWeightList, "Bucketing Weighted Action Rate");
            }
            PerformanceEvaluator.logAucResult(result, hasWeight);
        }
        writePerResult2File(evalPerformancePath, result);
        if (isGenerateChart) {
            generateChartAndJsonPerfFiles(hasWeight, result);
        }
    }
    // nothing was read at all: report it after the (empty) result has been written
    if (cnt == 0) {
        LOG.error("No score read, the EvalScore did not genernate or is null file");
        throw new ShifuException(ShifuErrorCode.ERROR_EVALSCORE);
    }
    return result;
}
Also used : Scanner(java.util.Scanner) Splitter(com.google.common.base.Splitter) PerformanceObject(ml.shifu.shifu.container.PerformanceObject) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) ArrayList(java.util.ArrayList) ConfusionMatrixObject(ml.shifu.shifu.container.ConfusionMatrixObject) PerformanceResult(ml.shifu.shifu.container.obj.PerformanceResult) ShifuException(ml.shifu.shifu.exception.ShifuException)

Aggregations

ShifuException (ml.shifu.shifu.exception.ShifuException)39 IOException (java.io.IOException)22 SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType)12 HashMap (java.util.HashMap)8 ArrayList (java.util.ArrayList)5 ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)5 File (java.io.File)4 Scanner (java.util.Scanner)4 Path (org.apache.hadoop.fs.Path)4 SourceFile (ml.shifu.shifu.fs.SourceFile)3 JobStats (org.apache.pig.tools.pigstats.JobStats)3 BufferedReader (java.io.BufferedReader)2 ConfusionMatrixObject (ml.shifu.shifu.container.ConfusionMatrixObject)2 EvalConfig (ml.shifu.shifu.container.obj.EvalConfig)2 RawSourceData (ml.shifu.shifu.container.obj.RawSourceData)2 AbstractStatsExecutor (ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor)2 AkkaStatsWorker (ml.shifu.shifu.core.processor.stats.AkkaStatsWorker)2 DIBStatsExecutor (ml.shifu.shifu.core.processor.stats.DIBStatsExecutor)2 MunroPatIStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor)2 MunroPatStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor)2