Example 1 with EvalConfig

use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML.

the class ModelInspector method probe.

/**
 * Probe the status of the model for each step.
 * It checks the settings in the {@link ModelConfig} to make sure all user settings are correct.
 * After that, it performs different checks for different steps.
 *
 * @param modelConfig
 *            - the model configuration to probe
 * @param modelStep
 *            - the step to check
 * @return the result of the probe:
 *         if everything is OK, the status of ValidateResult is TRUE;
 *         otherwise the status of ValidateResult is FALSE, and the reasons will be in the causes of ValidateResult
 * @throws Exception
 *             any exception in validation
 */
public ValidateResult probe(ModelConfig modelConfig, ModelStep modelStep) throws Exception {
    ValidateResult result = checkMeta(modelConfig);
    if (!result.getStatus()) {
        return result;
    }
    if (modelConfig.isClassification()) {
        if (modelConfig.getBasic().getRunMode() == RunMode.LOCAL || modelConfig.getDataSet().getSource() == SourceType.LOCAL) {
            ValidateResult tmpResult = new ValidateResult(true);
            tmpResult.addCause("Multiple classification is only effective in MAPRED runmode and HDFS source type.");
            result = ValidateResult.mergeResult(result, tmpResult);
        }
    }
    if (modelConfig.getDataSet().getSource() == SourceType.LOCAL && modelConfig.isMapReduceRunMode()) {
        ValidateResult tmpResult = new ValidateResult(true);
        result = ValidateResult.mergeResult(result, tmpResult);
    }
    if (ModelStep.INIT.equals(modelStep)) {
        // in INIT, only check whether the data and header exist
        result = ValidateResult.mergeResult(result, checkRawData(modelConfig.getDataSet(), "Train Set:"));
    } else if (ModelStep.STATS.equals(modelStep)) {
        result = ValidateResult.mergeResult(result, checkFile("ColumnConfig.json", SourceType.LOCAL, "ColumnConfig.json : "));
        result = ValidateResult.mergeResult(result, checkStatsConf(modelConfig));
        // verify categorical name file
        if (StringUtils.isNotBlank(modelConfig.getDataSet().getCategoricalColumnNameFile())) {
            result = ValidateResult.mergeResult(result, checkFile(modelConfig.getDataSet().getCategoricalColumnNameFile(), SourceType.LOCAL, "categorical columns configuration "));
        }
        // verify meta name file
        if (StringUtils.isNotBlank(modelConfig.getDataSet().getMetaColumnNameFile())) {
            result = ValidateResult.mergeResult(result, checkFile(modelConfig.getDataSet().getMetaColumnNameFile(), SourceType.LOCAL, "meta columns configuration "));
        }
        // check column stats
        if (result.getStatus()) {
            result = ValidateResult.mergeResult(result, checkColumnConf(modelConfig));
        }
    } else if (ModelStep.VARSELECT.equals(modelStep)) {
        result = ValidateResult.mergeResult(result, checkVarSelect(modelConfig, modelConfig.getVarSelect()));
        if (result.getStatus()) {
            // the user may add configuration files between steps;
            // validate again to catch such mistakes
            result = ValidateResult.mergeResult(result, checkColumnConf(modelConfig));
        }
    } else if (ModelStep.NORMALIZE.equals(modelStep)) {
        result = ValidateResult.mergeResult(result, checkNormSetting(modelConfig, modelConfig.getNormalize()));
    } else if (ModelStep.TRAIN.equals(modelStep)) {
        result = ValidateResult.mergeResult(result, checkTrainSetting(modelConfig, modelConfig.getTrain()));
        if (modelConfig.isClassification() && modelConfig.getTrain().getMultiClassifyMethod() == MultipleClassification.NATIVE) {
            if (!"nn".equalsIgnoreCase(modelConfig.getTrain().getAlgorithm()) && !CommonConstants.RF_ALG_NAME.equalsIgnoreCase(modelConfig.getTrain().getAlgorithm())) {
                ValidateResult tmpResult = new ValidateResult(true);
                tmpResult.addCause("Native multiple classification is only effective in neural network (nn) or random forest (rf) training method.");
                result = ValidateResult.mergeResult(result, tmpResult);
            }
        }
        if (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()) {
            if (!CommonUtils.isTreeModel(modelConfig.getAlgorithm()) && !modelConfig.getAlgorithm().equalsIgnoreCase("nn")) {
                ValidateResult tmpResult = new ValidateResult(true);
                tmpResult.addCause("OneVSAll multiple classification is only effective in gradient boosted trees (GBT) or random forest (RF) or Neural Network (NN) training method.");
                result = ValidateResult.mergeResult(result, tmpResult);
            }
        }
    } else if (ModelStep.POSTTRAIN.equals(modelStep)) {
    // TODO
    } else if (ModelStep.EVAL.equals(modelStep)) {
        if (CollectionUtils.isNotEmpty(modelConfig.getEvals())) {
            for (EvalConfig evalConfig : modelConfig.getEvals()) {
                result = ValidateResult.mergeResult(result, checkRawData(evalConfig.getDataSet(), "Eval Set - " + evalConfig.getName() + ": "));
                if (StringUtils.isNotBlank(evalConfig.getScoreMetaColumnNameFile())) {
                    result = ValidateResult.mergeResult(result, checkFile(evalConfig.getScoreMetaColumnNameFile(), SourceType.LOCAL, "Eval Set - " + evalConfig.getName() + ": "));
                }
            }
        }
    }
    return result;
}
Also used : EvalConfig(ml.shifu.shifu.container.obj.EvalConfig) ValidateResult(ml.shifu.shifu.container.meta.ValidateResult)
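
A minimal caller-side sketch (not from the Shifu sources) of how the probe result might be consumed before running a step. It uses only calls shown above, plus getCauses(), which is assumed to be the read counterpart of the addCause() used in the example; the ModelInspector, ModelConfig and ModelStep instances and their imports are assumed to be available from the surrounding application code.

public static boolean readyForStep(ModelInspector inspector, ModelConfig modelConfig, ModelStep modelStep) throws Exception {
    // run the step-specific validation described above
    ValidateResult result = inspector.probe(modelConfig, modelStep);
    if (!result.getStatus()) {
        // the reasons for a failed validation are collected as causes
        // (getCauses() is assumed here as the counterpart of addCause())
        for (String cause : result.getCauses()) {
            System.err.println("Validation failed: " + cause);
        }
    }
    return result.getStatus();
}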

Example 2 with EvalConfig

use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML.

the class EvalModelProcessor method deleteEvalSet.

private void deleteEvalSet(String evalSetName) {
    EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalSetName);
    if (evalConfig == null) {
        LOG.error("{} eval set doesn't exist.", evalSetName);
    } else {
        modelConfig.getEvals().remove(evalConfig);
        try {
            saveModelConfig();
        } catch (IOException e) {
            throw new ShifuException(ShifuErrorCode.ERROR_WRITE_MODELCONFIG, e);
        }
        LOG.info("Done. Deleted eval set - {}", evalSetName);
    }
}
Also used : EvalConfig(ml.shifu.shifu.container.obj.EvalConfig) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 3 with EvalConfig

use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML.

the class EvalModelProcessor method runScore.

/**
 * Run scoring only for the given eval sets.
 *
 * @param evalSetList
 *            eval config list
 * @throws IOException
 *             any io exception
 */
private void runScore(List<EvalConfig> evalSetList) throws IOException {
    // do the validation before scoring the data set
    for (EvalConfig evalConfig : evalSetList) {
        validateEvalColumnConfig(evalConfig);
    }
    // sync data to HDFS only once for all eval sets
    syncDataToHdfs(evalSetList);
    if (Environment.getBoolean(Constants.SHIFU_EVAL_PARALLEL, true) && modelConfig.isMapReduceRunMode() && evalSetList.size() > 1) {
        // run in parallel
        int parallelNum = Environment.getInt(Constants.SHIFU_EVAL_PARALLEL_NUM, 5);
        if (parallelNum <= 0 || parallelNum > 100) {
            throw new IllegalArgumentException(Constants.SHIFU_EVAL_PARALLEL_NUM + " in shifuconfig should be in (0, 100], by default it is 5.");
        }
        int evalSize = evalSetList.size();
        int mod = evalSize % parallelNum;
        int batch = evalSize / parallelNum;
        batch = (mod == 0 ? batch : (batch + 1));
        for (int i = 0; i < batch; i++) {
            int batchSize = (mod != 0 && i == (batch - 1)) ? mod : parallelNum;
            // launch the current batch
            LOG.info("Starting to run eval score in round {}/{}", (i + 1), batch);
            final CountDownLatch cdl = new CountDownLatch(batchSize);
            for (int j = 0; j < batchSize; j++) {
                int currentIndex = i * parallelNum + j;
                final EvalConfig config = evalSetList.get(currentIndex);
                // run scoring for this eval set in its own thread
                Thread evalRunThread = new Thread(new Runnable() {

                    @Override
                    public void run() {
                        try {
                            runScore(config);
                        } catch (IOException e) {
                            LOG.error("Exception in eval score:", e);
                        } catch (Exception e) {
                            LOG.error("Exception in eval score:", e);
                        }
                        cdl.countDown();
                    }
                }, config.getName());
                // the thread is named after the eval set so each run is easy to find in the logs
                evalRunThread.start();
                // sleep 4s between launches to avoid conflicts during initialization
                try {
                    Thread.sleep(4000);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
            LOG.info("Starting to wait for eval score in round {}/{}", (i + 1), batch);
            // wait until all threads in this batch are done
            try {
                cdl.await();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            LOG.info("Finished eval score in round {}/{}", (i + 1), batch);
        }
        LOG.info("Finished all parallel eval score runs with eval size {}.", evalSize);
    } else {
        // fall back to the old sequential runs
        for (final EvalConfig config : evalSetList) {
            runScore(config);
        }
    }
}
Also used : EvalConfig(ml.shifu.shifu.container.obj.EvalConfig) IOException(java.io.IOException) CountDownLatch(java.util.concurrent.CountDownLatch) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)
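
The round scheduling above splits evalSize eval sets into ceil(evalSize / parallelNum) rounds and runs the remainder as a smaller final round. The following standalone sketch (not part of Shifu; the numbers are hypothetical) reproduces just that arithmetic:

public static void main(String[] args) {
    int evalSize = 12;   // hypothetical number of eval sets
    int parallelNum = 5; // default value of SHIFU_EVAL_PARALLEL_NUM
    int mod = evalSize % parallelNum;
    int batch = evalSize / parallelNum;
    batch = (mod == 0 ? batch : (batch + 1));
    for (int i = 0; i < batch; i++) {
        int batchSize = (mod != 0 && i == (batch - 1)) ? mod : parallelNum;
        System.out.printf("round %d/%d runs %d eval sets%n", i + 1, batch, batchSize);
    }
    // prints rounds of 5, 5 and 2 eval sets for these values
}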

Example 4 with EvalConfig

use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML.

the class ModelDataEncodeProcessor method updateEvalEncodeDataPath.

private void updateEvalEncodeDataPath(int status, String encodeRefModel, EvalConfig evalConfig) throws IOException {
    if (status == 0 && StringUtils.isNotBlank(encodeRefModel)) {
        String delimiter = Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER, Constants.DEFAULT_DELIMITER);
        String encodeDataPath = this.pathFinder.getEncodeDataPath(evalConfig);
        ModelConfig encodeModel = loadSubModelConfig(encodeRefModel);
        EvalConfig encodeEvalConfig = encodeModel.getEvalConfigByName(evalConfig.getName());
        if (encodeEvalConfig == null) {
            // new EvalSet, add it to encode model
            encodeEvalConfig = evalConfig.clone();
            encodeModel.getEvals().add(encodeEvalConfig);
        }
        encodeEvalConfig.getDataSet().setDataPath(encodeDataPath);
        encodeEvalConfig.getDataSet().setDataDelimiter(delimiter);
        encodeEvalConfig.getDataSet().setHeaderPath(encodeDataPath + File.separator + ".pig_header");
        encodeEvalConfig.getDataSet().setHeaderDelimiter(delimiter);
        // remove filter
        encodeEvalConfig.getDataSet().setFilterExpressions("");
        saveModelConfig(encodeRefModel, encodeModel);
    }
}
Also used : EvalConfig(ml.shifu.shifu.container.obj.EvalConfig) ModelConfig(ml.shifu.shifu.container.obj.ModelConfig)

Example 5 with EvalConfig

use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML.

the class CommonUtils method copyEvalDataFromLocalToHDFS.

/**
 * Sync the evaluation data into HDFS.
 *
 * @param modelConfig
 *            - the model configuration
 * @param evalName
 *            eval name in ModelConfig
 * @throws IOException
 *             - any error that occurs when copying data
 */
@SuppressWarnings("deprecation")
public static void copyEvalDataFromLocalToHDFS(ModelConfig modelConfig, String evalName) throws IOException {
    EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalName);
    if (evalConfig != null) {
        FileSystem hdfs = HDFSUtils.getFS();
        FileSystem localFs = HDFSUtils.getLocalFS();
        PathFinder pathFinder = new PathFinder(modelConfig);
        Path evalDir = new Path(pathFinder.getEvalSetPath(evalConfig, SourceType.LOCAL));
        Path dst = new Path(pathFinder.getEvalSetPath(evalConfig, SourceType.HDFS));
        // copy only when the local evaluation folder exists, is a directory,
        // and the target does not yet exist on HDFS
        if (localFs.exists(evalDir) && localFs.getFileStatus(evalDir).isDir() && !hdfs.exists(dst)) {
            hdfs.copyFromLocalFile(evalDir, dst);
        }
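        // sync the evaluation score meta column file to HDFS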
        if (StringUtils.isNotBlank(evalConfig.getScoreMetaColumnNameFile())) {
            hdfs.copyFromLocalFile(new Path(evalConfig.getScoreMetaColumnNameFile()), new Path(pathFinder.getEvalSetPath(evalConfig)));
        }
        // sync the evaluation meta column file to HDFS
        if (StringUtils.isNotBlank(evalConfig.getDataSet().getMetaColumnNameFile())) {
            hdfs.copyFromLocalFile(new Path(evalConfig.getDataSet().getMetaColumnNameFile()), new Path(pathFinder.getEvalSetPath(evalConfig)));
        }
    }
}
Also used : EvalConfig(ml.shifu.shifu.container.obj.EvalConfig) FileSystem(org.apache.hadoop.fs.FileSystem) PathFinder(ml.shifu.shifu.fs.PathFinder)
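
A minimal calling sketch (not from the Shifu sources): "EvalA" is a hypothetical eval set name, and the ModelConfig instance, as well as the imports for CommonUtils and ModelConfig, are assumed to come from the surrounding application code.

public static void syncEvalData(ModelConfig modelConfig) throws IOException {
    // copies the local eval folder and the configured score-meta / meta column files,
    // but only when the local folder exists and the HDFS target does not exist yet
    CommonUtils.copyEvalDataFromLocalToHDFS(modelConfig, "EvalA");
}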

Aggregations

EvalConfig (ml.shifu.shifu.container.obj.EvalConfig)13 IOException (java.io.IOException)5 ShifuException (ml.shifu.shifu.exception.ShifuException)5 ModelConfig (ml.shifu.shifu.container.obj.ModelConfig)3 Test (org.testng.annotations.Test)3 File (java.io.File)2 CountDownLatch (java.util.concurrent.CountDownLatch)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 ValidateResult (ml.shifu.shifu.container.meta.ValidateResult)1 RawSourceData (ml.shifu.shifu.container.obj.RawSourceData)1 PerformanceEvaluator (ml.shifu.shifu.core.PerformanceEvaluator)1 PathFinder (ml.shifu.shifu.fs.PathFinder)1 SourceFile (ml.shifu.shifu.fs.SourceFile)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 Path (org.apache.hadoop.fs.Path)1 AfterTest (org.testng.annotations.AfterTest)1