Use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML:
the class ModelInspector, method probe.
/**
 * Probe the status of model for each step.
 * It will check the setting in @ModelConfig to make sure all setting from user are correct.
 * After that it will do different checking for different steps
 *
 * @param modelConfig
 * - the model configuration that want to probe
 * @param modelStep
 * - the steps
 * @return the result of probe
 * if everything is OK, the status of ValidateResult is TRUE
 * else the status of ValidateResult is FALSE, and the reasons will in the clauses of ValidateResult
 * @throws Exception
 * any exception in validation
 */
public ValidateResult probe(ModelConfig modelConfig, ModelStep modelStep) throws Exception {
// Meta validation is a hard gate: if the basic config is broken, skip all step-specific checks.
ValidateResult result = checkMeta(modelConfig);
if (!result.getStatus()) {
return result;
}
// Cross-cutting check: multi-classification requires MAPRED run mode + HDFS source.
if (modelConfig.isClassification()) {
if (modelConfig.getBasic().getRunMode() == RunMode.LOCAL || modelConfig.getDataSet().getSource() == SourceType.LOCAL) {
// NOTE(review): tmpResult is created with status TRUE yet a cause is added — presumably
// mergeResult treats causes on a TRUE result as warnings rather than failures; confirm.
ValidateResult tmpResult = new ValidateResult(true);
tmpResult.addCause("Multiple classification is only effective in MAPRED runmode and HDFS source type.");
result = ValidateResult.mergeResult(result, tmpResult);
}
}
if (modelConfig.getDataSet().getSource() == SourceType.LOCAL && modelConfig.isMapReduceRunMode()) {
// NOTE(review): merging an empty TRUE result looks like a no-op — possibly a missing
// addCause(...) for the LOCAL-source-with-MapReduce-runmode combination; verify intent.
ValidateResult tmpResult = new ValidateResult(true);
result = ValidateResult.mergeResult(result, tmpResult);
}
// Step-specific validation below; each branch merges its findings into the running result.
if (ModelStep.INIT.equals(modelStep)) {
// in INIT, only check if data or header are there or not
result = ValidateResult.mergeResult(result, checkRawData(modelConfig.getDataSet(), "Train Set:"));
} else if (ModelStep.STATS.equals(modelStep)) {
// STATS needs ColumnConfig.json (produced by INIT) plus the stats settings themselves.
result = ValidateResult.mergeResult(result, checkFile("ColumnConfig.json", SourceType.LOCAL, "ColumnConfig.json : "));
result = ValidateResult.mergeResult(result, checkStatsConf(modelConfig));
// verify categorical name file
if (StringUtils.isNotBlank(modelConfig.getDataSet().getCategoricalColumnNameFile())) {
result = ValidateResult.mergeResult(result, checkFile(modelConfig.getDataSet().getCategoricalColumnNameFile(), SourceType.LOCAL, "categorical columns configuration "));
}
// verify meta name file
if (StringUtils.isNotBlank(modelConfig.getDataSet().getMetaColumnNameFile())) {
result = ValidateResult.mergeResult(result, checkFile(modelConfig.getDataSet().getMetaColumnNameFile(), SourceType.LOCAL, "meta columns configuration "));
}
// check column stats — only meaningful if everything above passed
if (result.getStatus()) {
result = ValidateResult.mergeResult(result, checkColumnConf(modelConfig));
}
} else if (ModelStep.VARSELECT.equals(modelStep)) {
result = ValidateResult.mergeResult(result, checkVarSelect(modelConfig, modelConfig.getVarSelect()));
if (result.getStatus()) {
// user may add configure file between steps
// add validation to avoid user to make mistake
result = ValidateResult.mergeResult(result, checkColumnConf(modelConfig));
}
} else if (ModelStep.NORMALIZE.equals(modelStep)) {
result = ValidateResult.mergeResult(result, checkNormSetting(modelConfig, modelConfig.getNormalize()));
} else if (ModelStep.TRAIN.equals(modelStep)) {
result = ValidateResult.mergeResult(result, checkTrainSetting(modelConfig, modelConfig.getTrain()));
// NATIVE multi-classification is restricted to NN and RF algorithms.
if (modelConfig.isClassification() && modelConfig.getTrain().getMultiClassifyMethod() == MultipleClassification.NATIVE) {
if (!"nn".equalsIgnoreCase((modelConfig.getTrain().getAlgorithm())) && !CommonConstants.RF_ALG_NAME.equalsIgnoreCase(modelConfig.getTrain().getAlgorithm())) {
ValidateResult tmpResult = new ValidateResult(true);
tmpResult.addCause("Native multiple classification is only effective in neural network (nn) or random forest (rf) training method.");
result = ValidateResult.mergeResult(result, tmpResult);
}
}
// One-vs-all multi-classification is restricted to tree models (GBT/RF) and NN.
if (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()) {
if (!CommonUtils.isTreeModel(modelConfig.getAlgorithm()) && !modelConfig.getAlgorithm().equalsIgnoreCase("nn")) {
ValidateResult tmpResult = new ValidateResult(true);
tmpResult.addCause("OneVSAll multiple classification is only effective in gradient boosted trees (GBT) or random forest (RF) or Neural Network (NN) training method.");
result = ValidateResult.mergeResult(result, tmpResult);
}
}
} else if (ModelStep.POSTTRAIN.equals(modelStep)) {
// TODO
} else if (ModelStep.EVAL.equals(modelStep)) {
// EVAL validates each configured eval set's raw data and optional score-meta file.
if (CollectionUtils.isNotEmpty(modelConfig.getEvals())) {
for (EvalConfig evalConfig : modelConfig.getEvals()) {
result = ValidateResult.mergeResult(result, checkRawData(evalConfig.getDataSet(), "Eval Set - " + evalConfig.getName() + ": "));
if (StringUtils.isNotBlank(evalConfig.getScoreMetaColumnNameFile())) {
result = ValidateResult.mergeResult(result, checkFile(evalConfig.getScoreMetaColumnNameFile(), SourceType.LOCAL, "Eval Set - " + evalConfig.getName() + ": "));
}
}
}
}
return result;
}
Use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML:
the class EvalModelProcessor, method deleteEvalSet.
/**
 * Removes the eval set with the given name from the model configuration and persists
 * the updated ModelConfig. If no eval set matches the name, an error is logged and
 * nothing is changed.
 *
 * @param evalSetName
 *            name of the eval set to delete
 * @throws ShifuException
 *             wrapping the IOException if saving the updated ModelConfig fails
 */
private void deleteEvalSet(String evalSetName) {
    EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalSetName);
    if (evalConfig == null) {
        LOG.error("{} eval set doesn't exist.", evalSetName);
    } else {
        modelConfig.getEvals().remove(evalConfig);
        try {
            saveModelConfig();
        } catch (IOException e) {
            // preserve the original I/O failure as the cause
            throw new ShifuException(ShifuErrorCode.ERROR_WRITE_MODELCONFIG, e);
        }
        // parameterized logging (consistent with the error branch above) instead of concatenation
        LOG.info("Done. Delete eval set - {}", evalSetName);
    }
}
Use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML:
the class EvalModelProcessor, method runScore.
/**
 * Run score only for the given eval sets.
 * <p>
 * Each eval set is validated first, data is synced to HDFS once, and then scoring runs
 * either in parallel batches (MapReduce run mode with more than one eval set and
 * {@code SHIFU_EVAL_PARALLEL} enabled) or sequentially.
 *
 * @param evalSetList
 *            eval config list
 * @throws IOException
 *             any io exception
 * @throws IllegalArgumentException
 *             if SHIFU_EVAL_PARALLEL_NUM in shifuconfig is outside (0, 100]
 */
private void runScore(List<EvalConfig> evalSetList) throws IOException {
    // do the validation before scoring the data set
    for (EvalConfig evalConfig : evalSetList) {
        validateEvalColumnConfig(evalConfig);
    }
    // sync data to HDFS only once for all eval sets
    syncDataToHdfs(evalSetList);
    if (Environment.getBoolean(Constants.SHIFU_EVAL_PARALLEL, true) && modelConfig.isMapReduceRunMode()
            && evalSetList.size() > 1) {
        // run in parallel, at most parallelNum eval sets per round
        int parallelNum = Environment.getInt(Constants.SHIFU_EVAL_PARALLEL_NUM, 5);
        if (parallelNum <= 0 || parallelNum > 100) {
            throw new IllegalArgumentException(
                    Constants.SHIFU_EVAL_PARALLEL_NUM + " in shifuconfig should be in (0, 100], by default it is 5.");
        }
        int evalSize = evalSetList.size();
        int mod = evalSize % parallelNum;
        int batch = evalSize / parallelNum;
        // round up so a partial last batch still runs
        batch = (mod == 0 ? batch : (batch + 1));
        for (int i = 0; i < batch; i++) {
            // last batch may be smaller when evalSize is not a multiple of parallelNum
            int batchSize = (mod != 0 && i == (batch - 1)) ? mod : parallelNum;
            // launch current batch
            LOG.info("Starting to run eval score in {}/{} round", (i + 1), batch);
            final CountDownLatch cdl = new CountDownLatch(batchSize);
            for (int j = 0; j < batchSize; j++) {
                int currentIndex = i * parallelNum + j;
                final EvalConfig config = evalSetList.get(currentIndex);
                // thread is named after the eval set so each one is easy to find in logs
                Thread evalRunThread = new Thread(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            runScore(config);
                        } catch (Exception e) {
                            // single handler: previous duplicate IOException/Exception catches were identical
                            LOG.error("Exception in eval score:", e);
                        } finally {
                            // count down in finally so an unexpected Error cannot leave
                            // cdl.await() below blocked forever
                            cdl.countDown();
                        }
                    }
                }, config.getName());
                evalRunThread.start();
                // each one sleep 4s to avoid conflict in initialization
                try {
                    Thread.sleep(4000);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
            LOG.info("Starting to wait eval score in {}/{} round", (i + 1), batch);
            // await all threads done
            try {
                cdl.await();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            LOG.info("Finish eval score in {}/{} round", (i + 1), batch);
        }
        LOG.info("Finish all eval score parallel running with eval size {}.", evalSize);
    } else {
        // for old sequential runs
        for (final EvalConfig config : evalSetList) {
            runScore(config);
        }
    }
}
Use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML:
the class ModelDataEncodeProcessor, method updateEvalEncodeDataPath.
/**
 * Points the matching eval set of the encode reference model at the encoded data produced
 * for the given eval config, cloning the eval set into the reference model when it does
 * not exist there yet, and persists the reference model config.
 *
 * @param status
 *            encode run status; only status 0 (success) triggers an update
 * @param encodeRefModel
 *            name of the encode reference model; blank means nothing to update
 * @param evalConfig
 *            the eval config whose encoded data path should be applied
 * @throws IOException
 *             if loading or saving the reference model config fails
 */
private void updateEvalEncodeDataPath(int status, String encodeRefModel, EvalConfig evalConfig) throws IOException {
    // only act on a successful encode run with a configured reference model
    if (status != 0 || StringUtils.isBlank(encodeRefModel)) {
        return;
    }
    String outputDelimiter = Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER, Constants.DEFAULT_DELIMITER);
    String encodedDataPath = this.pathFinder.getEncodeDataPath(evalConfig);
    ModelConfig refModel = loadSubModelConfig(encodeRefModel);
    EvalConfig refEval = refModel.getEvalConfigByName(evalConfig.getName());
    if (refEval == null) {
        // eval set is new to the reference model: clone it over before wiring paths
        refEval = evalConfig.clone();
        refModel.getEvals().add(refEval);
    }
    refEval.getDataSet().setDataPath(encodedDataPath);
    refEval.getDataSet().setDataDelimiter(outputDelimiter);
    refEval.getDataSet().setHeaderPath(encodedDataPath + File.separator + ".pig_header");
    refEval.getDataSet().setHeaderDelimiter(outputDelimiter);
    // remove filter
    refEval.getDataSet().setFilterExpressions("");
    saveModelConfig(encodeRefModel, refModel);
}
Use of ml.shifu.shifu.container.obj.EvalConfig in project shifu by ShifuML:
the class CommonUtils, method copyEvalDataFromLocalToHDFS.
/**
 * Sync-up the evaluation data into HDFS.
 * <p>
 * Copies the local eval set folder to HDFS (only when it exists locally, is a directory,
 * and is not already present on HDFS), then copies the score-meta column file and the
 * dataset meta column file, when configured, into the HDFS eval set path.
 *
 * @param modelConfig
 *            - ModelConfig
 * @param evalName
 *            eval name in ModelConfig; if no eval set with this name exists, nothing is copied
 * @throws IOException
 *            - error occur when copying data
 */
public static void copyEvalDataFromLocalToHDFS(ModelConfig modelConfig, String evalName) throws IOException {
    EvalConfig evalConfig = modelConfig.getEvalConfigByName(evalName);
    if (evalConfig == null) {
        // unknown eval set name: nothing to sync
        return;
    }
    FileSystem hdfs = HDFSUtils.getFS();
    FileSystem localFs = HDFSUtils.getLocalFS();
    PathFinder pathFinder = new PathFinder(modelConfig);
    Path evalDir = new Path(pathFinder.getEvalSetPath(evalConfig, SourceType.LOCAL));
    Path dst = new Path(pathFinder.getEvalSetPath(evalConfig, SourceType.HDFS));
    // copy the local eval folder only when it exists, is a directory, and is not on HDFS yet;
    // isDirectory() replaces the deprecated FileStatus.isDir(), so no deprecation suppression needed
    if (localFs.exists(evalDir) && localFs.getFileStatus(evalDir).isDirectory() && !hdfs.exists(dst)) {
        hdfs.copyFromLocalFile(evalDir, dst);
    }
    // sync evaluation score meta column file to hdfs
    if (StringUtils.isNotBlank(evalConfig.getScoreMetaColumnNameFile())) {
        hdfs.copyFromLocalFile(new Path(evalConfig.getScoreMetaColumnNameFile()), new Path(pathFinder.getEvalSetPath(evalConfig)));
    }
    // sync evaluation meta.column.file to hdfs
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getMetaColumnNameFile())) {
        hdfs.copyFromLocalFile(new Path(evalConfig.getDataSet().getMetaColumnNameFile()), new Path(pathFinder.getEvalSetPath(evalConfig)));
    }
}
Aggregations