use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class EvalModelProcessor method runDistScore.
/**
 * Runs evaluation scoring through a Pig script and builds a {@link ScoreStatus} from the Hadoop counters
 * reported by the finished Pig jobs.
 * <p>
 * The eval normalized, score and performance output paths are deleted before the job is submitted.
 * <code>scripts/EvalScore.pig</code> is used for classification models, or when sorting is disabled and the
 * current eval step is SCORE; otherwise <code>scripts/Eval.pig</code> is used.
 *
 * @param evalConfig
 *            the evaluation set configuration to score
 * @return the score status built from the first Pig job reporting a non-zero record counter, or
 *         <code>null</code> when no job reports such a counter
 * @throws IOException
 *             any io exception raised outside of the Pig job submission; an {@link IOException} raised by the
 *             submission itself is rethrown as a {@link ShifuException}
 */
@SuppressWarnings("deprecation")
private ScoreStatus runDistScore(EvalConfig evalConfig) throws IOException {
    // clean up output directories
    SourceType sourceType = evalConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalScorePath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalPerformancePath(evalConfig), sourceType);

    // prepare special parameters and execute pig
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
    paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
    paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
    paramsMap.put("pathEvalScore", pathFinder.getEvalScorePath(evalConfig));
    paramsMap.put("pathEvalPerformance", pathFinder.getEvalPerformancePath(evalConfig));
    paramsMap.put("eval_set_name", evalConfig.getName());
    paramsMap.put("delimiter", CommonUtils.escapePigString(evalConfig.getDataSet().getDataDelimiter()));
    paramsMap.put("columnIndex", evalConfig.getPerformanceScoreSelector().trim());
    paramsMap.put("scale", Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));

    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);

    String pigScript = "scripts/Eval.pig";
    Map<String, String> confMap = new HashMap<String, String>();
    // max min score folder: a uniquely named location handed to the job through the configuration map
    String maxMinScoreFolder = ShifuFileUtils.getFileSystemBySourceType(sourceType).makeQualified(new Path("tmp" + File.separator + "maxmin_score_" + System.currentTimeMillis() + "_" + RANDOM.nextLong())).toString();
    confMap.put(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT, maxMinScoreFolder);
    if (modelConfig.isClassification() || (isNoSort() && EvalStep.SCORE.equals(this.evalStep))) {
        pigScript = "scripts/EvalScore.pig";
    }

    try {
        PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap, sourceType, confMap, super.pathFinder);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }

    Iterator<JobStats> iter = PigStats.get().getJobGraph().iterator();
    while (iter.hasNext()) {
        JobStats jobStats = iter.next();
        // A job may expose no counters at all; skip it instead of failing with a NullPointerException.
        if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
            continue;
        }
        long evalRecords = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_RECORDS);
        LOG.info("Total valid eval records is : {}", evalRecords);
        // If no basic record counter, check next one
        if (evalRecords == 0L) {
            continue;
        }
        this.evalRecords = evalRecords;

        long pigPosTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_POSTAGS);
        long pigNegTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_NEGTAGS);
        // weighted counters are stored as scaled longs; divide by the scale to get the weighted value back
        double pigPosWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WPOSTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        double pigNegWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WNEGTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        LOG.info("Total positive record count is : {}", pigPosTags);
        LOG.info("Total negative record count is : {}", pigNegTags);
        LOG.info("Total weighted positive record count is : {}", pigPosWeightTags);
        LOG.info("Total weighted negative record count is : {}", pigNegWeightTags);

        long totalRunTime = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.TOTAL_MODEL_RUNTIME);
        // evalRecords is guaranteed non-zero here, so the division is safe
        LOG.info("Avg SLA for eval model scoring is {} micro seconds", totalRunTime / evalRecords);

        double maxScore = Integer.MIN_VALUE;
        double minScore = Integer.MAX_VALUE;
        if (modelConfig.isRegression()) {
            double[] maxMinScores = locateMaxMinScoreFromFile(sourceType, maxMinScoreFolder);
            maxScore = maxMinScores[0];
            minScore = maxMinScores[1];
            LOG.info("Raw max score is {}, raw min score is {}", maxScore, minScore);
            ShifuFileUtils.deleteFile(maxMinScoreFolder, sourceType);
        }
        // only one pig job with such counters, return
        return new ScoreStatus(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore, minScore, evalRecords);
    }
    return null;
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class EvalModelProcessor method runPigNormalize.
/**
 * Normalize evaluation dataset based on pig distributed solution.
 * <p>
 * The eval normalized output path is deleted first, then <code>scripts/EvalNorm.pig</code> is submitted with
 * the parameters describing the evaluation set.
 *
 * @param evalConfig
 *            eval config instance
 * @throws IOException
 *             any io exception raised outside of the Pig job submission; an {@link IOException} raised by the
 *             submission itself is rethrown as a {@link ShifuException}
 */
private void runPigNormalize(EvalConfig evalConfig) throws IOException {
    SourceType sourceType = evalConfig.getDataSet().getSource();

    // clean up output directories
    ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);

    // prepare special parameters and execute pig
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
    paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
    paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
    paramsMap.put("eval_set_name", evalConfig.getName());
    // escape the delimiter the same way the other Pig submissions do before passing it as a Pig parameter
    paramsMap.put("delimiter", CommonUtils.escapePigString(evalConfig.getDataSet().getDataDelimiter()));
    paramsMap.put("scale", Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));
    paramsMap.put(Constants.STRICT_MODE, Boolean.toString(isStrict()));

    String pigScript = "scripts/EvalNorm.pig";
    try {
        PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap, sourceType);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class EvalModelProcessor method createNewEval.
/**
 * Registers a new evaluation set called <code>name</code> in the model config and persists the config.
 * <p>
 * The new eval set starts from a clone of the model's raw source data settings. Two local files are created
 * if they do not exist yet: the eval's score meta column file and its meta column names file.
 *
 * @param name
 *            - the evaluation set name
 * @throws IOException
 *             any io exception
 */
private void createNewEval(String name) throws IOException {
    // refuse to register a second eval set under a name that is already taken
    if (modelConfig.getEvalConfigByName(name) != null) {
        throw new ShifuException(ShifuErrorCode.ERROR_MODEL_EVALSET_ALREADY_EXIST, "EvalSet - " + name + " already exists in ModelConfig. Please use another evalset name");
    }

    EvalConfig newEval = new EvalConfig();
    newEval.setName(name);
    newEval.setDataSet(modelConfig.getDataSet().cloneRawSourceData());

    // <EvalSetName>Score.meta.column.names
    String scoreMetaFile = new Path(newEval.getName() + Constants.DEFAULT_CHAMPIONSCORE_META_COLUMN_FILE).toString();
    ShifuFileUtils.createFileIfNotExists(scoreMetaFile, SourceType.LOCAL);

    // <EvalSetName>.meta.column.names, kept under the column meta folder
    String columnNamesFile = Constants.COLUMN_META_FOLDER_NAME + File.separator + newEval.getName() + "." + Constants.DEFAULT_META_COLUMN_FILE;
    String columnNamesPath = new Path(columnNamesFile).toString();
    ShifuFileUtils.createFileIfNotExists(columnNamesPath, SourceType.LOCAL);
    newEval.getDataSet().setMetaColumnNameFile(columnNamesFile);

    modelConfig.getEvals().add(newEval);
    try {
        saveModelConfig();
    } catch (IOException ioe) {
        throw new ShifuException(ShifuErrorCode.ERROR_WRITE_MODELCONFIG, ioe);
    }

    LOG.info("Create Eval - " + name);
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class NormalizeModelProcessor method runPigNormalize.
/**
 * Running pig normalize process.
 * <p>
 * Deletes the normalized and selected raw data paths, submits the normalize Pig script for the training data,
 * inspects the job counters to warn about a high ratio of invalid tags, and, when a validation data path is
 * configured, submits the same script again for the validation data.
 *
 * @throws IOException
 *             any IO exception raised before the Pig job submission; an {@link IOException} raised inside the
 *             submission block is rethrown as a {@link ShifuException}
 */
@SuppressWarnings("deprecation")
private void runPigNormalize() throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getNormalizedDataPath(), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getSelectedRawDataPath(), sourceType);

    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put("sampleRate", modelConfig.getNormalizeSampleRate().toString());
    paramsMap.put("sampleNegOnly", ((Boolean) modelConfig.isNormalizeSampleNegOnly()).toString());
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("is_csv", String.valueOf(Boolean.TRUE.toString().equalsIgnoreCase(Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_CSV, Boolean.FALSE.toString()))));

    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);

    try {
        String normPigPath = null;
        if (modelConfig.getNormalize().getIsParquet()) {
            if (modelConfig.getBasic().getPostTrainOn()) {
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquetAndPostTrain.pig");
            } else {
                log.info("Post train is disabled by 'postTrainOn=false'.");
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquet.pig");
            }
        } else {
            // no matter post train enabled or not, only norm results will be stored since new post train
            // solution no need to prepare data
            normPigPath = pathFinder.getScriptPath("scripts/Normalize.pig");
        }

        paramsMap.put(Constants.IS_COMPRESS, "true");
        paramsMap.put(Constants.IS_NORM_FOR_CLEAN, "false");
        PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);

        Iterator<JobStats> iter = PigStats.get().getJobGraph().iterator();
        while (iter.hasNext()) {
            JobStats jobStats = iter.next();
            // A job without Shifu counters cannot be the one we are looking for; keep searching instead of
            // stopping at it.
            if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
                continue;
            }
            long totalValidCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("TOTAL_VALID_COUNT");
            // If no basic record counter, check next one
            if (totalValidCount == 0L) {
                continue;
            }
            long invalidTagCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("INVALID_TAG");
            log.info("Total valid records {} after filtering, invalid tag records {}.", totalValidCount, invalidTagCount);
            // flag the run when at least 80% of the valid records carry an invalid tag
            if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
                log.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
            }
            // only one pig job with such counters, break
            break;
        }

        if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
            ShifuFileUtils.deleteFile(pathFinder.getNormalizedValidationDataPath(), sourceType);
            // reuse the training parameters, overriding only what differs for the validation data set
            paramsMap.put(Constants.IS_COMPRESS, "false");
            paramsMap.put(Constants.IS_VALIDATION_DATASET, "true");
            paramsMap.put(Constants.PATH_RAW_DATA, modelConfig.getValidationDataSetRawPath());
            paramsMap.put(Constants.PATH_NORMALIZED_DATA, pathFinder.getNormalizedValidationDataPath());
            PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class NormalizeModelProcessor method run.
/**
 * Entry point of the normalize step.
 * <p>
 * For DIST and MAPRED run modes the data is normalized through Pig, optionally shuffled, and additionally
 * cleaned for tree models; for LOCAL run mode the Akka based normalization is used.
 *
 * @return 0 when the step finishes, -1 when any exception is caught while running it
 */
@Override
public int run() throws Exception {
    log.info("Step Start: normalize");
    final long beginMillis = System.currentTimeMillis();

    try {
        setUp(ModelStep.NORMALIZE);
        syncDataToHdfs(modelConfig.getDataSet().getSource());

        switch(modelConfig.getBasic().getRunMode()) {
            case LOCAL:
                runAkkaNormalize();
                break;
            case DIST:
            case MAPRED:
                runPigNormalize();

                // the shuffle auto check is best effort: a failure here must not fail the whole step
                try {
                    autoCheckShuffleAndShuffleSize();
                } catch (Exception checkFailure) {
                    log.warn("warn: exception in auto check shuffle size, can be ignored as no big impact", checkFailure);
                }

                if (this.isToShuffleData) {
                    // shuffling normalized data, to make data random
                    new MapReduceShuffle(this.modelConfig).run(this.pathFinder.getNormalizedDataPath());
                }

                if (CommonUtils.isTreeModel(modelConfig.getAlgorithm())) {
                    runDataClean(this.isToShuffleData);
                }
                break;
        }

        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.NORMALIZE);
    } catch (ShifuException shifuFailure) {
        String detail = "Error:" + shifuFailure.getError().toString() + "; msg:" + shifuFailure.getMessage();
        log.error(detail, shifuFailure);
        return -1;
    } catch (Exception failure) {
        String detail = "Error:" + failure.getMessage();
        log.error(detail, failure);
        return -1;
    }

    long elapsedMillis = System.currentTimeMillis() - beginMillis;
    log.info("Step Finished: normalize with {} ms", elapsedMillis);
    return 0;
}
Aggregations