Use of ml.shifu.shifu.core.processor.stats.MapReducerStatsWorker in project shifu by ShifuML.
The method run of the class StatsModelProcessor.
/**
 * Runner for the statistics step.
 *
 * <p>After set-up and an initial sync to HDFS, exactly one mode is executed, chosen by the first
 * flag found in {@code params} (checked in this order): correlation
 * ({@code Constants.IS_COMPUTE_CORR}), PSI ({@code Constants.IS_COMPUTE_PSI}), re-binning
 * ({@code Constants.IS_REBIN}); when none is set the regular statistics computation runs.
 * When the mode finishes, the current column config is backed up locally, data is synced to
 * HDFS again and {@code clearUp} is called.
 *
 * @return 0 when the step finishes; -1 when the mean-value precondition of the correlation/PSI
 *         mode is not met (in which case the final backup, sync and clear-up are skipped) or
 *         when any exception is raised inside the step
 * @throws Exception declared by the overridden signature; exceptions raised inside the step are
 *         caught, logged and reported as -1
 */
@Override
public int run() throws Exception {
    log.info("Step Start: stats");
    long start = System.currentTimeMillis();
    try {
        // 0. set up and sync to HDFS
        setUp(ModelStep.STATS);
        // resync ModelConfig.json/ColumnConfig.json to HDFS
        syncDataToHdfs(modelConfig.getDataSet().getSource());

        if (getBooleanParam(this.params, Constants.IS_COMPUTE_CORR)) {
            if (!runCorrelationMode()) {
                return -1;
            }
        } else if (getBooleanParam(this.params, Constants.IS_COMPUTE_PSI)) {
            if (!runPsiMode()) {
                return -1;
            }
        } else if (getBooleanParam(this.params, Constants.IS_REBIN)) {
            runRebinMode();
        } else {
            runDefaultStatsMode();
        }

        // back up current column config each time as stats will always change CC.json
        this.backupCurrentColumnConfigToLocal(SDF.format(new Date()));
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.STATS);
    } catch (ShifuException e) {
        // Plain concatenation (no explicit toString()) so that a missing error code is logged as
        // "null" instead of raising a NullPointerException from inside this handler.
        log.error("Error:" + e.getError() + "; msg:" + e.getMessage(), e);
        return -1;
    } catch (Exception e) {
        log.error("Error:" + e.getMessage(), e);
        return -1;
    }
    log.info("Step Finished: stats with {} ms", (System.currentTimeMillis() - start));
    return 0;
}

/**
 * Shared precondition of the correlation and PSI modes: logs a warning when
 * {@code isMeanCalculated()} reports false.
 *
 * @return the result of {@code isMeanCalculated()}
 */
private boolean checkMeanCalculatedOrWarn() throws Exception {
    if (isMeanCalculated()) {
        return true;
    }
    log.warn("Some mean value of column is null, could you check if you run 'shifu stats'.");
    return false;
}

/**
 * Correlation mode: validates that stats were run, computes (or reuses) the correlation result
 * and saves the column config list.
 *
 * @return false when the mean-value precondition is not met and the caller must abort
 */
private boolean runCorrelationMode() throws Exception {
    // 1. validate if run stats before run stats -correlation
    if (!checkMeanCalculatedOrWarn()) {
        return false;
    }

    // 2. compute correlation
    log.info("Start computing correlation value ...");
    SourceType source = this.modelConfig.getDataSet().getSource();
    String corrPath = super.getPathFinder().getCorrelationPath(source);

    // check if can start from existing output
    // NOTE(review): existence is checked against SourceType.HDFS although the path is derived
    // from the data set source - confirm this is intended.
    boolean reuseCorrResult = Environment.getBoolean("shifu.stats.corr.reuse", Boolean.FALSE);
    if (reuseCorrResult && ShifuFileUtils.isFileExists(corrPath, SourceType.HDFS)) {
        dumpAndCalculateCorrelationResult(source, corrPath);
    } else {
        runCorrMapReduceJob();
    }

    // 3. save column config list
    saveColumnConfigList();
    return true;
}

/**
 * PSI mode: runs PSI through {@code MapReducerStatsWorker} when a PSI column name is configured,
 * otherwise only logs a warning.
 *
 * @return false when the mean-value precondition is not met and the caller must abort; true
 *         otherwise, including the case where no PSI column is configured
 */
private boolean runPsiMode() throws Exception {
    if (!checkMeanCalculatedOrWarn()) {
        return false;
    }

    if (StringUtils.isNotEmpty(modelConfig.getPsiColumnName())) {
        new MapReducerStatsWorker(this, modelConfig, columnConfigList).runPSI();
        // save column config list after running PSI successfully
        saveColumnConfigList();
    } else {
        log.warn("To Run PSI please set your PSI column in dataSet::psiColumnName.");
    }
    return true;
}

/**
 * Re-binning mode: creates the backup ColumnConfig.json if it does not exist yet (otherwise
 * restores binning info from it), re-bins the requested good-candidate columns and saves the
 * column config list.
 */
private void runRebinMode() throws Exception {
    String backupColumnConfigPath = this.pathFinder.getBackupColumnConfig();
    if (!ShifuFileUtils.isFileExists(new Path(backupColumnConfigPath), SourceType.LOCAL)) {
        // no backup yet: the current column config becomes the backup
        ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
        saveColumnConfigList(backupColumnConfigPath, this.columnConfigList);
    } else {
        // existing backup ColumnConfig.json, use binning info in it to do rebin
        List<ColumnConfig> backColumnConfigList = CommonUtils.loadColumnConfigList(backupColumnConfigPath,
                SourceType.LOCAL, false);
        for (ColumnConfig backupColumnConfig : backColumnConfigList) {
            for (ColumnConfig columnConfig : this.columnConfigList) {
                if (NSColumnUtils.isColumnEqual(backupColumnConfig.getColumnName(),
                        columnConfig.getColumnName())) {
                    columnConfig.setColumnBinning(backupColumnConfig.getColumnBinning());
                }
            }
        }
    }

    // user provide candidate variable list or not
    boolean hasCandidates = CommonUtils.hasCandidateColumns(this.columnConfigList);
    List<ColumnConfig> rebinColumns = new ArrayList<ColumnConfig>();
    List<String> catVariables = getStringList(this.params, Constants.REQUEST_VARS, ",");
    for (ColumnConfig columnConfig : this.columnConfigList) {
        // an empty request list means every column is requested
        if (CollectionUtils.isEmpty(catVariables) || isRequestColumn(catVariables, columnConfig)) {
            if (CommonUtils.isGoodCandidate(columnConfig, hasCandidates)) {
                rebinColumns.add(columnConfig);
            } else {
                log.warn("Column - {} is not a good candidate. Skip it.", columnConfig.getColumnName());
            }
        }
    }

    // iterating an empty list is a no-op, so no emptiness guard is needed
    for (ColumnConfig columnConfig : rebinColumns) {
        doReBin(columnConfig);
    }

    // use the merge ColumnConfig.json to replace current one
    saveColumnConfigList();
}

/**
 * Default mode: runs the statistics executor matching the configured run mode and then
 * refreshes the backup ColumnConfig.json.
 */
private void runDefaultStatsMode() throws Exception {
    AbstractStatsExecutor statsExecutor = createStatsExecutor();
    statsExecutor.doStats();

    // update the backup ColumnConfig.json after running stats
    String backupColumnConfigPath = this.pathFinder.getBackupColumnConfig();
    ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
    saveColumnConfigList(backupColumnConfigPath, this.columnConfigList);
}

/**
 * Picks the statistics executor: by binning algorithm in map-reduce run mode, the Akka worker in
 * local run mode.
 *
 * @return the executor to run; never null
 * @throws ShifuException with {@code ERROR_UNSUPPORT_MODE} when the run mode is neither
 *         map-reduce nor local
 */
private AbstractStatsExecutor createStatsExecutor() throws Exception {
    if (modelConfig.isMapReduceRunMode()) {
        if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.DynamicBinning)) {
            return new DIBStatsExecutor(this, modelConfig, columnConfigList);
        } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPat)) {
            return new MunroPatStatsExecutor(this, modelConfig, columnConfigList);
        } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPatI)) {
            return new MunroPatIStatsExecutor(this, modelConfig, columnConfigList);
        } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.SPDT)) {
            return new SPDTStatsExecutor(this, modelConfig, columnConfigList);
        } else {
            // SPDTI and every other algorithm use the SPDTI executor
            return new SPDTIStatsExecutor(this, modelConfig, columnConfigList);
        }
    } else if (modelConfig.isLocalRunMode()) {
        return new AkkaStatsWorker(this, modelConfig, columnConfigList);
    } else {
        throw new ShifuException(ShifuErrorCode.ERROR_UNSUPPORT_MODE);
    }
}
Aggregations