use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class StatsModelProcessor method run.
/**
 * Runs the stats step in one of four modes, chosen from the command parameters:
 * <ul>
 * <li>{@code Constants.IS_COMPUTE_CORR}: compute correlation, reusing an existing result when
 * {@code shifu.stats.corr.reuse} is enabled and the correlation output already exists;</li>
 * <li>{@code Constants.IS_COMPUTE_PSI}: run PSI when a PSI column name is configured;</li>
 * <li>{@code Constants.IS_REBIN}: re-bin the requested candidate columns;</li>
 * <li>otherwise: run the stats executor matching the configured run mode and binning algorithm.</li>
 * </ul>
 * When a mode completes without returning early, the current column config is backed up locally
 * and the configuration is synced again.
 *
 * @return 0 when the step finishes; -1 when {@code isMeanCalculated()} reports false in the
 *         correlation/PSI modes, or when any exception is raised inside the step (it is logged)
 * @throws Exception declared by the signature; exceptions raised inside the try block are caught,
 *         logged and reported as -1 instead of being propagated
 */
@Override
public int run() throws Exception {
    log.info("Step Start: stats");
    long start = System.currentTimeMillis();
    try {
        // 0. set up and sync to HDFS
        setUp(ModelStep.STATS);
        // resync ModelConfig.json/ColumnConfig.json to HDFS
        syncDataToHdfs(modelConfig.getDataSet().getSource());

        if (getBooleanParam(this.params, Constants.IS_COMPUTE_CORR)) {
            // 1. validate if run stats before run stats -correlation
            if (!isMeanCalculated()) {
                log.warn("Some mean value of column is null, could you check if you run 'shifu stats'.");
                return -1;
            }
            // 2. compute correlation
            log.info("Start computing correlation value ...");
            SourceType source = this.modelConfig.getDataSet().getSource();
            String corrPath = super.getPathFinder().getCorrelationPath(source);
            // check if can start from existing output
            boolean reuseCorrResult = Environment.getBoolean("shifu.stats.corr.reuse", Boolean.FALSE);
            if (reuseCorrResult && ShifuFileUtils.isFileExists(corrPath, SourceType.HDFS)) {
                dumpAndCalculateCorrelationResult(source, corrPath);
            } else {
                runCorrMapReduceJob();
            }
            // 3. save column config list
            saveColumnConfigList();
        } else if (getBooleanParam(this.params, Constants.IS_COMPUTE_PSI)) {
            if (!isMeanCalculated()) {
                log.warn("Some mean value of column is null, could you check if you run 'shifu stats'.");
                return -1;
            }
            if (StringUtils.isNotEmpty(modelConfig.getPsiColumnName())) {
                new MapReducerStatsWorker(this, modelConfig, columnConfigList).runPSI();
                // save column config list after running PSI successfully
                saveColumnConfigList();
            } else {
                log.warn("To Run PSI please set your PSI column in dataSet::psiColumnName.");
            }
        } else if (getBooleanParam(this.params, Constants.IS_REBIN)) {
            // run the re-binning
            String backupColumnConfigPath = this.pathFinder.getBackupColumnConfig();
            if (!ShifuFileUtils.isFileExists(new Path(backupColumnConfigPath), SourceType.LOCAL)) {
                // no backup yet: the current column config becomes the backup
                ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
                saveColumnConfigList(backupColumnConfigPath, this.columnConfigList);
            } else {
                // existing backup ColumnConfig.json, use binning info in it to do rebin
                List<ColumnConfig> backColumnConfigList = CommonUtils.loadColumnConfigList(backupColumnConfigPath,
                        SourceType.LOCAL, false);
                for (ColumnConfig backupColumnConfig : backColumnConfigList) {
                    for (ColumnConfig columnConfig : this.columnConfigList) {
                        if (NSColumnUtils.isColumnEqual(backupColumnConfig.getColumnName(),
                                columnConfig.getColumnName())) {
                            columnConfig.setColumnBinning(backupColumnConfig.getColumnBinning());
                        }
                    }
                }
            }
            // user provide candidate variable list or not
            boolean hasCandidates = CommonUtils.hasCandidateColumns(this.columnConfigList);
            List<ColumnConfig> rebinColumns = new ArrayList<ColumnConfig>();
            List<String> catVariables = getStringList(this.params, Constants.REQUEST_VARS, ",");
            for (ColumnConfig columnConfig : this.columnConfigList) {
                // an empty request list means "consider every column"
                if (CollectionUtils.isEmpty(catVariables) || isRequestColumn(catVariables, columnConfig)) {
                    if (CommonUtils.isGoodCandidate(columnConfig, hasCandidates)) {
                        rebinColumns.add(columnConfig);
                    } else {
                        log.warn("Column - {} is not a good candidate. Skip it.", columnConfig.getColumnName());
                    }
                }
            }
            // iterating an empty list is a no-op, so no emptiness guard is needed
            for (ColumnConfig columnConfig : rebinColumns) {
                doReBin(columnConfig);
            }
            // use the merge ColumnConfig.json to replace current one
            saveColumnConfigList();
        } else {
            AbstractStatsExecutor statsExecutor;
            if (modelConfig.isMapReduceRunMode()) {
                if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.DynamicBinning)) {
                    statsExecutor = new DIBStatsExecutor(this, modelConfig, columnConfigList);
                } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPat)) {
                    statsExecutor = new MunroPatStatsExecutor(this, modelConfig, columnConfigList);
                } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPatI)) {
                    statsExecutor = new MunroPatIStatsExecutor(this, modelConfig, columnConfigList);
                } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.SPDT)) {
                    statsExecutor = new SPDTStatsExecutor(this, modelConfig, columnConfigList);
                } else {
                    // SPDTI is both an explicit choice and the fallback for any other algorithm
                    statsExecutor = new SPDTIStatsExecutor(this, modelConfig, columnConfigList);
                }
            } else if (modelConfig.isLocalRunMode()) {
                statsExecutor = new AkkaStatsWorker(this, modelConfig, columnConfigList);
            } else {
                throw new ShifuException(ShifuErrorCode.ERROR_UNSUPPORT_MODE);
            }
            statsExecutor.doStats();
            // update the backup ColumnConfig.json after running stats
            String backupColumnConfigPath = this.pathFinder.getBackupColumnConfig();
            ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
            saveColumnConfigList(backupColumnConfigPath, this.columnConfigList);
        }

        // back up current column config each time as stats will always change CC.json
        this.backupCurrentColumnConfigToLocal(SDF.format(new Date()));
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.STATS);
    } catch (ShifuException e) {
        // concatenate the error code directly: a missing code prints "null" instead of
        // raising a NullPointerException that would hide the original failure
        log.error("Error:" + e.getError() + "; msg:" + e.getMessage(), e);
        return -1;
    } catch (Exception e) {
        log.error("Error:" + e.getMessage(), e);
        return -1;
    }
    log.info("Step Finished: stats with {} ms", (System.currentTimeMillis() - start));
    return 0;
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class VarSelectModelProcessor method run.
/**
 * Runs the variable selection step.
 * <p>
 * The first matching action is executed: reset all selections, list the currently selected
 * variables, run the auto filter, recover auto-filtered variables from history, or - when none of
 * those is requested - perform the actual selection. For regression models the selection follows
 * {@code modelConfig.getVarSelectFilterBy()}; otherwise force-selected or good-candidate columns
 * are marked as final selected.
 *
 * @return 0 when the step finishes; -1 when any exception is raised inside the step (it is logged)
 * @throws Exception declared by the signature; exceptions raised inside the try block are caught,
 *         logged and reported as -1 instead of being propagated
 */
@Override
public int run() throws Exception {
    log.info("Step Start: varselect");
    long start = System.currentTimeMillis();
    try {
        setUp(ModelStep.VARSELECT);
        validateParameters();
        // reset all selections if user specify or select by absolute number
        if (getIsToReset()) {
            log.info("Reset all selections data including type final select etc!");
            resetAllFinalSelect();
        } else if (getIsToList()) {
            log.info("Below variables are selected - ");
            for (ColumnConfig columnConfig : this.columnConfigList) {
                if (columnConfig.isFinalSelect()) {
                    log.info(columnConfig.getColumnName());
                }
            }
            log.info("----- Done -----");
        } else if (getIsToAutoFilter()) {
            log.info("Start to run variable auto filter.");
            runAutoVarFilter();
            log.info("----- Done -----");
        } else if (getIsRecoverAuto()) {
            String varselHistory = pathFinder.getVarSelHistory();
            if (ShifuFileUtils.isFileExists(varselHistory, SourceType.LOCAL)) {
                log.info("!!! Auto filtered variables will be recovered from history.");
                recoverVarselStatusFromHist(varselHistory);
                log.info("----- Done -----");
            } else {
                log.warn("No variables auto filter history is found.");
            }
        } else {
            // sync to make sure load from hdfs config is consistent with local configuration
            syncDataToHdfs(super.modelConfig.getDataSet().getSource());
            String filterExpressions = super.modelConfig.getSegmentFilterExpressionsAsString();
            Environment.getProperties().put("shifu.segment.expressions", filterExpressions);
            if (StringUtils.isNotBlank(filterExpressions)) {
                String[] splits = CommonUtils.split(filterExpressions,
                        Constants.SHIFU_STATS_FILTER_EXPRESSIONS_DELIMETER);
                // loop invariant; presumably the list holds the raw columns followed by one
                // copy of them per segment expression - verify against the stats step
                int rawSize = super.columnConfigList.size() / (1 + splits.length);
                for (int i = 0; i < super.columnConfigList.size(); i++) {
                    ColumnConfig config = super.columnConfigList.get(i);
                    if (config.isTarget()) {
                        // force-remove the counterpart of the first target column in every segment
                        for (int j = 0; j < splits.length; j++) {
                            ColumnConfig otherConfig = super.columnConfigList.get((j + 1) * rawSize + i);
                            otherConfig.setColumnFlag(ColumnFlag.ForceRemove);
                            otherConfig.setFinalSelect(false);
                        }
                        break;
                    }
                }
                this.saveColumnConfigList();
                // sync to make sure load from hdfs config is consistent with local configuration
                syncDataToHdfs(super.modelConfig.getDataSet().getSource());
            }

            if (modelConfig.isRegression()) {
                String filterBy = this.modelConfig.getVarSelectFilterBy();
                if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_KS)
                        || filterBy.equalsIgnoreCase(Constants.FILTER_BY_IV)
                        || filterBy.equalsIgnoreCase(Constants.FILTER_BY_PARETO)
                        || filterBy.equalsIgnoreCase(Constants.FILTER_BY_MIX)) {
                    VariableSelector selector = new VariableSelector(this.modelConfig, this.columnConfigList);
                    this.columnConfigList = selector.selectByFilter();
                } else if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_FI)) {
                    if (!CommonUtils.isTreeModel(modelConfig.getAlgorithm())) {
                        throw new IllegalArgumentException("Filter by FI only works well in GBT/RF. Please check your modelconfig::train.");
                    }
                    selectByFeatureImportance();
                } else if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_SE)
                        || filterBy.equalsIgnoreCase(Constants.FILTER_BY_ST)) {
                    if (!Constants.NN.equalsIgnoreCase(modelConfig.getAlgorithm())
                            && !Constants.LR.equalsIgnoreCase(modelConfig.getAlgorithm())) {
                        throw new IllegalArgumentException("Filter by SE/ST only works well in NN/LR. Please check your modelconfig::train.");
                    }
                    int recursiveCnt = getRecursiveCnt();
                    // create varsel directory and write original copy of ColumnConfig.json
                    ShifuFileUtils.createDirIfNotExists(pathFinder.getVarSelDir(), SourceType.LOCAL);
                    super.saveColumnConfigList(pathFinder.getVarSelColumnConfig(0), this.columnConfigList);
                    // round r uses train log / se history index r and writes backup copy r + 1
                    for (int round = 0; round < recursiveCnt; round++) {
                        String trainLogFile = TRAIN_LOG_PREFIX + "-" + round + ".log";
                        distributedSEWrapper(trainLogFile);
                        // copy training log to SE train.log
                        ShifuFileUtils.move(trainLogFile,
                                new File(pathFinder.getVarSelDir(), trainLogFile).getPath(), SourceType.LOCAL);
                        String varSelectMSEOutputPath = pathFinder
                                .getVarSelectMSEOutputPath(modelConfig.getDataSet().getSource());
                        // even fail to run SE, still to create an empty se.x file
                        String varSelMSEHistPath = pathFinder.getVarSelMSEHistPath(round);
                        ShifuFileUtils.createFileIfNotExists(varSelMSEHistPath, SourceType.LOCAL);
                        ShifuFileUtils.copyToLocal(
                                new SourceFile(varSelectMSEOutputPath, modelConfig.getDataSet().getSource()),
                                Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME, varSelMSEHistPath);
                        // save as backup
                        super.saveColumnConfigList(pathFinder.getVarSelColumnConfig(round + 1),
                                this.columnConfigList);
                        // save as current copy
                        super.saveColumnConfigList();
                    }
                } else if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_VOTED)) {
                    votedVariablesSelection();
                }
            } else {
                boolean hasCandidates = CommonUtils.hasCandidateColumns(this.columnConfigList);
                if (this.modelConfig.getVarSelect().getForceEnable()
                        && CollectionUtils.isNotEmpty(this.modelConfig.getListForceSelect())) {
                    log.info("Force Selection is enabled ... "
                            + "for multi-classification, currently only use it to selected variables.");
                    for (ColumnConfig config : this.columnConfigList) {
                        if (config.isForceSelect()) {
                            // a poor candidate is still selected, but the user is warned
                            if (!CommonUtils.isGoodCandidate(config, hasCandidates, modelConfig.isRegression())) {
                                log.warn("!! Variable - {} is not a good candidate. But it is in forceselect list",
                                        config.getColumnName());
                            }
                            config.setFinalSelect(true);
                        }
                    }
                    log.info("{} variables are selected by force.", this.modelConfig.getListForceSelect().size());
                } else {
                    // multiple classification, select all candidate at first, TODO add SE for multi-classification
                    for (ColumnConfig config : this.columnConfigList) {
                        if (CommonUtils.isGoodCandidate(config, hasCandidates, modelConfig.isRegression())) {
                            config.setFinalSelect(true);
                        }
                    }
                }
            }
            // clean shadow targets for multi-segments
            cleanShadowTargetsForSegments();
            if (modelConfig.getVarSelect().getAutoFilterEnable()) {
                runAutoVarFilter();
            }
        }
        // save column config to file and sync to
        clearUp(ModelStep.VARSELECT);
    } catch (ShifuException e) {
        // concatenate the error code directly: a missing code prints "null" instead of
        // raising a NullPointerException that would hide the original failure
        log.error("Error:" + e.getError() + "; msg:" + e.getMessage(), e);
        return -1;
    } catch (Exception e) {
        log.error("Error:" + e.getMessage(), e);
        return -1;
    }
    log.info("Step Finished: varselect with {} ms", (System.currentTimeMillis() - start));
    return 0;
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class AkkaStatsWorker method doStats.
/**
 * Opens scanners over the raw data set, submits the stats calculation job to the Akka executor
 * and closes the scanners afterwards, whether or not the job succeeded.
 *
 * @return always {@code true} when the job submission returns normally
 * @throws ShifuException with {@code ERROR_INPUT_NOT_FOUND} when the input cannot be opened or
 *         no scanner is available
 * @throws Exception any failure raised by the job submission or while closing the scanners
 */
@Override
public boolean doStats() throws Exception {
    List<Scanner> scanners = null;
    try {
        RawSourceData.SourceType sourceType = modelConfig.getDataSet().getSource();
        // the bug is caused when merging code? please take care
        scanners = ShifuFileUtils.getDataScanners(
                ShifuFileUtils.expandPath(modelConfig.getDataSetRawPath(), sourceType), sourceType);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, e);
    }
    if (CollectionUtils.isEmpty(scanners)) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, ", please check your data and start from init");
    }
    log.info("Num of Scanners: " + scanners.size());
    try {
        AkkaSystemExecutor.getExecutor().submitStatsCalJob(modelConfig, columnConfigList, scanners);
    } finally {
        // release the scanners even when the job fails, so the underlying inputs are not leaked
        processor.closeScanners(scanners);
    }
    return true;
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class NormalizeModelProcessor method runAkkaNormalize.
/**
 * Runs the Akka normalize process: removes previous normalized / selected-raw outputs, opens
 * scanners over the raw data set, submits the normalize job and closes the scanners afterwards,
 * whether or not the job succeeded.
 *
 * @throws IOException if deleting the previous outputs, running the job or closing the scanners
 *         fails with an I/O error
 * @throws ShifuException with {@code ERROR_INPUT_NOT_FOUND} when the input cannot be opened or
 *         no scanner is available
 */
private void runAkkaNormalize() throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getNormalizedDataPath(), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getSelectedRawDataPath(), sourceType);
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(
                ShifuFileUtils.expandPath(modelConfig.getDataSetRawPath(), sourceType), sourceType);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, e, ", could not get input files " + modelConfig.getDataSetRawPath());
    }
    if (scanners == null || scanners.isEmpty()) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, ", please check the data in " + modelConfig.getDataSetRawPath() + " in " + sourceType);
    }
    try {
        AkkaSystemExecutor.getExecutor().submitNormalizeJob(modelConfig, columnConfigList, scanners);
    } finally {
        // release the scanners even when the job fails, so the underlying inputs are not leaked
        closeScanners(scanners);
    }
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class AddColumnNumAndFilterUDF method exec.
/**
 * Expands one input record into a bag holding one tuple per valid column, plus one extra tuple
 * per column for every segment expression whose purifier accepts the record.
 * <p>
 * The record is dropped ({@code null} is returned) when it is {@code null}, when its size does not
 * match the column config list, when its tag is missing or invalid, or when it is sampled out by
 * the binning sample rate.
 *
 * @param input the raw record, expected to carry one field per column config
 * @return the bag of per-column tuples, or {@code null} when the record is dropped
 * @throws IOException if reading a field from the input tuple fails
 * @throws ShifuException with {@code ERROR_NO_EQUAL_COLCONFIG} once more than
 *         {@code MAX_MISMATCH_CNT} records had a mismatching size
 */
@SuppressWarnings("deprecation")
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null) {
        return null;
    }
    int size = input.size();
    if (size == 0 || size != this.columnConfigList.size()) {
        log.error("the input size - " + size + ", while column size - " + columnConfigList.size());
        this.mismatchCnt++;
        // this could make Shifu could skip some malformed data
        if (this.mismatchCnt > MAX_MISMATCH_CNT) {
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }
        return null;
    }

    // read the tag field once; it is reused for the null check, the log line and the trimmed tag
    Object rawTag = input.get(tagColumnNum);
    if (rawTag == null) {
        log.error("tagColumnNum is " + tagColumnNum + "; input size is " + size + "; columnConfigList.size() is " + columnConfigList.size() + "; tuple is" + input.toDelimitedString("|") + "; tag is " + rawTag);
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    String tag = CommonUtils.trimTag(rawTag.toString());
    // a linear target needs a numeric tag; any other target needs a tag from the configured set
    boolean validTag = this.isLinearTarget ? NumberUtils.isNumber(tag) : super.tagSet.contains(tag);
    if (!validTag) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    Double rate = modelConfig.getBinningSampleRate();
    if (!this.isLinearTarget && !modelConfig.isClassification() && modelConfig.isBinningSampleNegOnly()) {
        // negative-only sampling: only records with a negative tag can be sampled out
        if (super.negTagSet.contains(tag) && random.nextDouble() > rate) {
            return null;
        }
    } else {
        if (random.nextDouble() > rate) {
            return null;
        }
    }

    // evaluate every segment expression once per record, not once per column
    List<Boolean> filterResultList = null;
    if (this.isForExpressions) {
        filterResultList = new ArrayList<Boolean>(this.dataPurifiers.size());
        for (int j = 0; j < this.dataPurifiers.size(); j++) {
            DataPurifier dataPurifier = this.dataPurifiers.get(j);
            filterResultList.add(dataPurifier.isFilter(input));
        }
    }

    // the record passed all checks: only now obtain the output bag and the tuple factory
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    TupleFactory tupleFactory = TupleFactory.getInstance();
    boolean isRegression = modelConfig.isRegression();
    boolean isPositiveInst = (isRegression && super.posTagSet.contains(tag));
    for (int i = 0; i < size; i++) {
        ColumnConfig config = columnConfigList.get(i);
        if (!isValidRecord(isRegression, isPositiveInst, config)) {
            continue;
        }
        bag.add(buildTuple(input, tupleFactory, tag, i, i));
        if (this.isForExpressions) {
            for (int j = 0; j < this.dataPurifiers.size(); j++) {
                Boolean isFilter = filterResultList.get(j);
                if (isFilter != null && isFilter) {
                    // segment j gets column number (j + 1) * size + i for raw column i
                    bag.add(buildTuple(input, tupleFactory, tag, i, (j + 1) * size + i));
                }
            }
        }
    }
    return bag;
}
Aggregations