Search in sources :

Example 1 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class StatsModelProcessor method run.

/**
 * Runs the "stats" step. After set-up and an initial sync to HDFS, exactly one of four modes is executed,
 * chosen by the boolean flags read from {@code params}:
 * <ol>
 * <li>{@code Constants.IS_COMPUTE_CORR}: compute correlation values, then save the column config list;</li>
 * <li>{@code Constants.IS_COMPUTE_PSI}: run PSI through {@code MapReducerStatsWorker} when a PSI column name is
 * configured, then save the column config list;</li>
 * <li>{@code Constants.IS_REBIN}: re-bin the requested (or all) good-candidate columns and save the result;</li>
 * <li>otherwise: run the full statistics with an executor picked from run mode and binning algorithm, and
 * refresh the backup column config.</li>
 * </ol>
 * When the selected mode completes, the current column config is backed up locally under a timestamp, data is
 * synced to HDFS again and {@code clearUp} is called.
 *
 * @return 0 when the step finishes; -1 when {@code isMeanCalculated()} is false in correlation/PSI mode or when
 *         any exception is caught inside the try block
 * @throws Exception declared by the signature; exceptions raised inside the try block are logged and converted
 *         into a -1 return value instead of being propagated
 */
@Override
public int run() throws Exception {
    log.info("Step Start: stats");
    // start timestamp, only used for the elapsed-time log line at the end
    long start = System.currentTimeMillis();
    try {
        // 0. set up and sync to HDFS
        setUp(ModelStep.STATS);
        // resync ModelConfig.json/ColumnConfig.json to HDFS
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        if (getBooleanParam(this.params, Constants.IS_COMPUTE_CORR)) {
            // 1. validate if run stats before run stats -correlation
            boolean foundValidMeanValueColumn = isMeanCalculated();
            if (!foundValidMeanValueColumn) {
                log.warn("Some mean value of column is null, could you check if you run 'shifu stats'.");
                // early exit: the backup/sync/clearUp calls at the end of the try block are skipped on this path
                return -1;
            }
            // 2. compute correlation
            log.info("Start computing correlation value ...");
            SourceType source = this.modelConfig.getDataSet().getSource();
            String corrPath = super.getPathFinder().getCorrelationPath(source);
            // reuse an existing correlation output only when 'shifu.stats.corr.reuse' is enabled (default false)
            // and the correlation path already exists on HDFS; otherwise call runCorrMapReduceJob()
            boolean reuseCorrResult = Environment.getBoolean("shifu.stats.corr.reuse", Boolean.FALSE);
            if (reuseCorrResult && ShifuFileUtils.isFileExists(corrPath, SourceType.HDFS)) {
                dumpAndCalculateCorrelationResult(source, corrPath);
            } else {
                runCorrMapReduceJob();
            }
            // 3. save column config list
            saveColumnConfigList();
        } else if (getBooleanParam(this.params, Constants.IS_COMPUTE_PSI)) {
            // PSI mode has the same precondition as correlation mode
            boolean foundValidMeanValueColumn = isMeanCalculated();
            if (!foundValidMeanValueColumn) {
                log.warn("Some mean value of column is null, could you check if you run 'shifu stats'.");
                // early exit: the backup/sync/clearUp calls at the end of the try block are skipped on this path
                return -1;
            }
            if (StringUtils.isNotEmpty(modelConfig.getPsiColumnName())) {
                new MapReducerStatsWorker(this, modelConfig, columnConfigList).runPSI();
                // save column config list after running PSI successfully
                saveColumnConfigList();
            } else {
                // no PSI column configured: only warn; the method still falls through to the common tail
                // and returns 0
                log.warn("To Run PSI please set your PSI column in dataSet::psiColumnName.");
            }
        } else if (getBooleanParam(this.params, Constants.IS_REBIN)) {
            // run the re-binning
            String backupColumnConfigPath = this.pathFinder.getBackupColumnConfig();
            if (!ShifuFileUtils.isFileExists(new Path(backupColumnConfigPath), SourceType.LOCAL)) {
                // no local backup yet: create the Constants.TMP directory if needed and write the current
                // column config list to the backup path
                ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
                saveColumnConfigList(backupColumnConfigPath, this.columnConfigList);
            } else {
                // existing backup ColumnConfig.json, use binning info in it to do rebin
                List<ColumnConfig> backColumnConfigList = CommonUtils.loadColumnConfigList(backupColumnConfigPath, SourceType.LOCAL, false);
                // copy the backed-up binning into every current column whose name matches
                // (nested scan over both lists)
                for (ColumnConfig backupColumnConfig : backColumnConfigList) {
                    for (ColumnConfig columnConfig : this.columnConfigList) {
                        if (NSColumnUtils.isColumnEqual(backupColumnConfig.getColumnName(), columnConfig.getColumnName())) {
                            columnConfig.setColumnBinning(backupColumnConfig.getColumnBinning());
                        }
                    }
                }
            }
            // user provide candidate variable list or not
            boolean hasCandidates = CommonUtils.hasCandidateColumns(this.columnConfigList);
            List<ColumnConfig> rebinColumns = new ArrayList<ColumnConfig>();
            // comma-separated variable names taken from Constants.REQUEST_VARS; an empty list means
            // "consider every column"
            List<String> catVariables = getStringList(this.params, Constants.REQUEST_VARS, ",");
            for (ColumnConfig columnConfig : this.columnConfigList) {
                if (CollectionUtils.isEmpty(catVariables) || isRequestColumn(catVariables, columnConfig)) {
                    if (CommonUtils.isGoodCandidate(columnConfig, hasCandidates)) {
                        rebinColumns.add(columnConfig);
                    } else {
                        log.warn("Column - {} is not a good candidate. Skip it.", columnConfig.getColumnName());
                    }
                }
            }
            if (CollectionUtils.isNotEmpty(rebinColumns)) {
                for (ColumnConfig columnConfig : rebinColumns) {
                    doReBin(columnConfig);
                }
            }
            // use the merge ColumnConfig.json to replace current one
            saveColumnConfigList();
        } else {
            // default mode: full statistics
            AbstractStatsExecutor statsExecutor = null;
            if (modelConfig.isMapReduceRunMode()) {
                // pick the executor by binning algorithm; the explicit SPDTI branch and the final fallback
                // both create an SPDTIStatsExecutor
                if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.DynamicBinning)) {
                    statsExecutor = new DIBStatsExecutor(this, modelConfig, columnConfigList);
                } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPat)) {
                    statsExecutor = new MunroPatStatsExecutor(this, modelConfig, columnConfigList);
                } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPatI)) {
                    statsExecutor = new MunroPatIStatsExecutor(this, modelConfig, columnConfigList);
                } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.SPDT)) {
                    statsExecutor = new SPDTStatsExecutor(this, modelConfig, columnConfigList);
                } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.SPDTI)) {
                    statsExecutor = new SPDTIStatsExecutor(this, modelConfig, columnConfigList);
                } else {
                    statsExecutor = new SPDTIStatsExecutor(this, modelConfig, columnConfigList);
                }
            } else if (modelConfig.isLocalRunMode()) {
                statsExecutor = new AkkaStatsWorker(this, modelConfig, columnConfigList);
            } else {
                // neither MapReduce nor local run mode; caught below and reported as -1
                throw new ShifuException(ShifuErrorCode.ERROR_UNSUPPORT_MODE);
            }
            // NOTE(review): the boolean returned by doStats() is ignored here - confirm that is intended
            statsExecutor.doStats();
            // update the backup ColumnConfig.json after running stats
            String backupColumnConfigPath = this.pathFinder.getBackupColumnConfig();
            ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
            saveColumnConfigList(backupColumnConfigPath, this.columnConfigList);
        }
        // back up current column config each time as stats will always change CC.json
        this.backupCurrentColumnConfigToLocal(SDF.format(new Date()));
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.STATS);
    } catch (ShifuException e) {
        // Shifu-specific failure: log the error code together with the message
        log.error("Error:" + e.getError().toString() + "; msg:" + e.getMessage(), e);
        return -1;
    } catch (Exception e) {
        // any other failure is logged and reported through the return code as well
        log.error("Error:" + e.getMessage(), e);
        return -1;
    }
    log.info("Step Finished: stats with {} ms", (System.currentTimeMillis() - start));
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) MapReducerStatsWorker(ml.shifu.shifu.core.processor.stats.MapReducerStatsWorker) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) ArrayList(java.util.ArrayList) AbstractStatsExecutor(ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor) AkkaStatsWorker(ml.shifu.shifu.core.processor.stats.AkkaStatsWorker) Date(java.util.Date) ShifuException(ml.shifu.shifu.exception.ShifuException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) JexlException(org.apache.commons.jexl2.JexlException) IOException(java.io.IOException) DIBStatsExecutor(ml.shifu.shifu.core.processor.stats.DIBStatsExecutor) MunroPatIStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor) SPDTIStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTIStatsExecutor) SPDTStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTStatsExecutor) MunroPatStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor) SourceFile(ml.shifu.shifu.fs.SourceFile) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 2 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class VarSelectModelProcessor method run.

/**
 * Runs the "varselect" step. After set-up and parameter validation one of the following is executed:
 * <ul>
 * <li>reset all final selections ({@code getIsToReset()});</li>
 * <li>log the names of the columns currently marked as final-selected ({@code getIsToList()});</li>
 * <li>run the automatic variable filter ({@code getIsToAutoFilter()});</li>
 * <li>recover the selection status from the local varsel history file, if it exists
 * ({@code getIsRecoverAuto()});</li>
 * <li>otherwise: the regular selection. For regression models the strategy is chosen by
 * {@code modelConfig.getVarSelectFilterBy()} (KS/IV/Pareto/Mix filter, feature importance, SE/ST sensitivity,
 * or voted selection); for other models columns are selected either from the force-select list or by
 * taking every good candidate.</li>
 * </ul>
 *
 * @return 0 when the step finishes; -1 when any exception is caught inside the try block
 * @throws Exception declared by the signature; exceptions raised inside the try block are logged and converted
 *         into a -1 return value instead of being propagated
 */
@Override
public int run() throws Exception {
    log.info("Step Start: varselect");
    // start timestamp, only used for the elapsed-time log line at the end
    long start = System.currentTimeMillis();
    try {
        setUp(ModelStep.VARSELECT);
        validateParameters();
        // reset all selections if user specify or select by absolute number
        if (getIsToReset()) {
            log.info("Reset all selections data including type final select etc!");
            resetAllFinalSelect();
        } else if (getIsToList()) {
            // list mode: read-only, just print the names of the final-selected columns
            log.info("Below variables are selected - ");
            for (ColumnConfig columnConfig : this.columnConfigList) {
                if (columnConfig.isFinalSelect()) {
                    log.info(columnConfig.getColumnName());
                }
            }
            log.info("-----  Done -----");
        } else if (getIsToAutoFilter()) {
            log.info("Start to run variable auto filter.");
            runAutoVarFilter();
            log.info("-----  Done -----");
        } else if (getIsRecoverAuto()) {
            // recover mode: only acts when the local varsel history file exists, otherwise just warns
            String varselHistory = pathFinder.getVarSelHistory();
            if (ShifuFileUtils.isFileExists(varselHistory, SourceType.LOCAL)) {
                log.info("!!! Auto filtered variables will be recovered from history.");
                recoverVarselStatusFromHist(varselHistory);
                log.info("-----  Done -----");
            } else {
                log.warn("No variables auto filter history is found.");
            }
        } else {
            // sync to make sure load from hdfs config is consistent with local configuration
            syncDataToHdfs(super.modelConfig.getDataSet().getSource());
            String filterExpressions = super.modelConfig.getSegmentFilterExpressionsAsString();
            // publish the segment expressions through the shared Environment properties
            // NOTE(review): the value is stored before the blank check below - confirm it can never be null
            Environment.getProperties().put("shifu.segment.expressions", filterExpressions);
            if (StringUtils.isNotBlank(filterExpressions)) {
                String[] splits = CommonUtils.split(filterExpressions, Constants.SHIFU_STATS_FILTER_EXPRESSIONS_DELIMETER);
                // presumably columnConfigList holds the raw columns followed by one copy per segment
                // expression, so rawSize is the number of raw columns - TODO confirm against the stats step
                for (int i = 0; i < super.columnConfigList.size(); i++) {
                    ColumnConfig config = super.columnConfigList.get(i);
                    int rawSize = super.columnConfigList.size() / (1 + splits.length);
                    if (config.isTarget()) {
                        // for the first target column found, force-remove and de-select the column at the
                        // same offset inside each of the splits.length following blocks
                        for (int j = 0; j < splits.length; j++) {
                            ColumnConfig otherConfig = super.columnConfigList.get((j + 1) * rawSize + i);
                            otherConfig.setColumnFlag(ColumnFlag.ForceRemove);
                            otherConfig.setFinalSelect(false);
                        }
                        // only the first target column is processed
                        break;
                    }
                }
                this.saveColumnConfigList();
                // sync to make sure load from hdfs config is consistent with local configuration
                syncDataToHdfs(super.modelConfig.getDataSet().getSource());
            }
            if (modelConfig.isRegression()) {
                // NOTE(review): filterBy is dereferenced below without a null check
                String filterBy = this.modelConfig.getVarSelectFilterBy();
                if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_KS) || filterBy.equalsIgnoreCase(Constants.FILTER_BY_IV) || filterBy.equalsIgnoreCase(Constants.FILTER_BY_PARETO) || filterBy.equalsIgnoreCase(Constants.FILTER_BY_MIX)) {
                    // statistic-based filters are delegated to VariableSelector, whose result replaces the
                    // in-memory column config list
                    VariableSelector selector = new VariableSelector(this.modelConfig, this.columnConfigList);
                    this.columnConfigList = selector.selectByFilter();
                } else if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_FI)) {
                    // feature-importance selection is rejected unless the configured algorithm is a tree model
                    if (!CommonUtils.isTreeModel(modelConfig.getAlgorithm())) {
                        throw new IllegalArgumentException("Filter by FI only works well in GBT/RF. Please check your modelconfig::train.");
                    }
                    selectByFeatureImportance();
                } else if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_SE) || filterBy.equalsIgnoreCase(Constants.FILTER_BY_ST)) {
                    // SE/ST selection is rejected unless the configured algorithm is NN or LR
                    if (!Constants.NN.equalsIgnoreCase(modelConfig.getAlgorithm()) && !Constants.LR.equalsIgnoreCase(modelConfig.getAlgorithm())) {
                        throw new IllegalArgumentException("Filter by SE/ST only works well in NN/LR. Please check your modelconfig::train.");
                    }
                    int recursiveCnt = getRecursiveCnt();
                    int i = 0;
                    // create varsel directory and write original copy of ColumnConfig.json
                    ShifuFileUtils.createDirIfNotExists(pathFinder.getVarSelDir(), SourceType.LOCAL);
                    super.saveColumnConfigList(pathFinder.getVarSelColumnConfig(i), this.columnConfigList);
                    // i is incremented by the loop condition, so inside the body i is the 1-based round number
                    // and (i - 1) is the 0-based index used for the log file and the MSE history file
                    while ((i++) < recursiveCnt) {
                        String trainLogFile = TRAIN_LOG_PREFIX + "-" + (i - 1) + ".log";
                        distributedSEWrapper(trainLogFile);
                        // move the training log of this round into the local varsel directory
                        ShifuFileUtils.move(trainLogFile, new File(pathFinder.getVarSelDir(), trainLogFile).getPath(), SourceType.LOCAL);
                        String varSelectMSEOutputPath = pathFinder.getVarSelectMSEOutputPath(modelConfig.getDataSet().getSource());
                        // even fail to run SE, still to create an empty se.x file
                        String varSelMSEHistPath = pathFinder.getVarSelMSEHistPath(i - 1);
                        ShifuFileUtils.createFileIfNotExists(varSelMSEHistPath, SourceType.LOCAL);
                        ShifuFileUtils.copyToLocal(new SourceFile(varSelectMSEOutputPath, modelConfig.getDataSet().getSource()), Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME, varSelMSEHistPath);
                        // save as backup
                        super.saveColumnConfigList(pathFinder.getVarSelColumnConfig(i), this.columnConfigList);
                        // save as current copy
                        super.saveColumnConfigList();
                    }
                } else if (filterBy.equalsIgnoreCase(Constants.FILTER_BY_VOTED)) {
                    votedVariablesSelection();
                }
                // any other filterBy value falls through without selecting anything
            } else {
                // non-regression models (the log message and comment below refer to multi-classification)
                boolean hasCandidates = CommonUtils.hasCandidateColumns(this.columnConfigList);
                if (this.modelConfig.getVarSelect().getForceEnable() && CollectionUtils.isNotEmpty(this.modelConfig.getListForceSelect())) {
                    log.info("Force Selection is enabled ... " + "for multi-classification, currently only use it to selected variables.");
                    for (ColumnConfig config : this.columnConfigList) {
                        if (config.isForceSelect()) {
                            // a force-selected column is kept even when it is not a good candidate; only warn
                            if (!CommonUtils.isGoodCandidate(config, hasCandidates, modelConfig.isRegression())) {
                                log.warn("!! Variable - {} is not a good candidate. But it is in forceselect list", config.getColumnName());
                            }
                            config.setFinalSelect(true);
                        }
                    }
                    log.info("{} variables are selected by force.", this.modelConfig.getListForceSelect().size());
                } else {
                    // multiple classification, select all candidate at first, TODO add SE for multi-classification
                    for (ColumnConfig config : this.columnConfigList) {
                        if (CommonUtils.isGoodCandidate(config, hasCandidates, modelConfig.isRegression())) {
                            config.setFinalSelect(true);
                        }
                    }
                }
            }
            // clean shadow targets for multi-segments
            cleanShadowTargetsForSegments();
            // optionally run the automatic variable filter on top of the selection made above
            if (modelConfig.getVarSelect().getAutoFilterEnable()) {
                runAutoVarFilter();
            }
        }
        // finish the step; presumably clearUp saves the column config and syncs it - confirm in clearUp
        clearUp(ModelStep.VARSELECT);
    } catch (ShifuException e) {
        // Shifu-specific failure: log the error code together with the message
        log.error("Error:" + e.getError().toString() + "; msg:" + e.getMessage(), e);
        return -1;
    } catch (Exception e) {
        // any other failure (including the IllegalArgumentExceptions thrown above) is logged and
        // reported through the return code
        log.error("Error:" + e.getMessage(), e);
        return -1;
    }
    log.info("Step Finished: varselect with {} ms", (System.currentTimeMillis() - start));
    return 0;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) VariableSelector(ml.shifu.shifu.core.VariableSelector) SourceFile(ml.shifu.shifu.fs.SourceFile) SourceFile(ml.shifu.shifu.fs.SourceFile) File(java.io.File) ShifuException(ml.shifu.shifu.exception.ShifuException) ShifuException(ml.shifu.shifu.exception.ShifuException) JexlException(org.apache.commons.jexl2.JexlException) IOException(java.io.IOException)

Example 3 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class AkkaStatsWorker method doStats.

/**
 * Opens scanners over the raw data set and submits the statistics calculation job to the Akka executor.
 * The scanners are always handed back to {@code processor.closeScanners} once they have been opened
 * successfully, even when the submitted job fails.
 *
 * @return {@code true} when the job was submitted and returned without throwing
 * @throws ShifuException with {@code ERROR_INPUT_NOT_FOUND} when the input cannot be opened or no scanner
 *         could be created
 * @throws Exception any failure raised by the submitted job or while closing the scanners
 */
@Override
public boolean doStats() throws Exception {
    List<Scanner> scanners = null;
    try {
        RawSourceData.SourceType sourceType = modelConfig.getDataSet().getSource();
        // the raw data path is expanded before the scanners are created
        scanners = ShifuFileUtils.getDataScanners(ShifuFileUtils.expandPath(modelConfig.getDataSetRawPath(), sourceType), sourceType);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, e);
    }
    if (CollectionUtils.isEmpty(scanners)) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, ", please check your data and start from init");
    }
    log.info("Num of Scanners: " + scanners.size());
    try {
        AkkaSystemExecutor.getExecutor().submitStatsCalJob(modelConfig, columnConfigList, scanners);
    } finally {
        // release the scanners on both the success and the failure path so they are not leaked
        processor.closeScanners(scanners);
    }
    return true;
}
Also used : Scanner(java.util.Scanner) IOException(java.io.IOException) RawSourceData(ml.shifu.shifu.container.obj.RawSourceData) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 4 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class NormalizeModelProcessor method runAkkaNormalize.

/**
 * Runs the Akka-based normalize process: deletes the previous normalized and selected-raw-data outputs,
 * opens scanners over the raw data set and submits the normalize job to the Akka executor. Once the
 * scanners have been opened successfully they are always closed, even when the submitted job fails.
 *
 * @throws IOException if deleting the previous outputs fails
 * @throws ShifuException with {@code ERROR_INPUT_NOT_FOUND} when the input files cannot be opened or no
 *         scanner could be created
 */
private void runAkkaNormalize() throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    // remove the outputs of a previous run before producing new ones
    ShifuFileUtils.deleteFile(pathFinder.getNormalizedDataPath(), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getSelectedRawDataPath(), sourceType);
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(ShifuFileUtils.expandPath(modelConfig.getDataSetRawPath(), sourceType), sourceType);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, e, ", could not get input files " + modelConfig.getDataSetRawPath());
    }
    if (scanners == null || scanners.size() == 0) {
        throw new ShifuException(ShifuErrorCode.ERROR_INPUT_NOT_FOUND, ", please check the data in " + modelConfig.getDataSetRawPath() + " in " + sourceType);
    }
    try {
        AkkaSystemExecutor.getExecutor().submitNormalizeJob(modelConfig, columnConfigList, scanners);
    } finally {
        // release the scanners on both the success and the failure path so they are not leaked
        closeScanners(scanners);
    }
}
Also used : Scanner(java.util.Scanner) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 5 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class AddColumnNumAndFilterUDF method exec.

/**
 * Expands one input record into a bag of per-column tuples.
 * <p>
 * The record is dropped ({@code null} is returned) when the input is {@code null}, when its size is zero or
 * differs from the column config list size, when the tag value is missing or not acceptable, or when the
 * record is sampled out according to the binning sample rate. Otherwise one tuple is built through
 * {@code buildTuple} for every column accepted by {@code isValidRecord}; when filter expressions are active,
 * one more tuple is added per data purifier whose filter result is true, using the column index
 * {@code (j + 1) * size + i}.
 *
 * @param input the record to expand
 * @return the bag of generated tuples, or {@code null} when the record is skipped
 * @throws IOException declared by the signature; may be raised while reading fields from {@code input}
 * @throws ShifuException with {@code ERROR_NO_EQUAL_COLCONFIG} once the number of size-mismatched records
 *         exceeds {@code MAX_MISMATCH_CNT}
 */
@SuppressWarnings("deprecation")
@Override
public DataBag exec(Tuple input) throws IOException {
    // both factories are touched before any validation, exactly as the record-independent set-up requires
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    TupleFactory tupleFactory = TupleFactory.getInstance();
    if (input == null) {
        return null;
    }

    int size = input.size();
    boolean sizeMatches = size != 0 && input.size() == this.columnConfigList.size();
    if (!sizeMatches) {
        log.error("the input size - " + input.size() + ", while column size - " + columnConfigList.size());
        this.mismatchCnt++;
        // malformed records are tolerated (skipped) until the mismatch budget is exhausted
        if (this.mismatchCnt > MAX_MISMATCH_CNT) {
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }
        return null;
    }

    Object rawTag = input.get(tagColumnNum);
    if (rawTag == null) {
        log.error("tagColumnNum is " + tagColumnNum + "; input size is " + input.size() + "; columnConfigList.size() is " + columnConfigList.size() + "; tuple is" + input.toDelimitedString("|") + "; tag is " + rawTag);
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    String tag = CommonUtils.trimTag(rawTag.toString());
    // a linear target needs a numeric tag; any other target needs a tag from the known tag set
    boolean tagAccepted = this.isLinearTarget ? NumberUtils.isNumber(tag) : super.tagSet.contains(tag);
    if (!tagAccepted) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    Double rate = modelConfig.getBinningSampleRate();
    boolean sampleNegativesOnly = !this.isLinearTarget && !modelConfig.isClassification() && modelConfig.isBinningSampleNegOnly();
    // in negative-only mode just the negative-tagged records go through sampling; otherwise every record does.
    // The random draw happens only for records that are subject to sampling.
    boolean subjectToSampling = !sampleNegativesOnly || super.negTagSet.contains(tag);
    if (subjectToSampling && random.nextDouble() > rate) {
        return null;
    }

    // evaluate every filter expression once per record, not once per column
    List<Boolean> filterResultList = null;
    if (this.isForExpressions) {
        filterResultList = new ArrayList<Boolean>();
        for (DataPurifier purifier : this.dataPurifiers) {
            filterResultList.add(purifier.isFilter(input));
        }
    }

    boolean isPositiveInst = (modelConfig.isRegression() && super.posTagSet.contains(tag));
    for (int col = 0; col < size; col++) {
        ColumnConfig config = columnConfigList.get(col);
        if (isValidRecord(modelConfig.isRegression(), isPositiveInst, config)) {
            bag.add(buildTuple(input, tupleFactory, tag, col, col));
            if (this.isForExpressions) {
                int purifierCount = this.dataPurifiers.size();
                for (int seg = 0; seg < purifierCount; seg++) {
                    if (Boolean.TRUE.equals(filterResultList.get(seg))) {
                        // the tuple for a matching expression uses a column index shifted by one block per expression
                        bag.add(buildTuple(input, tupleFactory, tag, col, (seg + 1) * size + col));
                    }
                }
            }
        }
    }
    return bag;
}
Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) DataBag(org.apache.pig.data.DataBag) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) TupleFactory(org.apache.pig.data.TupleFactory) ShifuException(ml.shifu.shifu.exception.ShifuException)

Aggregations

ShifuException (ml.shifu.shifu.exception.ShifuException) 39, IOException (java.io.IOException) 22, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType) 12, HashMap (java.util.HashMap) 8, ArrayList (java.util.ArrayList) 5, ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig) 5, File (java.io.File) 4, Scanner (java.util.Scanner) 4, Path (org.apache.hadoop.fs.Path) 4, SourceFile (ml.shifu.shifu.fs.SourceFile) 3, JobStats (org.apache.pig.tools.pigstats.JobStats) 3, BufferedReader (java.io.BufferedReader) 2, ConfusionMatrixObject (ml.shifu.shifu.container.ConfusionMatrixObject) 2, EvalConfig (ml.shifu.shifu.container.obj.EvalConfig) 2, RawSourceData (ml.shifu.shifu.container.obj.RawSourceData) 2, AbstractStatsExecutor (ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor) 2, AkkaStatsWorker (ml.shifu.shifu.core.processor.stats.AkkaStatsWorker) 2, DIBStatsExecutor (ml.shifu.shifu.core.processor.stats.DIBStatsExecutor) 2, MunroPatIStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor) 2, MunroPatStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor) 2