Search in sources :

Example 1 with DIBStatsExecutor

use of ml.shifu.shifu.core.processor.stats.DIBStatsExecutor in project shifu by ShifuML.

In class StatsModelProcessor, method run:

/**
 * Runs the stats step.
 * <p>
 * Exactly one of four modes is executed, chosen from {@code params} in this order:
 * correlation ({@code Constants.IS_COMPUTE_CORR}), PSI ({@code Constants.IS_COMPUTE_PSI}),
 * re-binning ({@code Constants.IS_REBIN}), or - when none of those flags is set - the
 * regular statistics computation. Afterwards the current column config is backed up
 * locally, data is synced to HDFS again and the step is cleaned up.
 *
 * @return {@code 0} when the step finishes; {@code -1} when the mean check fails in
 *         correlation/PSI mode or when any exception is caught while running the step
 * @throws Exception declared by the signature; exceptions raised inside the step body are
 *         logged and reported through the return code instead
 */
@Override
public int run() throws Exception {
    log.info("Step Start: stats");
    final long beginMillis = System.currentTimeMillis();
    try {
        // prepare the step, then push the current configuration to HDFS
        setUp(ModelStep.STATS);
        syncDataToHdfs(modelConfig.getDataSet().getSource());

        if (getBooleanParam(this.params, Constants.IS_COMPUTE_CORR)) {
            // correlation needs the means produced by a previous plain stats run
            if (!isMeanCalculated()) {
                log.warn("Some mean value of column is null, could you check if you run 'shifu stats'.");
                return -1;
            }

            log.info("Start computing correlation value ...");
            SourceType dataSource = this.modelConfig.getDataSet().getSource();
            String correlationPath = super.getPathFinder().getCorrelationPath(dataSource);

            // an existing correlation output is only reused when explicitly enabled
            boolean reuseEnabled = Environment.getBoolean("shifu.stats.corr.reuse", Boolean.FALSE);
            if (!reuseEnabled || !ShifuFileUtils.isFileExists(correlationPath, SourceType.HDFS)) {
                runCorrMapReduceJob();
            } else {
                dumpAndCalculateCorrelationResult(dataSource, correlationPath);
            }

            saveColumnConfigList();
        } else if (getBooleanParam(this.params, Constants.IS_COMPUTE_PSI)) {
            if (!isMeanCalculated()) {
                log.warn("Some mean value of column is null, could you check if you run 'shifu stats'.");
                return -1;
            }

            if (StringUtils.isNotEmpty(modelConfig.getPsiColumnName())) {
                MapReducerStatsWorker psiWorker = new MapReducerStatsWorker(this, modelConfig, columnConfigList);
                psiWorker.runPSI();
                // persist the column config only after PSI completed
                saveColumnConfigList();
            } else {
                log.warn("To Run PSI please set your PSI column in dataSet::psiColumnName.");
            }
        } else if (getBooleanParam(this.params, Constants.IS_REBIN)) {
            String backupPath = this.pathFinder.getBackupColumnConfig();
            if (ShifuFileUtils.isFileExists(new Path(backupPath), SourceType.LOCAL)) {
                // a backup ColumnConfig.json exists: take the binning of every matching column from it
                List<ColumnConfig> backedUpConfigs = CommonUtils.loadColumnConfigList(backupPath, SourceType.LOCAL, false);
                for (ColumnConfig saved : backedUpConfigs) {
                    for (ColumnConfig current : this.columnConfigList) {
                        if (NSColumnUtils.isColumnEqual(saved.getColumnName(), current.getColumnName())) {
                            current.setColumnBinning(saved.getColumnBinning());
                        }
                    }
                }
            } else {
                // no backup yet: create one from the current column config
                ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
                saveColumnConfigList(backupPath, this.columnConfigList);
            }

            boolean candidatesDeclared = CommonUtils.hasCandidateColumns(this.columnConfigList);
            List<ColumnConfig> toRebin = new ArrayList<ColumnConfig>();
            List<String> requestedVars = getStringList(this.params, Constants.REQUEST_VARS, ",");
            boolean noExplicitRequest = CollectionUtils.isEmpty(requestedVars);

            // first collect every eligible column ...
            for (ColumnConfig candidate : this.columnConfigList) {
                if (!noExplicitRequest && !isRequestColumn(requestedVars, candidate)) {
                    continue;
                }
                if (!CommonUtils.isGoodCandidate(candidate, candidatesDeclared)) {
                    log.warn("Column - {} is not a good candidate. Skip it.", candidate.getColumnName());
                    continue;
                }
                toRebin.add(candidate);
            }

            // ... then re-bin them one by one
            for (ColumnConfig selected : toRebin) {
                doReBin(selected);
            }

            // the merged ColumnConfig.json replaces the current one
            saveColumnConfigList();
        } else {
            AbstractStatsExecutor executor;
            if (modelConfig.isMapReduceRunMode()) {
                ModelStatsConf.BinningAlgorithm algorithm = modelConfig.getBinningAlgorithm();
                if (algorithm.equals(ModelStatsConf.BinningAlgorithm.DynamicBinning)) {
                    executor = new DIBStatsExecutor(this, modelConfig, columnConfigList);
                } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.MunroPat)) {
                    executor = new MunroPatStatsExecutor(this, modelConfig, columnConfigList);
                } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.MunroPatI)) {
                    executor = new MunroPatIStatsExecutor(this, modelConfig, columnConfigList);
                } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.SPDT)) {
                    executor = new SPDTStatsExecutor(this, modelConfig, columnConfigList);
                } else {
                    // SPDTI is both an explicit choice and the fallback for any other algorithm
                    executor = new SPDTIStatsExecutor(this, modelConfig, columnConfigList);
                }
            } else if (modelConfig.isLocalRunMode()) {
                executor = new AkkaStatsWorker(this, modelConfig, columnConfigList);
            } else {
                throw new ShifuException(ShifuErrorCode.ERROR_UNSUPPORT_MODE);
            }
            executor.doStats();

            // refresh the backup ColumnConfig.json with the freshly computed stats
            String backupPath = this.pathFinder.getBackupColumnConfig();
            ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, SourceType.LOCAL));
            saveColumnConfigList(backupPath, this.columnConfigList);
        }

        // every mode that reaches this point gets a timestamped local backup of the column config
        String backupSuffix = SDF.format(new Date());
        this.backupCurrentColumnConfigToLocal(backupSuffix);
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.STATS);
    } catch (ShifuException shifuError) {
        log.error("Error:" + shifuError.getError().toString() + "; msg:" + shifuError.getMessage(), shifuError);
        return -1;
    } catch (Exception unexpected) {
        log.error("Error:" + unexpected.getMessage(), unexpected);
        return -1;
    }
    log.info("Step Finished: stats with {} ms", (System.currentTimeMillis() - beginMillis));
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) MapReducerStatsWorker(ml.shifu.shifu.core.processor.stats.MapReducerStatsWorker) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) ArrayList(java.util.ArrayList) AbstractStatsExecutor(ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor) AkkaStatsWorker(ml.shifu.shifu.core.processor.stats.AkkaStatsWorker) Date(java.util.Date) ShifuException(ml.shifu.shifu.exception.ShifuException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) JexlException(org.apache.commons.jexl2.JexlException) IOException(java.io.IOException) DIBStatsExecutor(ml.shifu.shifu.core.processor.stats.DIBStatsExecutor) MunroPatIStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor) SPDTIStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTIStatsExecutor) SPDTStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTStatsExecutor) MunroPatStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor) SourceFile(ml.shifu.shifu.fs.SourceFile) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 2 with DIBStatsExecutor

use of ml.shifu.shifu.core.processor.stats.DIBStatsExecutor in project shifu by ShifuML.

In class StatsStep, method process:

/*
 * (non-Javadoc)
 *
 * Runs the stats step: refreshes the column config flags, writes ModelConfig and
 * ColumnConfig to their local paths (copying them to HDFS when the data set source is
 * HDFS), runs the stats executor matching the run mode and binning algorithm, and copies
 * the configuration to HDFS once more afterwards.
 *
 * Any exception raised on the way is logged and not rethrown, so the column config list
 * is returned in either case.
 *
 * @see ml.shifu.common.Step#process()
 */
@Override
public List<ColumnConfig> process() throws IOException {
    LOG.info("Step Start: stats");
    final long beginMillis = System.currentTimeMillis();
    try {
        // variable types may have been edited by the user after `shifu init`
        ColumnConfigUpdater.updateColumnConfigFlags(this.modelConfig, this.columnConfigList, ModelStep.STATS);

        LOG.info("Saving ModelConfig, ColumnConfig and then upload to HDFS ...");
        File localModelConfig = new File(pathFinder.getModelConfigPath(SourceType.LOCAL));
        JSONUtils.writeValue(localModelConfig, modelConfig);
        File localColumnConfig = new File(pathFinder.getColumnConfigPath(SourceType.LOCAL));
        JSONUtils.writeValue(localColumnConfig, columnConfigList);
        if (SourceType.HDFS.equals(modelConfig.getDataSet().getSource())) {
            CommonUtils.copyConfFromLocalToHDFS(modelConfig, this.pathFinder);
        }

        AbstractStatsExecutor executor;
        if (modelConfig.isMapReduceRunMode()) {
            ModelStatsConf.BinningAlgorithm algorithm = modelConfig.getBinningAlgorithm();
            // the first comparison runs before the processor is built, so a missing
            // algorithm still fails ahead of any processor construction
            boolean dynamicBinning = algorithm.equals(ModelStatsConf.BinningAlgorithm.DynamicBinning);
            BasicModelProcessor processor = new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs);
            if (dynamicBinning) {
                executor = new DIBStatsExecutor(processor, modelConfig, columnConfigList);
            } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.MunroPat)) {
                executor = new MunroPatStatsExecutor(processor, modelConfig, columnConfigList);
            } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.MunroPatI)) {
                executor = new MunroPatIStatsExecutor(processor, modelConfig, columnConfigList);
            } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.SPDT)) {
                executor = new SPDTStatsExecutor(processor, modelConfig, columnConfigList);
            } else {
                // SPDTI is both an explicit choice and the fallback for any other algorithm
                executor = new SPDTIStatsExecutor(processor, modelConfig, columnConfigList);
            }
        } else if (modelConfig.isLocalRunMode()) {
            BasicModelProcessor processor = new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs);
            executor = new AkkaStatsWorker(processor, modelConfig, columnConfigList);
        } else {
            throw new ShifuException(ShifuErrorCode.ERROR_UNSUPPORT_MODE);
        }
        executor.doStats();

        // copy the configuration to HDFS again now that stats have run
        if (SourceType.HDFS.equals(modelConfig.getDataSet().getSource())) {
            CommonUtils.copyConfFromLocalToHDFS(modelConfig, this.pathFinder);
        }
    } catch (Exception failure) {
        LOG.error("Error:", failure);
    }
    LOG.info("Step Finished: stats with {} ms", (System.currentTimeMillis() - beginMillis));
    return columnConfigList;
}
Also used : DIBStatsExecutor(ml.shifu.shifu.core.processor.stats.DIBStatsExecutor) MunroPatIStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor) SPDTIStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTIStatsExecutor) BasicModelProcessor(ml.shifu.shifu.core.processor.BasicModelProcessor) SPDTStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTStatsExecutor) AbstractStatsExecutor(ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor) MunroPatStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor) AkkaStatsWorker(ml.shifu.shifu.core.processor.stats.AkkaStatsWorker) File(java.io.File) ShifuException(ml.shifu.shifu.exception.ShifuException) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Aggregations

IOException (java.io.IOException)2 AbstractStatsExecutor (ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor)2 AkkaStatsWorker (ml.shifu.shifu.core.processor.stats.AkkaStatsWorker)2 DIBStatsExecutor (ml.shifu.shifu.core.processor.stats.DIBStatsExecutor)2 MunroPatIStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor)2 MunroPatStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor)2 SPDTIStatsExecutor (ml.shifu.shifu.core.processor.stats.SPDTIStatsExecutor)2 SPDTStatsExecutor (ml.shifu.shifu.core.processor.stats.SPDTStatsExecutor)2 ShifuException (ml.shifu.shifu.exception.ShifuException)2 File (java.io.File)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)1 SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType)1 BasicModelProcessor (ml.shifu.shifu.core.processor.BasicModelProcessor)1 MapReducerStatsWorker (ml.shifu.shifu.core.processor.stats.MapReducerStatsWorker)1 SourceFile (ml.shifu.shifu.fs.SourceFile)1 JexlException (org.apache.commons.jexl2.JexlException)1 Path (org.apache.hadoop.fs.Path)1