Search in sources :

Example 1 with ModelSpec

use of ml.shifu.shifu.core.model.ModelSpec in project shifu by ShifuML.

the class EvalModelProcessor method validateEvalColumnConfig.

/**
 * Validates that the columns of the evaluation data set cover what the models and the eval
 * configuration require.
 * <p>
 * Column names come from the header file when a header path is configured. Otherwise the first
 * line of the data path is inspected: if it contains the target column name it is treated as a
 * header line, else columns are named by their index ("0", "1", ...). When segment filter
 * expressions are configured, every column name is additionally registered with a
 * {@code _<segment index>} suffix (1-based).
 * <p>
 * Nothing is validated when no column config list is loaded, and validation stops after the
 * name collection step for the GENERIC and TENSORFLOW algorithms.
 *
 * @param evalConfig
 *            the evaluation configuration to validate
 * @throws IOException
 *             declared for the header / data reading and model loading helpers
 * @throws IllegalArgumentException
 *             if the detected header and the first data line differ in length, or if the
 *             configured target or weight column is not among the evaluation columns
 */
@SuppressWarnings("deprecation")
private void validateEvalColumnConfig(EvalConfig evalConfig) throws IOException {
    if (this.columnConfigList == null) {
        return;
    }

    boolean headerPathConfigured = StringUtils.isNotBlank(evalConfig.getDataSet().getHeaderPath());

    // header delimiter falls back to the data delimiter when it is not configured
    String headerDelimiter = evalConfig.getDataSet().getHeaderDelimiter();
    if (StringUtils.isBlank(headerDelimiter)) {
        headerDelimiter = evalConfig.getDataSet().getDataDelimiter();
    }

    String[] evalColumnNames;
    if (headerPathConfigured) {
        evalColumnNames = CommonUtils.getHeaders(evalConfig.getDataSet().getHeaderPath(), headerDelimiter, evalConfig.getDataSet().getSource());
    } else {
        String[] firstLine = CommonUtils.takeFirstLine(evalConfig.getDataSet().getDataPath(), headerDelimiter, evalConfig.getDataSet().getSource());
        // if first line contains target column name, we guess it is csv format and first line is header.
        String expectedTarget;
        if (StringUtils.isBlank(evalConfig.getDataSet().getTargetColumnName())) {
            expectedTarget = modelConfig.getTargetColumnName();
        } else {
            expectedTarget = evalConfig.getDataSet().getTargetColumnName();
        }

        boolean firstLineIsHeader = StringUtils.join(firstLine, "").contains(expectedTarget);
        if (firstLineIsHeader) {
            // the first real data row is the second line of the file, right after the header line
            String[] firstDataRow = CommonUtils.takeFirstTwoLines(evalConfig.getDataSet().getDataPath(), headerDelimiter, evalConfig.getDataSet().getSource())[1];
            if (firstDataRow != null && firstLine.length != firstDataRow.length) {
                throw new IllegalArgumentException("Eval header length and eval data length are not consistent, please check you header setting and data set setting in eval.");
            }
            // normalize the header names in place, the same array becomes the column name list
            evalColumnNames = firstLine;
            for (int col = 0; col < evalColumnNames.length; col++) {
                evalColumnNames[col] = CommonUtils.normColumnName(evalColumnNames[col]);
            }
            LOG.warn("No header path is provided, we will try to read first line and detect schema.");
            LOG.warn("Schema in ColumnConfig.json are named as first line of data set path.");
        } else {
            LOG.warn("No header path is provided, we will try to read first line and detect schema.");
            LOG.warn("Schema in ColumnConfig.json are named as  index 0, 1, 2, 3 ...");
            LOG.warn("Please make sure weight column and tag column are also taking index as name.");
            evalColumnNames = new String[firstLine.length];
            for (int col = 0; col < firstLine.length; col++) {
                evalColumnNames[col] = String.valueOf(col);
            }
        }
    }

    Set<NSColumn> names = new HashSet<NSColumn>();
    for (String columnName : evalColumnNames) {
        names.add(new NSColumn(columnName));
    }

    // every segment filter expression contributes a suffixed copy of each column name
    String filterExpressions = super.modelConfig.getSegmentFilterExpressionsAsString();
    if (StringUtils.isNotBlank(filterExpressions)) {
        int segmentCount = CommonUtils.split(filterExpressions, Constants.SHIFU_STATS_FILTER_EXPRESSIONS_DELIMETER).length;
        for (int segment = 1; segment <= segmentCount; segment++) {
            for (String columnName : evalColumnNames) {
                names.add(new NSColumn(columnName + "_" + segment));
            }
        }
    }

    String algorithm = modelConfig.getAlgorithm();
    if (Constants.GENERIC.equalsIgnoreCase(algorithm) || Constants.TENSORFLOW.equalsIgnoreCase(algorithm)) {
        // TODO correct this logic
        return;
    }

    // the main model columns are only checked when at least one model could be loaded
    List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, SourceType.LOCAL, evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
    if (CollectionUtils.isNotEmpty(models)) {
        validateFinalColumns(evalConfig, this.modelConfig.getModelSetName(), false, this.columnConfigList, names);
    }

    // NOTE(review): the NSColumn is built before the blank check below - confirm that its
    // constructor tolerates a null / blank name.
    NSColumn targetColumn = new NSColumn(evalConfig.getDataSet().getTargetColumnName());
    boolean targetMissing = StringUtils.isNotBlank(evalConfig.getDataSet().getTargetColumnName())
            && !(names.contains(targetColumn) || names.contains(new NSColumn(targetColumn.getSimpleName())));
    if (targetMissing) {
        throw new IllegalArgumentException("Target column " + evalConfig.getDataSet().getTargetColumnName() + " does not exist in - " + evalConfig.getDataSet().getHeaderPath());
    }

    NSColumn weightColumn = new NSColumn(evalConfig.getDataSet().getWeightColumnName());
    boolean weightMissing = StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())
            && !(names.contains(weightColumn) || names.contains(new NSColumn(weightColumn.getSimpleName())));
    if (weightMissing) {
        throw new IllegalArgumentException("Weight column " + evalConfig.getDataSet().getWeightColumnName() + " does not exist in - " + evalConfig.getDataSet().getHeaderPath());
    }

    // each sub-model is validated against its own column config list
    List<ModelSpec> subModels = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, SourceType.LOCAL, evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
    if (CollectionUtils.isNotEmpty(subModels)) {
        for (ModelSpec subModel : subModels) {
            validateFinalColumns(evalConfig, subModel.getModelName(), true, subModel.getColumnConfigList(), names);
        }
    }
}
Also used : BasicML(org.encog.ml.BasicML) ModelSpec(ml.shifu.shifu.core.model.ModelSpec) NSColumn(ml.shifu.shifu.column.NSColumn) HashSet(java.util.HashSet)

Example 2 with ModelSpec

use of ml.shifu.shifu.core.model.ModelSpec in project shifu by ShifuML.

the class EvalScoreUDF method exec.

/**
 * Scores one input record with the main models and any sub-models, and returns the result as a
 * tuple.
 * <p>
 * The output tuple is built in this order: tag, weight, model scores (layout depends on whether
 * the target is linear / regression or classification), then the meta column values.
 * <p>
 * The model runner is created lazily on the first call instead of in the constructor.
 *
 * @param input
 *            the raw record to score
 * @return the score tuple, or {@code null} when the record is skipped: it looks like the CSV
 *         header line, it converts to an empty column map, or the model runner yields no result
 * @throws IOException
 *             declared for the tuple access and model loading helpers
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        // skip the record when its first field, once normalized, equals the first header name:
        // it is taken to be the header line of a csv file
        String firstCol = ((input.get(0) == null) ? "" : input.get(0).toString());
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }
    // wall-clock start, only used by the sampled "running time" log at the end
    long start = System.currentTimeMillis();
    if (this.modelRunner == null) {
        // here to initialize modelRunner, this is moved from constructor to here to avoid OOM in client side.
        // UDF in pig client will be initialized to get some metadata issues
        List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);
        // register every sub-model on the runner and remember how many models each one holds
        List<ModelSpec> subModels = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        if (CollectionUtils.isNotEmpty(subModels)) {
            for (ModelSpec modelSpec : subModels) {
                this.modelRunner.addSubModels(modelSpec, this.isMultiThreadScoring);
                this.subModelsCnt.put(modelSpec.getModelName(), modelSpec.getModels().size());
            }
        }
        this.modelCnt = models.size();
        // reset models in classfication case
        if (modelConfig.isClassification()) {
            if (modelConfig.getTrain().isOneVsAll()) {
                if (modelConfig.getTags().size() == 2) {
                    // onevsall, modelcnt is 1
                    this.modelCnt = 1;
                } else {
                    // one model per tag
                    this.modelCnt = modelConfig.getTags().size();
                }
            } else {
                if (modelConfig.getTags().size() == 2) {
                    // native binary
                    this.modelCnt = 1;
                } else {
                    // native multiple classification model cnt is bagging num
                    // (capped: never more than the number of models actually loaded)
                    this.modelCnt = (this.modelCnt >= modelConfig.getBaggingNum() ? modelConfig.getBaggingNum() : this.modelCnt);
                }
            }
            // reset models to the first modelCnt loaded models and rebuild the runner on them
            // NOTE(review): subList throws IndexOutOfBoundsException when modelCnt exceeds
            // models.size() (e.g. no model loaded, or fewer models than tags) - confirm this
            // cannot happen.
            // NOTE(review): the runner created here replaces the one the sub-models were added
            // to above, and addSubModels is not called again - confirm sub-models are meant to
            // be dropped in the classification case.
            models = models.subList(0, this.modelCnt);
            this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);
        }
        this.modelRunner.setScoreScale(Integer.parseInt(this.scale));
        log.info("DEBUG: model cnt " + this.modelCnt + " sub models cnt " + modelRunner.getSubModelsCnt());
    }
    // map the raw record onto column names; an empty map means there is nothing to score
    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }
    String tag = CommonUtils.trimTag(rawDataNsMap.get(new NSColumn(modelConfig.getTargetColumnName(evalConfig))));
    // filter invalid tag record out
    // disable the tag check, since there is no bad tag in eval data set
    // and user just want to score the data, but don't run performance evaluation
    /*
         * if(!tagSet.contains(tag)) {
         * if(System.currentTimeMillis() % 100 == 0) {
         * log.warn("Invalid tag: " + tag);
         * }
         * if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
         * PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_RECORDS)
         * .increment(1);
         * }
         * return null;
         * }
         */
    // time the scoring call itself; nanoseconds / 1000 gives microseconds
    long startTime = System.nanoTime();
    CaseScoreResult cs = modelRunner.computeNsData(rawDataNsMap);
    long runInterval = (System.nanoTime() - startTime) / 1000L;
    if (cs == null) {
        // sampled logging: only warn when the current millisecond is a multiple of 100
        if (System.currentTimeMillis() % 100 == 0) {
            log.warn("Get null result, for input: " + input.toDelimitedString("|"));
        }
        return null;
    }
    // output fields are appended in a fixed order: tag, weight, scores, meta columns
    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(tag);
    // weight comes from the configured weight column, or defaults to "1.0" when none is set
    // NOTE(review): the map lookup may yield null if the weight column is absent from the
    // record - confirm downstream handling.
    String weight = null;
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) {
        weight = rawDataNsMap.get(new NSColumn(evalConfig.getDataSet().getWeightColumnName()));
    } else {
        weight = "1.0";
    }
    incrementTagCounters(tag, weight, runInterval);
    Map<String, CaseScoreResult> subModelScores = cs.getSubModelScores();
    tuple.append(weight);
    if (this.isLinearTarget || modelConfig.isRegression()) {
        // linear target / regression: main model scores, optional hidden layer output, then
        // the scores of every sub-model
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendModelScore(tuple, cs, true);
            if (this.outputHiddenLayerIndex != 0) {
                appendFirstHiddenOutputScore(tuple, cs.getHiddenLayerScores(), true);
            }
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, CaseScoreResult> entry = iterator.next();
                CaseScoreResult subCs = entry.getValue();
                appendModelScore(tuple, subCs, false);
            }
        }
    } else {
        // classification: plain scores followed by the predicted tag, then sub-model scores
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendSimpleScore(tuple, cs);
            tuple.append(this.mcPredictor.predictTag(cs).getTag());
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, CaseScoreResult> entry = iterator.next();
                CaseScoreResult subCs = entry.getValue();
                appendSimpleScore(tuple, subCs);
            }
        }
    }
    // append meta data
    List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
    if (CollectionUtils.isNotEmpty(metaColumns)) {
        for (String meta : metaColumns) {
            tuple.append(rawDataNsMap.get(new NSColumn(meta)));
        }
    }
    // sampled logging: only report when the current millisecond is a multiple of 1000
    if (System.currentTimeMillis() % 1000 == 0L) {
        log.info("running time is " + (System.currentTimeMillis() - start) + " ms.");
    }
    return tuple;
}
Also used : BasicML(org.encog.ml.BasicML) CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) Entry(java.util.Map.Entry) ModelSpec(ml.shifu.shifu.core.model.ModelSpec) Map(java.util.Map) SortedMap(java.util.SortedMap) Tuple(org.apache.pig.data.Tuple) ModelRunner(ml.shifu.shifu.core.ModelRunner) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 3 with ModelSpec

use of ml.shifu.shifu.core.model.ModelSpec in project shifu by ShifuML.

the class ModelSpecLoaderUtils method loadSubModels.

/**
 * Load sub-models under current model space.
 * <p>
 * Every directory directly under the models path is treated as a sub-model candidate; a
 * directory contributes a {@link ModelSpec} only when one can be loaded from it. The models
 * path is taken from {@code evalConfig} when it defines one, otherwise from the default
 * location resolved for {@code modelConfig} and {@code sourceType}.
 * <p>
 * Loading is best-effort: an {@link IOException} is logged and the sub-models collected so far
 * (possibly none) are returned. A models path that cannot be listed at all yields an empty list.
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS
 * @param columnConfigList
 *            - List of {@link ColumnConfig}
 * @param evalConfig
 *            - {@link EvalConfig}, maybe null
 * @param sourceType
 *            - {@link SourceType}, HDFS or Local?
 * @param gbtConvertToProb
 *            - convert to probability or not for gbt model
 * @param gbtScoreConvertStrategy
 *            - gbt score conversion strategy
 * @return list of {@link ModelSpec} for sub models, never null
 */
@SuppressWarnings("deprecation")
public static List<ModelSpec> loadSubModels(ModelConfig modelConfig, List<ColumnConfig> columnConfigList, EvalConfig evalConfig, RawSourceData.SourceType sourceType, Boolean gbtConvertToProb, String gbtScoreConvertStrategy) {
    List<ModelSpec> modelSpecs = new ArrayList<ModelSpec>();
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    // we have to register PersistBasicFloatNetwork for loading such models
    PersistorRegistry.getInstance().add(new PersistBasicFloatNetwork());
    PathFinder pathFinder = new PathFinder(modelConfig);
    String modelsPath = null;
    if (evalConfig == null || StringUtils.isEmpty(evalConfig.getModelsPath())) {
        modelsPath = pathFinder.getModelsPath(sourceType);
    } else {
        modelsPath = evalConfig.getModelsPath();
    }
    try {
        FileStatus[] fsArr = fs.listStatus(new Path(modelsPath));
        if (fsArr == null) {
            // Older Hadoop releases return null instead of throwing when the path does not
            // exist; treat it like the IOException path and report "no sub-models" rather
            // than failing with a NullPointerException in the loop below.
            return modelSpecs;
        }
        for (FileStatus fileStatus : fsArr) {
            // only directories can hold a sub-model; plain files belong to the main model
            if (fileStatus.isDir()) {
                ModelSpec modelSpec = loadSubModelSpec(modelConfig, columnConfigList, fileStatus, sourceType, gbtConvertToProb, gbtScoreConvertStrategy);
                if (modelSpec != null) {
                    modelSpecs.add(modelSpec);
                }
            }
        }
    } catch (IOException e) {
        // deliberate best-effort: keep whatever was loaded before the failure
        log.error("Error occurred when loading sub-models.", e);
    }
    return modelSpecs;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) PathFinder(ml.shifu.shifu.fs.PathFinder) ModelSpec(ml.shifu.shifu.core.model.ModelSpec) PersistBasicFloatNetwork(ml.shifu.shifu.core.dtrain.dataset.PersistBasicFloatNetwork)

Example 4 with ModelSpec

use of ml.shifu.shifu.core.model.ModelSpec in project shifu by ShifuML.

the class ModelSpecLoaderUtils method loadSubModelSpec.

/**
 * Load one sub-model from its directory.
 * <p>
 * The directory name becomes the sub-model name. Model files are loaded in ascending order of
 * their file names. When the directory carries its own model config and / or column config,
 * those are loaded and used for the sub-model; otherwise the ones passed in by the caller are
 * reused.
 *
 * @param modelConfig
 *            - {@link ModelConfig}, need this, since the model file may exist in HDFS; also the
 *            fallback config when the sub-model has none of its own
 * @param columnConfigList
 *            - List of {@link ColumnConfig}, the fallback when the sub-model has none of its own
 * @param fileStatus
 *            - {@link FileStatus} of the sub-model directory, must not be null
 * @param sourceType
 *            - {@link SourceType}, HDFS or Local?
 * @param gbtConvertToProb
 *            - convert to probability or not for gbt model
 * @param gbtScoreConvertStrategy
 *            - gbt score conversion strategy
 * @return {@link ModelSpec} for sub-model, or null when the directory holds no model files
 * @throws IOException
 *             if reading the sub-model files or configs fails
 */
private static ModelSpec loadSubModelSpec(ModelConfig modelConfig, List<ColumnConfig> columnConfigList, FileStatus fileStatus, RawSourceData.SourceType sourceType, Boolean gbtConvertToProb, String gbtScoreConvertStrategy) throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    String subModelName = fileStatus.getPath().getName();

    // output holders for the directory scan: the model files, plus a two-slot array that is
    // read below as [0] = model config file, [1] = column config file
    List<FileStatus> modelFiles = new ArrayList<FileStatus>();
    FileStatus[] configFiles = new FileStatus[2];
    ALGORITHM algorithm = getModelsAlgAndSpecFiles(fileStatus, sourceType, modelFiles, configFiles);

    if (!CollectionUtils.isNotEmpty(modelFiles)) {
        // nothing to load from this directory
        return null;
    }

    // load the models in a stable order: ascending by file name
    Collections.sort(modelFiles, new Comparator<FileStatus>() {

        @Override
        public int compare(FileStatus left, FileStatus right) {
            String leftName = left.getPath().getName();
            String rightName = right.getPath().getName();
            return leftName.compareTo(rightName);
        }
    });

    List<BasicML> models = new ArrayList<BasicML>();
    for (FileStatus modelFile : modelFiles) {
        BasicML model = loadModel(modelConfig, modelFile.getPath(), fs, gbtConvertToProb, gbtScoreConvertStrategy);
        models.add(model);
    }

    // prefer the sub-model's own configs, fall back to the caller's
    ModelConfig subModelConfig = (configFiles[0] == null)
            ? modelConfig
            : CommonUtils.loadModelConfig(configFiles[0].getPath().toString(), sourceType);
    List<ColumnConfig> subColumnConfigList = (configFiles[1] == null)
            ? columnConfigList
            : CommonUtils.loadColumnConfigList(configFiles[1].getPath().toString(), sourceType);

    return new ModelSpec(subModelName, subModelConfig, subColumnConfigList, algorithm, models);
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) ALGORITHM(ml.shifu.shifu.container.obj.ModelTrainConf.ALGORITHM) BasicML(org.encog.ml.BasicML) FileSystem(org.apache.hadoop.fs.FileSystem) ModelSpec(ml.shifu.shifu.core.model.ModelSpec)

Aggregations

ModelSpec (ml.shifu.shifu.core.model.ModelSpec)4 BasicML (org.encog.ml.BasicML)3 NSColumn (ml.shifu.shifu.column.NSColumn)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 HashSet (java.util.HashSet)1 Map (java.util.Map)1 Entry (java.util.Map.Entry)1 SortedMap (java.util.SortedMap)1 CaseScoreResult (ml.shifu.shifu.container.CaseScoreResult)1 ALGORITHM (ml.shifu.shifu.container.obj.ModelTrainConf.ALGORITHM)1 ModelRunner (ml.shifu.shifu.core.ModelRunner)1 PersistBasicFloatNetwork (ml.shifu.shifu.core.dtrain.dataset.PersistBasicFloatNetwork)1 PathFinder (ml.shifu.shifu.fs.PathFinder)1 Path (org.apache.hadoop.fs.Path)1 Tuple (org.apache.pig.data.Tuple)1