Search in sources :

Example 1 with ModelRunner

use of ml.shifu.shifu.core.ModelRunner in project shifu by ShifuML.

The `exec` method of the class EvalScoreUDF.

/**
 * Scores a single input record with the main models and any sub-models and
 * returns the result as an output tuple.
 *
 * <p>The output tuple is laid out as: tag, weight, main-model scores (plus the
 * predicted tag for non-regression targets, or hidden-layer output when
 * {@code outputHiddenLayerIndex != 0} for regression/linear targets),
 * sub-model scores, then the configured meta columns.
 *
 * <p>The {@code ModelRunner} is created lazily on the first call. It is built
 * exactly once, from the final (possibly truncated) model list, and sub-models
 * are attached to that same instance so that they are not lost in the
 * classification case.
 *
 * @param input the raw input record
 * @return the scored tuple, or {@code null} when the record is the CSV header
 *         row, when no raw data could be extracted from it, or when the model
 *         runner produced no result
 * @throws IOException if reading the input tuple or loading the models fails
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        String firstCol = ((input.get(0) == null) ? "" : input.get(0).toString());
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }
    long start = System.currentTimeMillis();
    if (this.modelRunner == null) {
        // here to initialize modelRunner, this is moved from constructor to here to avoid OOM in client side.
        // UDF in pig client will be initialized to get some metadata issues
        List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        List<ModelSpec> subModels = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());

        this.modelCnt = models.size();
        // reset models in classification case
        if (modelConfig.isClassification()) {
            if (modelConfig.getTags().size() == 2) {
                // binary target (one-vs-all or native): a single model is used
                this.modelCnt = 1;
            } else if (modelConfig.getTrain().isOneVsAll()) {
                // one-vs-all multi-class: one model per tag
                this.modelCnt = modelConfig.getTags().size();
            } else {
                // native multiple classification model cnt is bagging num
                this.modelCnt = Math.min(this.modelCnt, modelConfig.getBaggingNum());
            }
            models = models.subList(0, this.modelCnt);
        }

        // Build the runner once, from the final model list. Previously a second runner replaced the
        // first one in the classification case and silently dropped the sub-models added to it.
        ModelRunner runner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);
        if (CollectionUtils.isNotEmpty(subModels)) {
            for (ModelSpec modelSpec : subModels) {
                runner.addSubModels(modelSpec, this.isMultiThreadScoring);
                this.subModelsCnt.put(modelSpec.getModelName(), modelSpec.getModels().size());
            }
        }
        runner.setScoreScale(Integer.parseInt(this.scale));
        log.info("DEBUG: model cnt " + this.modelCnt + " sub models cnt " + runner.getSubModelsCnt());
        // publish only after the runner is fully configured, so a failed init is retried on the next call
        this.modelRunner = runner;
    }
    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }
    String tag = CommonUtils.trimTag(rawDataNsMap.get(new NSColumn(modelConfig.getTargetColumnName(evalConfig))));
    // Records with an invalid tag are intentionally NOT filtered out here: the eval data set may carry
    // no usable tag at all when the user only wants scores and no performance evaluation.
    long startTime = System.nanoTime();
    CaseScoreResult cs = modelRunner.computeNsData(rawDataNsMap);
    // scoring time in microseconds
    long runInterval = (System.nanoTime() - startTime) / 1000L;
    if (cs == null) {
        // sampled logging: warn only when the current millisecond is a multiple of 100
        if (System.currentTimeMillis() % 100 == 0) {
            log.warn("Get null result, for input: " + input.toDelimitedString("|"));
        }
        return null;
    }
    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(tag);
    // weight falls back to "1.0" when no weight column is configured
    String weight;
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) {
        weight = rawDataNsMap.get(new NSColumn(evalConfig.getDataSet().getWeightColumnName()));
    } else {
        weight = "1.0";
    }
    incrementTagCounters(tag, weight, runInterval);
    Map<String, CaseScoreResult> subModelScores = cs.getSubModelScores();
    tuple.append(weight);
    if (this.isLinearTarget || modelConfig.isRegression()) {
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendModelScore(tuple, cs, true);
            if (this.outputHiddenLayerIndex != 0) {
                appendFirstHiddenOutputScore(tuple, cs.getHiddenLayerScores(), true);
            }
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subCs : subModelScores.values()) {
                appendModelScore(tuple, subCs, false);
            }
        }
    } else {
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendSimpleScore(tuple, cs);
            tuple.append(this.mcPredictor.predictTag(cs).getTag());
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subCs : subModelScores.values()) {
                appendSimpleScore(tuple, subCs);
            }
        }
    }
    // append meta data
    List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
    if (CollectionUtils.isNotEmpty(metaColumns)) {
        for (String meta : metaColumns) {
            tuple.append(rawDataNsMap.get(new NSColumn(meta)));
        }
    }
    // sampled logging of the total per-record time
    if (System.currentTimeMillis() % 1000 == 0L) {
        log.info("running time is " + (System.currentTimeMillis() - start) + " ms.");
    }
    return tuple;
}
Also used : BasicML(org.encog.ml.BasicML) CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) Entry(java.util.Map.Entry) ModelSpec(ml.shifu.shifu.core.model.ModelSpec) Map(java.util.Map) SortedMap(java.util.SortedMap) Tuple(org.apache.pig.data.Tuple) ModelRunner(ml.shifu.shifu.core.ModelRunner) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 2 with ModelRunner

use of ml.shifu.shifu.core.ModelRunner in project shifu by ShifuML.

The `setup` method of the class PostTrainMapper.

/**
 * Initializes the mapper before any record is processed: loads the
 * configuration, creates the data purifier and the reusable output
 * key/value holders, collects the flattened tags, builds the model runner
 * from the loaded basic models, and sets up the multiple-outputs writer and
 * the feature statistics.
 *
 * @param context the mapper context handed in by the framework
 * @throws IOException if loading the configuration or the models fails
 * @throws InterruptedException declared by the overridden method
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // Configuration is loaded first; the steps below read this.modelConfig.
    loadConfigFiles(context);
    loadTagWeightNum();

    this.dataPurifier = new DataPurifier(this.modelConfig, false);

    // Output holders, created once here.
    this.outputKey = new IntWritable();
    this.outputValue = new Text();

    this.tags = new HashSet<String>(this.modelConfig.getFlattenTags());

    // Load the basic models from the data set's source; no eval config is passed.
    SourceType dataSource = this.modelConfig.getDataSet().getSource();
    List<BasicML> basicModels = ModelSpecLoaderUtils.loadBasicModels(this.modelConfig, null, dataSource);

    this.headers = CommonUtils.getFinalHeaders(this.modelConfig);
    this.modelRunner = new ModelRunner(
            this.modelConfig,
            this.columnConfigList,
            this.headers,
            this.modelConfig.getDataSetDelimiter(),
            basicModels);

    // The raw cast is what the rawtypes/unchecked suppression above is for.
    this.mos = new MultipleOutputs<NullWritable, Text>((TaskInputOutputContext) context);
    this.initFeatureStats();
}
Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) TaskInputOutputContext(org.apache.hadoop.mapreduce.TaskInputOutputContext) Text(org.apache.hadoop.io.Text) BasicML(org.encog.ml.BasicML) NullWritable(org.apache.hadoop.io.NullWritable) IntWritable(org.apache.hadoop.io.IntWritable) ModelRunner(ml.shifu.shifu.core.ModelRunner)

Example 3 with ModelRunner

use of ml.shifu.shifu.core.ModelRunner in project shifu by ShifuML.

The `exec` method of the class EvalNormUDF.

/**
 * Normalizes one input record and optionally appends a model score.
 *
 * <p>Output columns follow {@code outputNames}: position 0 is copied as-is,
 * position 1 is copied with {@code "1"} substituted when empty, positions
 * {@code [2, 2 + validMetaSize)} are meta columns copied as-is, and every
 * later position is normalized (preceded by its raw value when
 * {@code isOutputRaw} is set). When {@code isAppendScore} is set, one score
 * value is appended last.
 *
 * @param input the raw input record
 * @return the normalized tuple, or {@code null} when the record is the CSV
 *         header row or no raw data could be extracted from it
 * @throws IOException if reading the input tuple or loading the models fails
 */
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        Object firstField = input.get(0);
        String firstCol = (firstField == null) ? "" : firstField.toString();
        boolean looksLikeHeader = this.headers[0].equals(CommonUtils.normColumnName(firstCol));
        if (looksLikeHeader) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }

    if (this.modelRunner == null && this.isAppendScore) {
        // The model runner is created on first use here instead of in the constructor, to avoid OOM
        // on the client side, where the Pig client instantiates the UDF only to read metadata.
        @SuppressWarnings("deprecation")
        List<BasicML> basicModels = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), basicModels);
        this.modelRunner.setScoreScale(Integer.parseInt(this.scale));
    }

    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }

    Tuple result = TupleFactory.getInstance().newTuple();
    final int metaEnd = 2 + validMetaSize;
    for (int idx = 0; idx < this.outputNames.size(); idx++) {
        String columnName = this.outputNames.get(idx);
        String rawValue = rawDataNsMap.get(new NSColumn(columnName));

        // position 0 and the meta range [2, metaEnd) are passed through unchanged
        boolean passThrough = (idx == 0) || (idx > 1 && idx < metaEnd);
        if (passThrough) {
            result.append(rawValue);
        } else if (idx == 1) {
            // position 1 defaults to "1" when the raw value is empty
            result.append(StringUtils.isEmpty(rawValue) ? "1" : rawValue);
        } else {
            ColumnConfig config = this.columnConfigMap.get(columnName);
            List<Double> normalized = Normalizer.normalize(config, rawValue, this.modelConfig.getNormalizeStdDevCutOff(), this.modelConfig.getNormalizeType());
            if (this.isOutputRaw) {
                result.append(rawValue);
            }
            for (Double value : normalized) {
                result.append(getOutputValue(value, true));
            }
        }
    }

    if (this.isAppendScore && this.modelRunner != null) {
        CaseScoreResult score = this.modelRunner.computeNsData(rawDataNsMap);
        // modelRunner is known to be non-null inside this branch
        if (this.modelRunner.getModelsCnt() == 0 || score == null) {
            // placeholder value when there are no models or no score result
            result.append(-999.0);
        } else if (this.scIndex < 0) {
            // negative index: use the average score
            result.append(score.getAvgScore());
        } else {
            result.append(score.getScores().get(this.scIndex));
        }
    }
    return result;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) BasicML(org.encog.ml.BasicML) CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) Tuple(org.apache.pig.data.Tuple) ModelRunner(ml.shifu.shifu.core.ModelRunner) NSColumn(ml.shifu.shifu.column.NSColumn)

Aggregations

ModelRunner (ml.shifu.shifu.core.ModelRunner): 3, BasicML (org.encog.ml.BasicML): 3, NSColumn (ml.shifu.shifu.column.NSColumn): 2, CaseScoreResult (ml.shifu.shifu.container.CaseScoreResult): 2, Tuple (org.apache.pig.data.Tuple): 2, Map (java.util.Map): 1, Entry (java.util.Map.Entry): 1, SortedMap (java.util.SortedMap): 1, ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 1, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 1, DataPurifier (ml.shifu.shifu.core.DataPurifier): 1, ModelSpec (ml.shifu.shifu.core.model.ModelSpec): 1, IntWritable (org.apache.hadoop.io.IntWritable): 1, NullWritable (org.apache.hadoop.io.NullWritable): 1, Text (org.apache.hadoop.io.Text): 1, TaskInputOutputContext (org.apache.hadoop.mapreduce.TaskInputOutputContext): 1