Search in sources :

Example 1 with CaseScoreResult

use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.

the class PostTrainMapper method map.

/**
 * Scores one input record and folds the result into the post-train statistics.
 *
 * <p>Steps performed for every record:
 * <ol>
 * <li>skip blank lines, records rejected by {@code dataPurifier} and records whose tag is not in
 * {@code tags} (the latter are counted under the {@code INVALID_TAG} counter);</li>
 * <li>run {@code modelRunner} on the record and write one delimited line (avg, max, min, each
 * individual score, then the meta column values) to the {@code POST_TRAIN_OUTPUT_SCORE} output;</li>
 * <li>for each column that is final-selected and neither meta nor target, add the average score
 * to the sum and increment the count of the bin the column value falls into.</li>
 * </ol>
 *
 * @param key
 *            input key; not read by this method
 * @param value
 *            one raw input line
 * @param context
 *            mapper context, used here only to increment the invalid-tag counter
 * @throws IOException
 *             declared by the overridden method; may come from the multiple-outputs writer
 * @throws InterruptedException
 *             declared by the overridden method; may come from the multiple-outputs writer
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String valueStr = value.toString();
    // StringUtils.isBlank is not used here to avoid import new jar; trim() already covers the empty string
    if (valueStr == null || valueStr.trim().length() == 0) {
        LOG.warn("Empty input.");
        return;
    }
    if (!this.dataPurifier.isFilter(valueStr)) {
        return;
    }
    String[] units = CommonUtils.split(valueStr, this.modelConfig.getDataSetDelimiter());
    // tagColumnNum should be in units array, if not IndexOutofBoundException
    String tag = CommonUtils.trimTag(units[this.tagColumnNum]);
    if (!this.tags.contains(tag)) {
        // sampled logging: only warn when the current millisecond is a multiple of 20 to avoid log flooding
        if (System.currentTimeMillis() % 20 == 0) {
            LOG.warn("Data with invalid tag is ignored in post train, invalid tag: {}.", tag);
        }
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
        return;
    }
    Map<String, String> rawDataMap = buildRawDataMap(units);
    CaseScoreResult csr = this.modelRunner.compute(rawDataMap);
    // store score value
    StringBuilder sb = new StringBuilder(500);
    sb.append(csr.getAvgScore()).append(Constants.DEFAULT_DELIMITER).append(csr.getMaxScore()).append(Constants.DEFAULT_DELIMITER).append(csr.getMinScore()).append(Constants.DEFAULT_DELIMITER);
    for (Double score : csr.getScores()) {
        sb.append(score).append(Constants.DEFAULT_DELIMITER);
    }
    List<String> metaList = modelConfig.getMetaColumnNames();
    for (String meta : metaList) {
        sb.append(rawDataMap.get(meta)).append(Constants.DEFAULT_DELIMITER);
    }
    // The builder always ends with a delimiter here (three are appended unconditionally above).
    // Truncate by the full delimiter length so a multi-character delimiter is removed completely;
    // deleteCharAt would only drop its first character.
    sb.setLength(sb.length() - Constants.DEFAULT_DELIMITER.length());
    this.outputValue.set(sb.toString());
    this.mos.write(Constants.POST_TRAIN_OUTPUT_SCORE, NullWritable.get(), this.outputValue);
    for (int i = 0; i < headers.length; i++) {
        ColumnConfig config = this.columnConfigList.get(i);
        if (!config.isMeta() && !config.isTarget() && config.isFinalSelect()) {
            int binNum = BinUtils.getBinNum(config, units[i]);
            List<BinStats> featureStatistics = this.variableStatsMap.get(config.getColumnNum());
            BinStats bs = null;
            if (binNum == -1) {
                // if -1, means invalid numeric value like null or empty, last one is for empty stats.
                bs = featureStatistics.get(featureStatistics.size() - 1);
            } else {
                bs = featureStatistics.get(binNum);
            }
            // bs should not be null as already initialized in setup
            bs.setBinSum(csr.getAvgScore() + bs.getBinSum());
            bs.setBinCnt(1L + bs.getBinCnt());
        }
    }
}
Also used : CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) BinStats(ml.shifu.shifu.core.posttrain.FeatureStatsWritable.BinStats) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig)

Example 2 with CaseScoreResult

use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.

the class ScoreModelWorker method handleMsg.

/**
 * Consumes a {@link RunModelResultMessage} and writes its scored cases to {@code scoreWriter}.
 *
 * <p>The message is first recorded against the bulletin of its stream (a new bulletin is created
 * and {@code receivedStreamCnt} is incremented the first time a stream id is seen). Each case is
 * then written as one line: tag, weight (or {@code 1.0} when no weight column is configured),
 * model scores, sub-model scores and meta column values, joined with {@code |}. When the number
 * of received streams equals the total announced by the message and
 * {@code hasAllMessageResult(resultMap)} holds, the writer is closed and a single
 * {@link EvalResultMessage} is sent to the next actor.
 *
 * <p>Messages of any other type are passed to {@code unhandled}.
 *
 * @param message
 *            the incoming actor message
 * @throws IOException
 *             if writing to or closing the score writer fails
 */
@Override
public void handleMsg(Object message) throws IOException {
    if (!(message instanceof RunModelResultMessage)) {
        unhandled(message);
        return;
    }

    log.debug("Received model score data for evaluation");
    RunModelResultMessage resultMsg = (RunModelResultMessage) message;

    // register the stream on first sight, then record this message on its bulletin
    if (!resultMap.containsKey(resultMsg.getStreamId())) {
        receivedStreamCnt++;
        resultMap.put(resultMsg.getStreamId(), new StreamBulletin(resultMsg.getStreamId()));
    }
    resultMap.get(resultMsg.getStreamId()).receiveMsge(resultMsg.getMsgId(), resultMsg.isLastMsg());

    // one builder is reused for every output line
    StringBuilder line = new StringBuilder();
    List<CaseScoreResult> scoredCases = resultMsg.getScoreResultList();
    for (CaseScoreResult scoredCase : scoredCases) {
        line.setLength(0);
        Map<String, String> rawDataMap = CommonUtils.convertDataIntoMap(scoredCase.getInputData(), evalConfig.getDataSet().getDataDelimiter(), header);

        // tag column
        line.append(CommonUtils.trimTag(rawDataMap.get(modelConfig.getTargetColumnName(evalConfig))));

        // weight column, defaulting to 1.0 when no weight column is configured
        String weightText;
        if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) {
            weightText = StringUtils.trimToEmpty(rawDataMap.get(evalConfig.getDataSet().getWeightColumnName()));
        } else {
            weightText = "1.0";
        }
        line.append("|").append(weightText);

        // scores of the main models
        if (CollectionUtils.isNotEmpty(scoredCase.getScores())) {
            addModelScoreData(line, scoredCase);
        }

        // scores of every sub-model, in the map's iteration order
        Map<String, CaseScoreResult> subModelScores = scoredCase.getSubModelScores();
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subModelResult : subModelScores.values()) {
                addModelScoreData(line, subModelResult);
            }
        }

        // meta column values
        List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
        if (CollectionUtils.isNotEmpty(metaColumns)) {
            for (String metaColumn : metaColumns) {
                line.append("|").append(StringUtils.trimToEmpty(rawDataMap.get(metaColumn)));
            }
        }

        line.append("\n");
        scoreWriter.write(line.toString());
    }

    boolean allStreamsSeen = receivedStreamCnt == resultMsg.getTotalStreamCnt();
    if (allStreamsSeen && hasAllMessageResult(resultMap)) {
        String scorePath = new PathFinder(modelConfig).getEvalScorePath(evalConfig).toString();
        log.info("Finish running scoring, the score file - {} is stored in {}.", scorePath, evalConfig.getDataSet().getSource().name());
        scoreWriter.close();
        // only one message will be sent
        nextActorRef.tell(new EvalResultMessage(1), this.getSelf());
    }
}
Also used : PathFinder(ml.shifu.shifu.fs.PathFinder) CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) EvalResultMessage(ml.shifu.shifu.message.EvalResultMessage) Entry(java.util.Map.Entry) RunModelResultMessage(ml.shifu.shifu.message.RunModelResultMessage)

Example 3 with CaseScoreResult

use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.

the class RunModelWorker method handleMsg.

/**
 * Scores every record carried by a {@link RunModelDataMessage} and forwards the results.
 *
 * <p>Each record is passed to {@code calculateModelScore}; records for which that call returns
 * {@code null} are dropped, the others get their raw input attached and are collected. The
 * collected results are sent to the next actor in a {@link RunModelResultMessage} that carries
 * the stream id, total stream count, message id and last-message flag of the incoming message.
 *
 * <p>Messages of any other type are passed to {@code unhandled}.
 *
 * @param message
 *            the incoming actor message
 */
@Override
public void handleMsg(Object message) {
    if (!(message instanceof RunModelDataMessage)) {
        unhandled(message);
        return;
    }

    RunModelDataMessage dataMsg = (RunModelDataMessage) message;
    List<String> rawRecords = dataMsg.getEvalDataList();
    List<CaseScoreResult> scoredRecords = new ArrayList<CaseScoreResult>(rawRecords.size());
    for (String rawRecord : rawRecords) {
        CaseScoreResult result = calculateModelScore(rawRecord);
        if (result == null) {
            // nothing to forward for this record
            continue;
        }
        result.setInputData(rawRecord);
        scoredRecords.add(result);
    }

    RunModelResultMessage reply = new RunModelResultMessage(dataMsg.getStreamId(), dataMsg.getTotalStreamCnt(), dataMsg.getMsgId(), dataMsg.isLastMsg(), scoredRecords);
    nextActorRef.tell(reply, getSelf());
}
Also used : CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) ArrayList(java.util.ArrayList) RunModelResultMessage(ml.shifu.shifu.message.RunModelResultMessage) RunModelDataMessage(ml.shifu.shifu.message.RunModelDataMessage)

Example 4 with CaseScoreResult

use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.

the class EvalScoreUDF method exec.

/**
 * Scores one input tuple and returns the result as a new tuple.
 *
 * <p>The output tuple contains, in order: the trimmed tag, the weight (the configured weight
 * column value, or {@code "1.0"} when no weight column is configured), the model and sub-model
 * scores, and finally the values of all meta columns.
 *
 * <p>The model runner is created lazily on the first call rather than in the constructor.
 *
 * @param input
 *            the raw record to score
 * @return the scored tuple, or {@code null} when the record is detected as the CSV header row,
 *         when it converts to an empty column map, or when the model runner yields no result
 * @throws IOException
 *             declared by the method signature
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        // skip the header row: the first field equals the first header name after normalization
        String firstCol = ((input.get(0) == null) ? "" : input.get(0).toString());
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }
    // wall-clock start, only used for the sampled "running time" log at the end
    long start = System.currentTimeMillis();
    if (this.modelRunner == null) {
        // here to initialize modelRunner, this is moved from constructor to here to avoid OOM in client side.
        // UDF in pig client will be initialized to get some metadata issues
        List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);
        List<ModelSpec> subModels = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        if (CollectionUtils.isNotEmpty(subModels)) {
            // register each sub-model with the runner and remember how many models it contains
            for (ModelSpec modelSpec : subModels) {
                this.modelRunner.addSubModels(modelSpec, this.isMultiThreadScoring);
                this.subModelsCnt.put(modelSpec.getModelName(), modelSpec.getModels().size());
            }
        }
        this.modelCnt = models.size();
        // reset models in classification case
        if (modelConfig.isClassification()) {
            if (modelConfig.getTrain().isOneVsAll()) {
                if (modelConfig.getTags().size() == 2) {
                    // onevsall, modelcnt is 1
                    this.modelCnt = 1;
                } else {
                    this.modelCnt = modelConfig.getTags().size();
                }
            } else {
                if (modelConfig.getTags().size() == 2) {
                    // native binary
                    this.modelCnt = 1;
                } else {
                    // native multiple classification model cnt is bagging num
                    this.modelCnt = (this.modelCnt >= modelConfig.getBaggingNum() ? modelConfig.getBaggingNum() : this.modelCnt);
                }
            }
            // keep only the first modelCnt models and rebuild the runner on that reduced list
            // NOTE(review): subList throws if modelCnt exceeds the number of loaded models — confirm
            // the one-vs-all branch can never set modelCnt above models.size().
            models = models.subList(0, this.modelCnt);
            // NOTE(review): this new runner replaces the one the sub-models were added to above;
            // confirm whether dropping sub-models for classification is intended.
            this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);
        }
        this.modelRunner.setScoreScale(Integer.parseInt(this.scale));
        log.info("DEBUG: model cnt " + this.modelCnt + " sub models cnt " + modelRunner.getSubModelsCnt());
    }
    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }
    String tag = CommonUtils.trimTag(rawDataNsMap.get(new NSColumn(modelConfig.getTargetColumnName(evalConfig))));
    // filter invalid tag record out
    // disable the tag check, since there is no bad tag in eval data set
    // and user just want to score the data, but don't run performance evaluation
    /*
         * if(!tagSet.contains(tag)) {
         * if(System.currentTimeMillis() % 100 == 0) {
         * log.warn("Invalid tag: " + tag);
         * }
         * if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
         * PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_RECORDS)
         * .increment(1);
         * }
         * return null;
         * }
         */
    // time the scoring call; nanoseconds divided by 1000 gives microseconds
    long startTime = System.nanoTime();
    CaseScoreResult cs = modelRunner.computeNsData(rawDataNsMap);
    long runInterval = (System.nanoTime() - startTime) / 1000L;
    if (cs == null) {
        // sampled logging: only warn when the current millisecond is a multiple of 100
        if (System.currentTimeMillis() % 100 == 0) {
            log.warn("Get null result, for input: " + input.toDelimitedString("|"));
        }
        return null;
    }
    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(tag);
    String weight = null;
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) {
        // NOTE(review): the lookup yields null when the record has no entry for the weight column —
        // confirm incrementTagCounters and downstream consumers tolerate a null weight.
        weight = rawDataNsMap.get(new NSColumn(evalConfig.getDataSet().getWeightColumnName()));
    } else {
        weight = "1.0";
    }
    incrementTagCounters(tag, weight, runInterval);
    Map<String, CaseScoreResult> subModelScores = cs.getSubModelScores();
    tuple.append(weight);
    if (this.isLinearTarget || modelConfig.isRegression()) {
        // linear-target / regression output: model scores, optional hidden-layer output, then sub-model scores
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendModelScore(tuple, cs, true);
            if (this.outputHiddenLayerIndex != 0) {
                appendFirstHiddenOutputScore(tuple, cs.getHiddenLayerScores(), true);
            }
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, CaseScoreResult> entry = iterator.next();
                CaseScoreResult subCs = entry.getValue();
                appendModelScore(tuple, subCs, false);
            }
        }
    } else {
        // remaining case: simple scores plus the tag predicted by mcPredictor, then sub-model simple scores
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendSimpleScore(tuple, cs);
            tuple.append(this.mcPredictor.predictTag(cs).getTag());
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, CaseScoreResult> entry = iterator.next();
                CaseScoreResult subCs = entry.getValue();
                appendSimpleScore(tuple, subCs);
            }
        }
    }
    // append meta data
    List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
    if (CollectionUtils.isNotEmpty(metaColumns)) {
        for (String meta : metaColumns) {
            tuple.append(rawDataNsMap.get(new NSColumn(meta)));
        }
    }
    // sampled logging: report elapsed time only when the current millisecond is a multiple of 1000
    if (System.currentTimeMillis() % 1000 == 0L) {
        log.info("running time is " + (System.currentTimeMillis() - start) + " ms.");
    }
    return tuple;
}
Also used : BasicML(org.encog.ml.BasicML) CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) Entry(java.util.Map.Entry) ModelSpec(ml.shifu.shifu.core.model.ModelSpec) Map(java.util.Map) SortedMap(java.util.SortedMap) Tuple(org.apache.pig.data.Tuple) ModelRunner(ml.shifu.shifu.core.ModelRunner) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 5 with CaseScoreResult

use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.

the class SimpleScoreUDF method exec.

/**
 * Returns the average model score of one input tuple.
 *
 * <p>The tuple is converted to a column map and scored first; the tag is validated afterwards.
 *
 * @param input
 *            the raw record to score
 * @return the average score, or {@code null} when the model runner yields no result or when the
 *         record's tag is in neither {@code negTags} nor {@code posTags}
 * @throws IOException
 *             declared by the method signature
 */
public Double exec(Tuple input) throws IOException {
    Map<NSColumn, String> columnValues = CommonUtils.convertDataIntoNsMap(input, this.header, 0);

    CaseScoreResult scoreResult = modelRunner.computeNsData(columnValues);
    if (scoreResult == null) {
        log.error("Get null result.");
        return null;
    }

    String tag = CommonUtils.trimTag(columnValues.get(new NSColumn(targetColumnName)));
    boolean knownTag = negTags.contains(tag) || posTags.contains(tag);
    if (knownTag) {
        return scoreResult.getAvgScore();
    }

    // invalid record
    log.error("Detected invalid record. Its tag is - " + tag);
    return null;
}
Also used : CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) NSColumn(ml.shifu.shifu.column.NSColumn)

Aggregations

CaseScoreResult (ml.shifu.shifu.container.CaseScoreResult): 9; NSColumn (ml.shifu.shifu.column.NSColumn): 4; ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 3; Tuple (org.apache.pig.data.Tuple): 3; ArrayList (java.util.ArrayList): 2; Map (java.util.Map): 2; Entry (java.util.Map.Entry): 2; ModelRunner (ml.shifu.shifu.core.ModelRunner): 2; RunModelResultMessage (ml.shifu.shifu.message.RunModelResultMessage): 2; BasicML (org.encog.ml.BasicML): 2; SortedMap (java.util.SortedMap): 1; TreeMap (java.util.TreeMap): 1; Callable (java.util.concurrent.Callable): 1; ColumnScoreObject (ml.shifu.shifu.container.ColumnScoreObject): 1; ScoreObject (ml.shifu.shifu.container.ScoreObject): 1; ModelSpec (ml.shifu.shifu.core.model.ModelSpec): 1; BinStats (ml.shifu.shifu.core.posttrain.FeatureStatsWritable.BinStats): 1; PathFinder (ml.shifu.shifu.fs.PathFinder): 1; EvalResultMessage (ml.shifu.shifu.message.EvalResultMessage): 1; RunModelDataMessage (ml.shifu.shifu.message.RunModelDataMessage): 1