use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.
the class PostTrainMapper method map.
/**
 * Scores one raw input record, writes its score line to the post-train score output and folds the
 * record's average score into the bin statistics of every final-selected feature column.
 *
 * <p>Records are skipped (nothing is written) when the line is blank, when the data purifier
 * rejects it, when its tag is not one of the configured tags, or when no score result is produced.
 *
 * @param key     input key supplied by the framework; not used
 * @param value   one delimited line of raw data
 * @param context mapper context, used here to bump the invalid-tag counter
 * @throws IOException          if writing the score output fails
 * @throws InterruptedException if writing the score output is interrupted
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String valueStr = value.toString();
    // StringUtils.isBlank is not used here to avoid import new jar
    if (valueStr == null || valueStr.trim().isEmpty()) {
        LOG.warn("Empty input.");
        return;
    }
    if (!this.dataPurifier.isFilter(valueStr)) {
        return;
    }

    String[] units = CommonUtils.split(valueStr, this.modelConfig.getDataSetDelimiter());
    // tagColumnNum should be in units array, if not IndexOutofBoundException
    String tag = CommonUtils.trimTag(units[this.tagColumnNum]);
    if (!this.tags.contains(tag)) {
        // Sample the warning so a data set full of bad tags does not flood the log.
        if (System.currentTimeMillis() % 20 == 0) {
            LOG.warn("Data with invalid tag is ignored in post train, invalid tag: {}.", tag);
        }
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
        return;
    }

    Map<String, String> rawDataMap = buildRawDataMap(units);
    CaseScoreResult csr = this.modelRunner.compute(rawDataMap);
    if (csr == null) {
        // NOTE(review): assumes compute() can return null for an unscorable record - confirm.
        // Skipping avoids a NullPointerException that would otherwise fail the whole task.
        if (System.currentTimeMillis() % 20 == 0) {
            LOG.warn("Get null score result, data is ignored in post train.");
        }
        return;
    }

    // store score value: avg, max, min, each model score, then the meta column values
    StringBuilder sb = new StringBuilder(500);
    sb.append(csr.getAvgScore()).append(Constants.DEFAULT_DELIMITER)
            .append(csr.getMaxScore()).append(Constants.DEFAULT_DELIMITER)
            .append(csr.getMinScore()).append(Constants.DEFAULT_DELIMITER);
    for (Double score : csr.getScores()) {
        sb.append(score).append(Constants.DEFAULT_DELIMITER);
    }
    List<String> metaList = modelConfig.getMetaColumnNames();
    for (String meta : metaList) {
        sb.append(rawDataMap.get(meta)).append(Constants.DEFAULT_DELIMITER);
    }
    // Drop the whole trailing delimiter; truncating by its length stays correct even if the
    // delimiter is longer than one character (deleteCharAt would remove a single char only).
    sb.setLength(sb.length() - Constants.DEFAULT_DELIMITER.length());
    this.outputValue.set(sb.toString());
    this.mos.write(Constants.POST_TRAIN_OUTPUT_SCORE, NullWritable.get(), this.outputValue);

    for (int i = 0; i < headers.length; i++) {
        ColumnConfig config = this.columnConfigList.get(i);
        if (config.isMeta() || config.isTarget() || !config.isFinalSelect()) {
            continue;
        }
        int binNum = BinUtils.getBinNum(config, units[i]);
        List<BinStats> featureStatistics = this.variableStatsMap.get(config.getColumnNum());
        // if -1, means invalid numeric value like null or empty, last one is for empty stats.
        int statsIndex = (binNum == -1) ? featureStatistics.size() - 1 : binNum;
        // bs should not be null as already initialized in setup
        BinStats bs = featureStatistics.get(statsIndex);
        bs.setBinSum(csr.getAvgScore() + bs.getBinSum());
        bs.setBinCnt(1L + bs.getBinCnt());
    }
}
use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.
the class ScoreModelWorker method handleMsg.
/**
 * Consumes a batch of model score results: tracks per-stream progress, writes one
 * pipe-delimited line per scored case (tag, weight, model scores, sub-model scores, meta
 * columns) and, once every stream has delivered all of its messages, closes the score writer
 * and notifies the next actor.
 *
 * @param message the incoming message; anything that is not a RunModelResultMessage is
 *                passed to unhandled
 * @throws IOException if writing to or closing the score writer fails
 */
@Override
public void handleMsg(Object message) throws IOException {
    if (!(message instanceof RunModelResultMessage)) {
        unhandled(message);
        return;
    }

    log.debug("Received model score data for evaluation");
    RunModelResultMessage resultMsg = (RunModelResultMessage) message;

    // First message of a stream: register a bulletin for it and count the stream.
    if (!resultMap.containsKey(resultMsg.getStreamId())) {
        receivedStreamCnt++;
        resultMap.put(resultMsg.getStreamId(), new StreamBulletin(resultMsg.getStreamId()));
    }
    resultMap.get(resultMsg.getStreamId()).receiveMsge(resultMsg.getMsgId(), resultMsg.isLastMsg());

    List<CaseScoreResult> scoredCases = resultMsg.getScoreResultList();
    // One builder reused for every output line.
    StringBuilder line = new StringBuilder();
    for (CaseScoreResult scoredCase : scoredCases) {
        line.setLength(0);
        Map<String, String> rawDataMap = CommonUtils.convertDataIntoMap(scoredCase.getInputData(),
                evalConfig.getDataSet().getDataDelimiter(), header);

        // tag column first
        String tag = CommonUtils.trimTag(rawDataMap.get(modelConfig.getTargetColumnName(evalConfig)));
        line.append(tag);

        // weight column value, or 1.0 when no weight column is configured
        String weightColumn = evalConfig.getDataSet().getWeightColumnName();
        if (StringUtils.isNotBlank(weightColumn)) {
            line.append("|").append(StringUtils.trimToEmpty(rawDataMap.get(weightColumn)));
        } else {
            line.append("|1.0");
        }

        // main model scores, followed by the scores of every sub-model
        if (CollectionUtils.isNotEmpty(scoredCase.getScores())) {
            addModelScoreData(line, scoredCase);
        }
        Map<String, CaseScoreResult> subModelScores = scoredCase.getSubModelScores();
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subModelResult : subModelScores.values()) {
                addModelScoreData(line, subModelResult);
            }
        }

        // meta column values
        List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
        if (CollectionUtils.isNotEmpty(metaColumns)) {
            for (String metaColumn : metaColumns) {
                line.append("|").append(StringUtils.trimToEmpty(rawDataMap.get(metaColumn)));
            }
        }

        line.append("\n");
        scoreWriter.write(line.toString());
    }

    boolean allStreamsSeen = receivedStreamCnt == resultMsg.getTotalStreamCnt();
    if (allStreamsSeen && hasAllMessageResult(resultMap)) {
        log.info("Finish running scoring, the score file - {} is stored in {}.", new PathFinder(modelConfig).getEvalScorePath(evalConfig).toString(), evalConfig.getDataSet().getSource().name());
        scoreWriter.close();
        // only one message will be sent
        nextActorRef.tell(new EvalResultMessage(1), this.getSelf());
    }
}
use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.
the class RunModelWorker method handleMsg.
/**
 * Scores every record carried by a RunModelDataMessage and forwards the results to the next
 * actor as a RunModelResultMessage. Records for which calculateModelScore returns null are
 * left out of the result list. Any other message type is passed to unhandled.
 *
 * @param message the incoming actor message
 */
@Override
public void handleMsg(Object message) {
    if (!(message instanceof RunModelDataMessage)) {
        unhandled(message);
        return;
    }

    RunModelDataMessage dataMsg = (RunModelDataMessage) message;
    List<String> rawRecords = dataMsg.getEvalDataList();
    List<CaseScoreResult> scoredRecords = new ArrayList<CaseScoreResult>(rawRecords.size());
    for (String rawRecord : rawRecords) {
        CaseScoreResult scored = calculateModelScore(rawRecord);
        if (scored == null) {
            continue;
        }
        // keep the raw input alongside its score result
        scored.setInputData(rawRecord);
        scoredRecords.add(scored);
    }

    RunModelResultMessage reply = new RunModelResultMessage(dataMsg.getStreamId(),
            dataMsg.getTotalStreamCnt(), dataMsg.getMsgId(), dataMsg.isLastMsg(), scoredRecords);
    nextActorRef.tell(reply, getSelf());
}
use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.
the class EvalScoreUDF method exec.
/**
 * Scores one input tuple and returns a tuple laid out as: tag, weight, model scores
 * (and sub-model scores), then the configured meta column values.
 *
 * <p>Returns {@code null} when the record is a CSV header row, when the input converts to an
 * empty column map, or when the model runner yields no score result.
 *
 * @param input one record of evaluation data
 * @return the score tuple, or {@code null} if the record is skipped
 * @throws IOException if reading the input tuple or loading models fails
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        String firstCol = ((input.get(0) == null) ? "" : input.get(0).toString());
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }

    long start = System.currentTimeMillis();
    if (this.modelRunner == null) {
        // here to initialize modelRunner, this is moved from constructor to here to avoid OOM in client side.
        // UDF in pig client will be initialized to get some metadata issues
        initModelRunnerWithSubModels();
    }

    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }

    // The invalid-tag filter is deliberately not applied here: a user may only want to score
    // the data without running performance evaluation, so every record is scored.
    String tag = CommonUtils.trimTag(rawDataNsMap.get(new NSColumn(modelConfig.getTargetColumnName(evalConfig))));

    long startTime = System.nanoTime();
    CaseScoreResult cs = modelRunner.computeNsData(rawDataNsMap);
    // elapsed scoring time in microseconds
    long runInterval = (System.nanoTime() - startTime) / 1000L;
    if (cs == null) {
        // sampled warning to avoid flooding the log
        if (System.currentTimeMillis() % 100 == 0) {
            log.warn("Get null result, for input: " + input.toDelimitedString("|"));
        }
        return null;
    }

    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(tag);

    String weight = "1.0";
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) {
        weight = rawDataNsMap.get(new NSColumn(evalConfig.getDataSet().getWeightColumnName()));
    }
    incrementTagCounters(tag, weight, runInterval);
    Map<String, CaseScoreResult> subModelScores = cs.getSubModelScores();
    tuple.append(weight);

    if (this.isLinearTarget || modelConfig.isRegression()) {
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendModelScore(tuple, cs, true);
            if (this.outputHiddenLayerIndex != 0) {
                appendFirstHiddenOutputScore(tuple, cs.getHiddenLayerScores(), true);
            }
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subCs : subModelScores.values()) {
                appendModelScore(tuple, subCs, false);
            }
        }
    } else {
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendSimpleScore(tuple, cs);
            tuple.append(this.mcPredictor.predictTag(cs).getTag());
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subCs : subModelScores.values()) {
                appendSimpleScore(tuple, subCs);
            }
        }
    }

    // append meta data
    List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
    if (CollectionUtils.isNotEmpty(metaColumns)) {
        for (String meta : metaColumns) {
            tuple.append(rawDataNsMap.get(new NSColumn(meta)));
        }
    }

    // sampled timing log
    if (System.currentTimeMillis() % 1000 == 0L) {
        log.info("running time is " + (System.currentTimeMillis() - start) + " ms.");
    }
    return tuple;
}

/**
 * Loads the models and sub-models and builds the model runner exactly once.
 *
 * <p>The final model list is resolved before the runner is created, so the sub-models are
 * attached to the runner that is actually used for scoring (previously, in the classification
 * case, the runner holding the sub-models was replaced by a fresh one without them). The field
 * is assigned last so a failure part-way through leaves it {@code null} and the next call
 * retries instead of scoring with a half-configured runner.
 */
@SuppressWarnings("deprecation")
private void initModelRunnerWithSubModels() throws IOException {
    List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
    this.modelCnt = models.size();
    // reset models in classification case
    if (modelConfig.isClassification()) {
        this.modelCnt = classificationModelCnt(models.size());
        models = models.subList(0, this.modelCnt);
    }

    ModelRunner runner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);

    List<ModelSpec> subModels = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
    if (CollectionUtils.isNotEmpty(subModels)) {
        for (ModelSpec modelSpec : subModels) {
            runner.addSubModels(modelSpec, this.isMultiThreadScoring);
            this.subModelsCnt.put(modelSpec.getModelName(), modelSpec.getModels().size());
        }
    }

    runner.setScoreScale(Integer.parseInt(this.scale));
    log.info("DEBUG: model cnt " + this.modelCnt + " sub models cnt " + runner.getSubModelsCnt());
    this.modelRunner = runner;
}

/**
 * Returns how many of the loaded models are used for scoring in the classification case.
 *
 * @param loadedModelCnt number of models that were loaded
 * @return 1 for two-tag problems; the tag count for one-vs-all with more tags; otherwise the
 *         smaller of the loaded model count and the configured bagging number
 */
private int classificationModelCnt(int loadedModelCnt) {
    boolean oneVsAll = modelConfig.getTrain().isOneVsAll();
    int tagCnt = modelConfig.getTags().size();
    if (tagCnt == 2) {
        // onevsall with two tags and native binary both use a single model
        return 1;
    }
    if (oneVsAll) {
        return tagCnt;
    }
    // native multiple classification model cnt is bagging num
    return Math.min(loadedModelCnt, modelConfig.getBaggingNum());
}
use of ml.shifu.shifu.container.CaseScoreResult in project shifu by ShifuML.
the class SimpleScoreUDF method exec.
/**
 * Scores one input tuple and returns its average model score.
 *
 * @param input one record of data
 * @return the average score, or {@code null} when the runner yields no result or the record's
 *         tag is in neither the positive nor the negative tag set
 * @throws IOException declared by the UDF contract
 */
public Double exec(Tuple input) throws IOException {
    Map<NSColumn, String> record = CommonUtils.convertDataIntoNsMap(input, this.header, 0);

    CaseScoreResult scoreResult = modelRunner.computeNsData(record);
    if (scoreResult == null) {
        log.error("Get null result.");
        return null;
    }

    String tag = CommonUtils.trimTag(record.get(new NSColumn(targetColumnName)));
    boolean knownTag = negTags.contains(tag) || posTags.contains(tag);
    if (!knownTag) {
        // invalid record
        log.error("Detected invalid record. Its tag is - " + tag);
        return null;
    }

    return scoreResult.getAvgScore();
}
Aggregations