use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class VarSelectModelProcessor method postProcessFIVarSelect.
/**
 * Marks the final-selected columns for feature-importance based variable selection.
 *
 * <p>Columns flagged as force-select are selected first and count toward the target number.
 * The remaining slots are filled by walking the keys of {@code importances} in iteration order
 * until the target count (from {@code modelConfig.getVarSelectFilterNum()}) is reached or the
 * candidates are exhausted. Candidates that are not in the user's candidate list, or that are
 * force-selected / force-removed, are skipped and do not consume a slot.
 *
 * @param importances column id mapped to its (name, importance) pair; the key iteration order
 *        is used as the ranking (presumably already sorted by importance - verify against caller)
 * @throws IOException propagated from loading the user's candidate column list
 */
private void postProcessFIVarSelect(Map<Integer, MutablePair<String, Double>> importances) throws IOException {
    int selectCnt = 0;
    for (ColumnConfig config : super.columnConfigList) {
        // enable ForceSelect
        if (config.isForceSelect()) {
            config.setFinalSelect(true);
            selectCnt++;
            log.info("Variable {} is selected, since it is in ForceSelect list.", config.getColumnName());
        }
    }

    VariableSelector.setFilterNumberByFilterOutRatio(this.modelConfig, this.columnConfigList);
    int targetCnt = this.modelConfig.getVarSelectFilterNum();

    List<Integer> candidateColumnIdList = new ArrayList<Integer>(importances.keySet());
    int candidateCount = candidateColumnIdList.size();
    Set<NSColumn> userCandidateColumns = CommonUtils.loadCandidateColumns(modelConfig);

    int i = 0;
    // Keep walking the ranked candidates until enough variables are selected. The walk is bounded
    // by the number of candidates, not by the target count, so skipped candidates do not cause
    // the selection to stop short while ranked candidates remain.
    while (selectCnt < targetCnt) {
        if (i >= candidateCount) {
            log.warn("Var select finish due to feature importance count {} is less than target var count {}", candidateCount, targetCnt);
            break;
        }
        Integer columnId = candidateColumnIdList.get(i++);
        ColumnConfig columnConfig = this.columnConfigList.get(columnId);
        if (CollectionUtils.isNotEmpty(userCandidateColumns) && !userCandidateColumns.contains(new NSColumn(columnConfig.getColumnName()))) {
            log.info("Variable {} is not in user's candidate list. Skip it.", columnConfig.getColumnName());
        } else if (!columnConfig.isForceSelect() && !columnConfig.isForceRemove()) {
            columnConfig.setFinalSelect(true);
            selectCnt++;
            log.info("Variable {} is selected.", columnConfig.getColumnName());
        }
    }
    log.info("{} variables are selected.", selectCnt);
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class VariableSelector method selectByFilter.
/**
 * Selects variables by the configured filter and flags them as final-selected.
 *
 * <p>Force-selected columns with non-null mean and stdDev are always taken. When the filter is
 * enabled, the remaining slots (up to {@code modelConfig.getVarSelectFilterNum()}) are filled
 * according to {@code modelConfig.getVarSelectFilterBy()}: {@code ks}, {@code iv}, {@code mix}
 * (alternating KS and IV ranks) or {@code pareto} (Pareto order first, then KS rank). When the
 * filter is disabled only the force-selected columns are flagged.
 *
 * @return the column config list held by this selector, with the final-select flags updated
 * @throws IOException propagated from loading the candidate column list
 */
public List<ColumnConfig> selectByFilter() throws IOException {
    log.info(" - Method: Filter");
    int ptrKs = 0, ptrIv = 0, ptrPareto = 0, cntByForce = 0;
    VariableSelector.setFilterNumberByFilterOutRatio(this.modelConfig, this.columnConfigList);
    log.info("Start Variable Selection...");
    log.info("\t VarSelectEnabled: " + modelConfig.getVarSelectFilterEnabled());
    log.info("\t VarSelectBy: " + modelConfig.getVarSelectFilterBy());
    log.info("\t VarSelectNum: " + modelConfig.getVarSelectFilterNum());

    List<Integer> selectedColumnNumList = new ArrayList<Integer>();
    List<ColumnConfig> ksList = new ArrayList<ColumnConfig>();
    List<ColumnConfig> ivList = new ArrayList<ColumnConfig>();
    List<Tuple> paretoList = new ArrayList<Tuple>();
    Set<NSColumn> candidateColumns = CommonUtils.loadCandidateColumns(modelConfig);
    boolean hasCandidates = CommonUtils.hasCandidateColumns(columnConfigList);
    int cntSelected = 0;

    // Classify every column: skipped, force-selected, or a ranked candidate.
    for (ColumnConfig config : this.columnConfigList) {
        if (config == null) {
            continue;
        }
        if (config.isMeta() || config.isTarget()) {
            log.info("\t Skip meta, weight or target column: " + config.getColumnName());
        } else if (config.isForceRemove()) {
            log.info("\t ForceRemove: " + config.getColumnName());
        } else if (config.isForceSelect()) {
            log.info("\t ForceSelect: " + config.getColumnName());
            if (config.getMean() == null || config.getStdDev() == null) {
                // TODO - check the mean of categorical variable could be null
                log.info("\t ForceSelect Failed: mean/stdDev must not be null");
            } else {
                selectedColumnNumList.add(config.getColumnNum());
                cntSelected++;
                cntByForce++;
            }
        } else if (!CommonUtils.isGoodCandidate(config, hasCandidates)) {
            log.info("\t Incomplete info(please check KS, IV, Mean, or StdDev fields): " + config.getColumnName() + " or it is not in candidate list");
        } else if (CollectionUtils.isNotEmpty(candidateColumns) && !candidateColumns.contains(new NSColumn(config.getColumnName()))) {
            log.info("\t Not in candidate list, Skip: " + config.getColumnName());
        } else if ((config.isCategorical() && !modelConfig.isCategoricalDisabled()) || config.isNumerical()) {
            ksList.add(config);
            ivList.add(config);
            if (config.getColumnStats() != null) {
                Double ks = config.getKs();
                Double iv = config.getIv();
                paretoList.add(new Tuple(config.getColumnNum(), ks == null ? 0d : ks, iv == null ? 0d : iv));
            }
        }
    }

    // not enabled filter, so only select forceSelect columns
    if (!this.modelConfig.getVarSelectFilterEnabled()) {
        log.info("Summary:");
        log.info("\tSelected Variables: " + cntSelected);
        if (cntByForce != 0) {
            log.info("\t- By Force: " + cntByForce);
        }
        for (int n : selectedColumnNumList) {
            // look up by column id: the id may not be the list position once segments are supported
            ColumnConfig columnConfig = CommonUtils.getColumnConfig(this.columnConfigList, n);
            if (columnConfig != null) {
                columnConfig.setFinalSelect(true);
            }
        }
        return columnConfigList;
    }

    String key = this.modelConfig.getVarSelectFilterBy();
    Collections.sort(ksList, new ColumnConfigComparator("ks"));
    Collections.sort(ivList, new ColumnConfigComparator("iv"));
    List<Tuple> newParetoList = sortByPareto(paretoList);
    int expectedVarNum = Math.min(cntSelected + ksList.size(), modelConfig.getVarSelectFilterNum());
    log.info("Expected selected columns:" + expectedVarNum);

    // reset to false at first.
    resetFinalSelect();

    ColumnConfig config = null;
    while (cntSelected < expectedVarNum) {
        if ("ks".equalsIgnoreCase(key)) {
            config = ksList.get(ptrKs);
            selectedColumnNumList.add(config.getColumnNum());
            ptrKs++;
            log.info("\t SelectedByKS=" + config.getKs() + "(Rank=" + ptrKs + "): " + config.getColumnName());
            cntSelected++;
        } else if ("iv".equalsIgnoreCase(key)) {
            config = ivList.get(ptrIv);
            selectedColumnNumList.add(config.getColumnNum());
            ptrIv++;
            log.info("\t SelectedByIV=" + config.getIv() + "(Rank=" + ptrIv + "): " + config.getColumnName());
            cntSelected++;
        } else if ("mix".equalsIgnoreCase(key)) {
            // alternate between the next KS-ranked and the next IV-ranked column
            config = ksList.get(ptrKs);
            if (selectedColumnNumList.contains(config.getColumnNum())) {
                log.info("\t Variable Already Selected: " + config.getColumnName());
                ptrKs++;
            } else {
                selectedColumnNumList.add(config.getColumnNum());
                ptrKs++;
                log.info("\t SelectedByKS=" + config.getKs() + "(Rank=" + ptrKs + "): " + config.getColumnName());
                cntSelected++;
            }
            if (cntSelected == expectedVarNum) {
                break;
            }
            config = ivList.get(ptrIv);
            if (selectedColumnNumList.contains(config.getColumnNum())) {
                log.info("\t Variable Already Selected: " + config.getColumnName());
                ptrIv++;
            } else {
                selectedColumnNumList.add(config.getColumnNum());
                ptrIv++;
                log.info("\t SelectedByIV=" + config.getIv() + "(Rank=" + ptrIv + "): " + config.getColumnName());
                cntSelected++;
            }
        } else if ("pareto".equalsIgnoreCase(key)) {
            if (ptrPareto >= newParetoList.size()) {
                // Pareto list exhausted: fall back to KS rank for the remaining slots
                config = ksList.get(ptrKs);
                if (selectedColumnNumList.contains(config.getColumnNum())) {
                    log.info("\t Variable Already Selected: " + config.getColumnName());
                } else {
                    selectedColumnNumList.add(config.getColumnNum());
                    // parenthesised so the rank is a numeric sum, not two numbers glued together
                    log.info("\t SelectedByKS=" + config.getKs() + "(Rank=" + (ptrKs + newParetoList.size()) + "): " + config.getColumnName());
                    cntSelected++;
                }
                ptrKs++;
            } else {
                int columnNum = newParetoList.get(ptrPareto).columnNum;
                selectedColumnNumList.add(columnNum);
                // look up by column id for the same reason as the final update loop below
                ColumnConfig paretoConfig = CommonUtils.getColumnConfig(this.columnConfigList, columnNum);
                log.info("\t SelectedByPareto " + (paretoConfig == null ? String.valueOf(columnNum) : paretoConfig.getColumnName()));
                ptrPareto++;
                cntSelected++;
            }
        } else {
            // No branch would ever advance cntSelected for an unknown key; stop instead of spinning.
            log.warn("\t Unsupported VarSelectBy '" + key + "', stop selecting by filter.");
            break;
        }
    }

    log.info("Summary:");
    log.info("\t Selected Variables: " + cntSelected);
    if (cntByForce != 0) {
        log.info("\t - By Force: " + cntByForce);
    }
    if (ptrPareto != 0) {
        log.info("\t - By Pareto: " + ptrPareto);
    }
    if (ptrKs != 0) {
        log.info("\t - By KS: " + ptrKs);
    }
    if (ptrIv != 0) {
        log.info("\t - By IV: " + ptrIv);
    }

    // update column config list and set finalSelect to true
    for (int n : selectedColumnNumList) {
        // get ColumnConfig by column id. The id may not be the position in array list after support segments
        ColumnConfig columnConfig = CommonUtils.getColumnConfig(this.columnConfigList, n);
        if (columnConfig != null) {
            columnConfig.setFinalSelect(true);
        }
    }
    return columnConfigList;
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class DataPurifier method isFilter.
/**
 * Evaluates the configured data filter expression against one raw record.
 *
 * <p>Each field is exposed to the expression context under both its full header name and the
 * simple name of the corresponding {@code NSColumn}; null fields are exposed as empty strings.
 *
 * @param record one delimited line of raw data
 * @return {@code true} when no filter expression is configured or the expression evaluates to
 *         true; {@code false} when the record's field count does not match the headers, when
 *         the expression yields null, or when evaluation fails in non-strict mode
 * @throws RuntimeException wrapping the evaluation failure when the JEXL engine is strict
 * @throws InvalidFilterResultExcetion when the expression yields a non-null, non-Boolean value
 */
public Boolean isFilter(String record) {
    if (dataFilterExpr == null) {
        return true;
    }

    String[] columns = CommonUtils.split(record, dataDelimiter);
    boolean wellFormed = columns != null && columns.length == headers.length;
    if (!wellFormed) {
        // illegal format data, just skip
        return false;
    }

    jc.clear();
    for (int idx = 0; idx < columns.length; idx++) {
        NSColumn column = new NSColumn(headers[idx]);
        String value = columns[idx];
        if (value == null) {
            value = "";
        }
        jc.set(headers[idx], value);
        jc.set(column.getSimpleName(), value);
    }

    Object outcome = null;
    try {
        outcome = dataFilterExpr.evaluate(jc);
    } catch (Throwable e) {
        if (this.jexl.isStrict()) {
            throw new RuntimeException(e);
        }
        log.error("Error occurred when trying to evaluate " + dataFilterExpr.toString(), e);
    }

    if (outcome == null) {
        return Boolean.FALSE;
    }
    if (outcome instanceof Boolean) {
        return (Boolean) outcome;
    }
    throw new InvalidFilterResultExcetion("Invalid filter return not boolean type: " + dataFilterExpr.getExpression());
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class EvalModelProcessor method validateEvalColumnConfig.
/**
 * Checks that the columns required for evaluation exist in the eval data set schema.
 *
 * <p>The schema is read from the header file when a header path is configured; otherwise it is
 * guessed from the first line of the data (treated as a header when it contains the target
 * column name, else columns are named by index). Segment-expanded names ({@code name_1},
 * {@code name_2}, ...) are added when segment filter expressions are configured. Nothing is
 * validated when no column config list is loaded or the algorithm is GENERIC / TENSORFLOW.
 *
 * @param evalConfig the eval configuration whose data set is validated
 * @throws IOException propagated from reading headers, data lines or models
 * @throws IllegalArgumentException when header and data lengths differ, or the configured
 *         target / weight column is missing from the schema
 */
@SuppressWarnings("deprecation")
private void validateEvalColumnConfig(EvalConfig evalConfig) throws IOException {
    if (this.columnConfigList == null) {
        return;
    }

    // header delimiter, falling back to the data delimiter when none is configured
    String headerDelimiter = StringUtils.isNotBlank(evalConfig.getDataSet().getHeaderDelimiter())
            ? evalConfig.getDataSet().getHeaderDelimiter()
            : evalConfig.getDataSet().getDataDelimiter();

    String[] evalColumnNames;
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getHeaderPath())) {
        evalColumnNames = CommonUtils.getHeaders(evalConfig.getDataSet().getHeaderPath(), headerDelimiter, evalConfig.getDataSet().getSource());
    } else {
        String[] firstLine = CommonUtils.takeFirstLine(evalConfig.getDataSet().getDataPath(), headerDelimiter, evalConfig.getDataSet().getSource());

        String expectedTarget = evalConfig.getDataSet().getTargetColumnName();
        if (StringUtils.isBlank(expectedTarget)) {
            expectedTarget = modelConfig.getTargetColumnName();
        }

        // if first line contains target column name, we guess it is csv format and first line is header.
        boolean firstLineIsHeader = StringUtils.join(firstLine, "").contains(expectedTarget);
        if (!firstLineIsHeader) {
            LOG.warn("No header path is provided, we will try to read first line and detect schema.");
            LOG.warn("Schema in ColumnConfig.json are named as index 0, 1, 2, 3 ...");
            LOG.warn("Please make sure weight column and tag column are also taking index as name.");
            evalColumnNames = new String[firstLine.length];
            for (int pos = 0; pos < firstLine.length; pos++) {
                evalColumnNames[pos] = String.valueOf(pos);
            }
        } else {
            // first line of data meaning second line in data files excluding first header line
            String[] firstDataRow = CommonUtils.takeFirstTwoLines(evalConfig.getDataSet().getDataPath(), headerDelimiter, evalConfig.getDataSet().getSource())[1];
            if (firstDataRow != null && firstLine.length != firstDataRow.length) {
                throw new IllegalArgumentException("Eval header length and eval data length are not consistent, please check you header setting and data set setting in eval.");
            }
            // normalize the raw header names in place before using them as the schema
            for (int pos = 0; pos < firstLine.length; pos++) {
                firstLine[pos] = CommonUtils.normColumnName(firstLine[pos]);
            }
            evalColumnNames = firstLine;
            LOG.warn("No header path is provided, we will try to read first line and detect schema.");
            LOG.warn("Schema in ColumnConfig.json are named as first line of data set path.");
        }
    }

    Set<NSColumn> names = new HashSet<NSColumn>();
    for (String columnName : evalColumnNames) {
        names.add(new NSColumn(columnName));
    }

    // every segment filter expression contributes a suffixed copy of each column name
    String filterExpressions = super.modelConfig.getSegmentFilterExpressionsAsString();
    if (StringUtils.isNotBlank(filterExpressions)) {
        int segFilterSize = CommonUtils.split(filterExpressions, Constants.SHIFU_STATS_FILTER_EXPRESSIONS_DELIMETER).length;
        for (int segment = 1; segment <= segFilterSize; segment++) {
            for (String columnName : evalColumnNames) {
                names.add(new NSColumn(columnName + "_" + segment));
            }
        }
    }

    String algorithm = modelConfig.getAlgorithm();
    if (Constants.GENERIC.equalsIgnoreCase(algorithm) || Constants.TENSORFLOW.equalsIgnoreCase(algorithm)) {
        // TODO correct this logic
        return;
    }

    List<BasicML> basicModels = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, SourceType.LOCAL, evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
    if (CollectionUtils.isNotEmpty(basicModels)) {
        validateFinalColumns(evalConfig, this.modelConfig.getModelSetName(), false, this.columnConfigList, names);
    }

    NSColumn targetColumn = new NSColumn(evalConfig.getDataSet().getTargetColumnName());
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getTargetColumnName())
            && !(names.contains(targetColumn) || names.contains(new NSColumn(targetColumn.getSimpleName())))) {
        throw new IllegalArgumentException("Target column " + evalConfig.getDataSet().getTargetColumnName() + " does not exist in - " + evalConfig.getDataSet().getHeaderPath());
    }

    NSColumn weightColumn = new NSColumn(evalConfig.getDataSet().getWeightColumnName());
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())
            && !(names.contains(weightColumn) || names.contains(new NSColumn(weightColumn.getSimpleName())))) {
        throw new IllegalArgumentException("Weight column " + evalConfig.getDataSet().getWeightColumnName() + " does not exist in - " + evalConfig.getDataSet().getHeaderPath());
    }

    List<ModelSpec> subModelSpecs = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, SourceType.LOCAL, evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
    if (CollectionUtils.isNotEmpty(subModelSpecs)) {
        for (ModelSpec spec : subModelSpecs) {
            validateFinalColumns(evalConfig, spec.getModelName(), true, spec.getColumnConfigList(), names);
        }
    }
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class EvalScoreUDF method exec.
/**
 * Scores one input record and returns a tuple of tag, weight, model scores and meta columns.
 *
 * <p>The model runner is created lazily on the first call. For classification the basic models
 * are trimmed to the effective model count before the runner is built, so the runner is
 * constructed exactly once and the sub-models attached to it are kept.
 *
 * @param input one raw record; in csv format a record whose first column equals the first
 *        header name is treated as the header line
 * @return the scored tuple, or {@code null} when the record is the header line, converts to an
 *         empty data map, or produces no score result
 * @throws IOException propagated from model loading or tuple access
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        String firstCol = ((input.get(0) == null) ? "" : input.get(0).toString());
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }

    long start = System.currentTimeMillis();
    if (this.modelRunner == null) {
        // here to initialize modelRunner, this is moved from constructor to here to avoid OOM in client side.
        // UDF in pig client will be initialized to get some metadata issues
        List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());

        this.modelCnt = models.size();
        // reset models in classification case
        if (modelConfig.isClassification()) {
            if (modelConfig.getTrain().isOneVsAll()) {
                if (modelConfig.getTags().size() == 2) {
                    // onevsall, modelcnt is 1
                    this.modelCnt = 1;
                } else {
                    this.modelCnt = modelConfig.getTags().size();
                }
            } else {
                if (modelConfig.getTags().size() == 2) {
                    // native binary
                    this.modelCnt = 1;
                } else {
                    // native multiple classification model cnt is bagging num
                    this.modelCnt = (this.modelCnt >= modelConfig.getBaggingNum() ? modelConfig.getBaggingNum() : this.modelCnt);
                }
            }
            models = models.subList(0, this.modelCnt);
        }

        // Build the runner once, after the model list is final. Building it before trimming and
        // then replacing it would discard the sub-models attached below.
        ModelRunner runner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);

        List<ModelSpec> subModels = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        if (CollectionUtils.isNotEmpty(subModels)) {
            for (ModelSpec modelSpec : subModels) {
                runner.addSubModels(modelSpec, this.isMultiThreadScoring);
                this.subModelsCnt.put(modelSpec.getModelName(), modelSpec.getModels().size());
            }
        }
        runner.setScoreScale(Integer.parseInt(this.scale));

        // publish only a fully initialized runner, so a failed init is retried on the next call
        this.modelRunner = runner;
        log.info("DEBUG: model cnt " + this.modelCnt + " sub models cnt " + modelRunner.getSubModelsCnt());
    }

    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }

    String tag = CommonUtils.trimTag(rawDataNsMap.get(new NSColumn(modelConfig.getTargetColumnName(evalConfig))));

    // The invalid-tag check is intentionally disabled: there is no bad tag in eval data set,
    // and user may just want to score the data without running performance evaluation.

    long startTime = System.nanoTime();
    CaseScoreResult cs = modelRunner.computeNsData(rawDataNsMap);
    long runInterval = (System.nanoTime() - startTime) / 1000L;

    if (cs == null) {
        // sampled logging: only warn when the current millisecond is a multiple of 100
        if (System.currentTimeMillis() % 100 == 0) {
            log.warn("Get null result, for input: " + input.toDelimitedString("|"));
        }
        return null;
    }

    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(tag);

    String weight = null;
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) {
        weight = rawDataNsMap.get(new NSColumn(evalConfig.getDataSet().getWeightColumnName()));
    } else {
        weight = "1.0";
    }
    incrementTagCounters(tag, weight, runInterval);

    Map<String, CaseScoreResult> subModelScores = cs.getSubModelScores();
    tuple.append(weight);

    if (this.isLinearTarget || modelConfig.isRegression()) {
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendModelScore(tuple, cs, true);
            if (this.outputHiddenLayerIndex != 0) {
                appendFirstHiddenOutputScore(tuple, cs.getHiddenLayerScores(), true);
            }
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subCs : subModelScores.values()) {
                appendModelScore(tuple, subCs, false);
            }
        }
    } else {
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendSimpleScore(tuple, cs);
            tuple.append(this.mcPredictor.predictTag(cs).getTag());
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            for (CaseScoreResult subCs : subModelScores.values()) {
                appendSimpleScore(tuple, subCs);
            }
        }
    }

    // append meta data
    List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
    if (CollectionUtils.isNotEmpty(metaColumns)) {
        for (String meta : metaColumns) {
            tuple.append(rawDataNsMap.get(new NSColumn(meta)));
        }
    }

    // sampled logging of the per-record running time
    if (System.currentTimeMillis() % 1000 == 0L) {
        log.info("running time is " + (System.currentTimeMillis() - start) + " ms.");
    }
    return tuple;
}
Aggregations