use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class NormalUtils method assembleNsDataPair.
/**
 * Builds an Encog data pair from one raw record, restricted to the columns whose column number is
 * contained in {@code featureSet}.
 * <p>
 * When {@code featureSet} is empty (as decided by {@code CollectionUtils.isEmpty}) the call is delegated to
 * the overload without a feature set, which selects columns by their flags instead. {@code noVarSel} is only
 * used by that delegated call.
 * <p>
 * For every selected column, a categorical column under a tree model contributes a single value: its index
 * in {@code binCategoryMap}, or {@code -1} when the raw value is not a known category. Any other column
 * contributes whatever {@code computeNumericNormResult} returns for it.
 *
 * @param binCategoryMap   column number to (category value to category index) map
 * @param noVarSel         forwarded to the flag-based overload when {@code featureSet} is empty
 * @param modelConfig      model config instance
 * @param columnConfigList column config list; {@code null} entries are skipped
 * @param rawNsDataMap     raw record keyed by namespaced column
 * @param cutoff           cut off value passed to the normalization helper
 * @param alg              algorithm used in model
 * @param featureSet       column numbers of the features to emit
 * @return data pair whose input holds the assembled features and whose ideal is the single default value
 * @throws IllegalStateException if a final-selected column is found in {@code rawNsDataMap} neither by its
 *                               full name nor by its simple name
 * @throws NullPointerException  if a required input is null, e.g. {@code binCategoryMap} has no entry for a
 *                               categorical column under a tree model
 * @throws NumberFormatException if column value is not number format (presumably raised by the
 *                               normalization helper; not visible here)
 */
public static MLDataPair assembleNsDataPair(Map<Integer, Map<String, Integer>> binCategoryMap, boolean noVarSel,
        ModelConfig modelConfig, List<ColumnConfig> columnConfigList, Map<NSColumn, String> rawNsDataMap,
        double cutoff, String alg, Set<Integer> featureSet) {
    if (CollectionUtils.isEmpty(featureSet)) {
        // no explicit feature set given: fall back to flag-based column selection
        return assembleNsDataPair(binCategoryMap, noVarSel, modelConfig, columnConfigList, rawNsDataMap, cutoff,
                alg);
    }

    List<Double> features = new ArrayList<Double>();
    for (ColumnConfig column : columnConfigList) {
        if (column == null) {
            continue;
        }

        NSColumn nsName = new NSColumn(column.getColumnName());
        if (column.isFinalSelect()) {
            // look up the whole name first, then the simple name in case a wrong namespace was used
            boolean present = rawNsDataMap.containsKey(nsName)
                    || rawNsDataMap.containsKey(new NSColumn(nsName.getSimpleName()));
            if (!present) {
                throw new IllegalStateException(String.format("Variable Missing in Test Data: %s", nsName));
            }
        }

        if (column.isTarget() || !featureSet.contains(column.getColumnNum())) {
            continue;
        }

        String rawValue = getNSVariableVal(rawNsDataMap, nsName);
        if (CommonUtils.isTreeModel(alg) && column.isCategorical()) {
            String category = (rawValue == null) ? "" : rawValue;
            Integer categoryIndex = binCategoryMap.get(column.getColumnNum()).get(category);
            // a value outside the known categories is treated as missing and encoded as -1
            features.add(categoryIndex == null ? -1d : categoryIndex * 1d);
        } else {
            features.addAll(computeNumericNormResult(modelConfig, cutoff, column, rawValue));
        }
    }

    // List<Double> cannot be turned into double[] via toArray, so unbox element by element
    double[] input = new double[features.size()];
    int pos = 0;
    for (Double feature : features) {
        input[pos++] = feature;
    }
    double[] ideal = { Constants.DEFAULT_IDEAL_VALUE };
    return new BasicMLDataPair(new BasicMLData(input), new BasicMLData(ideal));
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class NormalUtils method assembleNsDataPair.
/**
 * Builds an Encog data pair from one raw record, choosing columns by their flags.
 * <p>
 * Target and meta columns are never emitted. Of the remaining columns, those reported as good candidates by
 * {@code CommonUtils.isGoodCandidate} are used when {@code noVarSel} is {@code true}; otherwise only
 * final-selected columns are used.
 * <p>
 * For every selected column, a categorical column under a tree model contributes a single value: its index
 * in {@code binCategoryMap}, or {@code -1} when the raw value is not a known category. Any other column
 * contributes whatever {@code computeNumericNormResult} returns for it.
 *
 * @param binCategoryMap   column number to (category value to category index) map
 * @param noVarSel         {@code true} to use good-candidate columns, {@code false} to use final-selected ones
 * @param modelConfig      model config instance
 * @param columnConfigList column config list; {@code null} entries are skipped
 * @param rawNsDataMap     raw record keyed by namespaced column
 * @param cutoff           cut off value passed to the normalization helper
 * @param alg              algorithm used in model
 * @return data pair whose input holds the assembled features and whose ideal is the single default value
 * @throws IllegalStateException if a final-selected column is found in {@code rawNsDataMap} neither by its
 *                               full name nor by its simple name
 * @throws NullPointerException  if a required input is null, e.g. {@code binCategoryMap} has no entry for a
 *                               categorical column under a tree model
 * @throws NumberFormatException if column value is not number format (presumably raised by the
 *                               normalization helper; not visible here)
 */
public static MLDataPair assembleNsDataPair(Map<Integer, Map<String, Integer>> binCategoryMap, boolean noVarSel,
        ModelConfig modelConfig, List<ColumnConfig> columnConfigList, Map<NSColumn, String> rawNsDataMap,
        double cutoff, String alg) {
    List<Double> features = new ArrayList<Double>();
    boolean hasCandidates = CommonUtils.hasCandidateColumns(columnConfigList);

    for (ColumnConfig column : columnConfigList) {
        if (column == null) {
            continue;
        }

        NSColumn nsName = new NSColumn(column.getColumnName());
        if (column.isFinalSelect()) {
            // look up the whole name first, then the simple name in case a wrong namespace was used
            boolean present = rawNsDataMap.containsKey(nsName)
                    || rawNsDataMap.containsKey(new NSColumn(nsName.getSimpleName()));
            if (!present) {
                throw new IllegalStateException(String.format("Variable Missing in Test Data: %s", nsName));
            }
        }

        if (column.isTarget() || column.isMeta()) {
            continue;
        }

        // with variable selection in effect only final-selected columns count, otherwise good candidates do
        boolean selected = noVarSel ? CommonUtils.isGoodCandidate(column, hasCandidates)
                : column.isFinalSelect();
        if (!selected) {
            continue;
        }

        String rawValue = getNSVariableVal(rawNsDataMap, nsName);
        if (CommonUtils.isTreeModel(alg) && column.isCategorical()) {
            String category = (rawValue == null) ? "" : rawValue;
            Integer categoryIndex = binCategoryMap.get(column.getColumnNum()).get(category);
            // a value outside the known categories is treated as missing and encoded as -1
            features.add(categoryIndex == null ? -1d : categoryIndex * 1d);
        } else {
            features.addAll(computeNumericNormResult(modelConfig, cutoff, column, rawValue));
        }
    }

    // List<Double> cannot be turned into double[] via toArray, so unbox element by element
    double[] input = new double[features.size()];
    int pos = 0;
    for (Double feature : features) {
        input[pos++] = feature;
    }
    double[] ideal = { Constants.DEFAULT_IDEAL_VALUE };
    return new BasicMLDataPair(new BasicMLData(input), new BasicMLData(ideal));
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class VarSelUpdater method updateColumnConfig.
/**
 * Resets the flag of the given column and re-derives it from the configured column sets.
 * <p>
 * The first matching rule wins, in this order: target, meta, force-remove, force-select, weight, candidate.
 * A column matching none of them keeps a {@code null} flag and, if it is listed as categorical, gets its
 * type set to {@code C}. A force-select column that is not among a non-empty candidate set also keeps a
 * {@code null} flag.
 *
 * @param columnConfig the column config to update in place
 */
@Override
public void updateColumnConfig(ColumnConfig columnConfig) {
    final String columnName = columnConfig.getColumnName();

    // TODO check me: before varselect the user may still change the force-select and force-remove files;
    // can the user also change meta and target columns?
    // clear the flag first; the column type is normally left alone because it is settled in stats
    columnConfig.setColumnFlag(null);

    if (NSColumnUtils.isColumnEqual(this.targetColumnName, columnName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Target);
        // empty tags are allowed to support a linear target, which is numerical; otherwise the target
        // column is categorical
        boolean noTags = CollectionUtils.isEmpty(this.modelConfig.getTags());
        columnConfig.setColumnType(noTags ? ColumnType.N : ColumnType.C);
        return;
    }

    final NSColumn nsName = new NSColumn(columnName);

    if (this.setMeta.contains(nsName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Meta);
        return;
    }

    if (this.setForceRemove.contains(nsName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceRemove);
        return;
    }

    if (this.setForceSelect.contains(nsName)) {
        // force-select only applies when there are no candidates at all or the column is one of them
        if (CollectionUtils.isEmpty(this.setCandidates) || this.setCandidates.contains(nsName)) {
            columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceSelect);
        }
        return;
    }

    if (NSColumnUtils.isColumnEqual(this.weightColumnName, columnName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Weight);
        return;
    }

    if (this.setCandidates.contains(nsName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Candidate);
        return;
    }

    if (this.setCategorialColumns.contains(nsName)) {
        columnConfig.setColumnType(ColumnType.C);
    }
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class EvalNormUDF method exec.
/**
 * Normalizes one input record and, when score appending is enabled, appends a model score.
 * <p>
 * Output layout, following {@code outputNames}: position 0 is the raw value; position 1 is the raw value or
 * {@code "1"} when it is empty (presumably the weight column - confirm against the caller); positions
 * {@code [2, 2 + validMetaSize)} are meta columns passed through raw; every later position is replaced by
 * its normalized value(s), preceded by the raw value when {@code isOutputRaw} is set. With score appending
 * on, one more field follows: {@code -999.0} when no model is loaded or scoring returned nothing, else the
 * average score or the score at {@code scIndex}.
 *
 * @param input the input tuple holding one raw record
 * @return the assembled output tuple, or {@code null} when the record is a CSV header line or converts to an
 *         empty map
 * @throws IOException if reading the input tuple or loading the models fails
 */
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        Object firstField = input.get(0);
        String firstCol = (firstField == null) ? "" : firstField.toString();
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }

    if (this.isAppendScore && this.modelRunner == null) {
        // The model runner is created lazily here instead of in the constructor to avoid OOM on the client
        // side: the pig client also instantiates this UDF just to read metadata.
        @SuppressWarnings("deprecation")
        List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig,
                evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(),
                evalConfig.getGbtScoreConvertStrategy());
        this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers,
                evalConfig.getDataSet().getDataDelimiter(), models);
        this.modelRunner.setScoreScale(Integer.parseInt(this.scale));
    }

    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers,
            this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }

    Tuple tuple = TupleFactory.getInstance().newTuple();
    final int firstNormIndex = 2 + validMetaSize;
    int i = 0;
    while (i < this.outputNames.size()) {
        String name = this.outputNames.get(i);
        String raw = rawDataNsMap.get(new NSColumn(name));
        if (i == 1) {
            // an empty value at position 1 defaults to "1"
            tuple.append(StringUtils.isEmpty(raw) ? "1" : raw);
        } else if (i == 0 || i < firstNormIndex) {
            // position 0 and the meta columns in [2, 2 + validMetaSize) are passed through untouched
            tuple.append(raw);
        } else {
            ColumnConfig columnConfig = this.columnConfigMap.get(name);
            List<Double> normVals = Normalizer.normalize(columnConfig, raw,
                    this.modelConfig.getNormalizeStdDevCutOff(), this.modelConfig.getNormalizeType());
            if (this.isOutputRaw) {
                tuple.append(raw);
            }
            for (Double normVal : normVals) {
                tuple.append(getOutputValue(normVal, true));
            }
        }
        i++;
    }

    if (this.isAppendScore && this.modelRunner != null) {
        CaseScoreResult score = this.modelRunner.computeNsData(rawDataNsMap);
        if (this.modelRunner.getModelsCnt() == 0 || score == null) {
            tuple.append(-999.0);
        } else if (this.scIndex < 0) {
            tuple.append(score.getAvgScore());
        } else {
            tuple.append(score.getScores().get(this.scIndex));
        }
    }
    return tuple;
}
use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
the class CommonUtils method loadCandidateColumns.
/**
 * Wraps every candidate column name listed by the model config into an {@code NSColumn}.
 *
 * @param modelConfig model config providing the candidate column names
 * @return a new mutable set holding one {@code NSColumn} per distinct candidate name
 * @throws IOException declared by the signature; presumably raised while the model config resolves its
 *                     candidate list (not visible here)
 */
public static Set<NSColumn> loadCandidateColumns(ModelConfig modelConfig) throws IOException {
    final List<String> names = modelConfig.getListCandidates();
    final Set<NSColumn> result = new HashSet<NSColumn>();
    for (int i = 0; i < names.size(); i++) {
        result.add(new NSColumn(names.get(i)));
    }
    return result;
}
Aggregations