Search in sources :

Example 11 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class NormalUtils method assembleNsDataPair.

/**
 * Builds an Encog data pair from one raw record, restricted to the columns whose column number is contained
 * in {@code featureSet}. If {@code featureSet} is empty, the call is forwarded to the overload that takes no
 * feature set.
 * <p>
 * For a tree model, a categorical column contributes its index from {@code binCategoryMap} (a null raw value
 * is looked up as the empty string, and -1 is used when no index is found). Every other column contributes
 * the value(s) returned by {@code computeNumericNormResult}. The ideal part of the pair always holds the
 * single value {@code Constants.DEFAULT_IDEAL_VALUE}.
 *
 * @param binCategoryMap   categorical map, keyed by column number and then by category value
 * @param noVarSel         if after var select; only passed on when delegating to the overload without a
 *                         feature set
 * @param modelConfig      model config instance
 * @param columnConfigList column config list; null entries are skipped
 * @param rawNsDataMap     raw NSColumn data
 * @param cutoff           cut off value
 * @param alg              algorithm used in model
 * @param featureSet       feature set used in NN model (column numbers)
 * @return data pair instance
 * @throws IllegalStateException if a final-selected column is found in {@code rawNsDataMap} under neither
 *                               its full name nor its simple name
 * @throws NullPointerException  if input is null
 * @throws NumberFormatException if column value is not number format.
 */
public static MLDataPair assembleNsDataPair(Map<Integer, Map<String, Integer>> binCategoryMap, boolean noVarSel, ModelConfig modelConfig, List<ColumnConfig> columnConfigList, Map<NSColumn, String> rawNsDataMap, double cutoff, String alg, Set<Integer> featureSet) {
    if (CollectionUtils.isEmpty(featureSet)) {
        // no explicit feature set: fall back to the flag-driven overload
        return assembleNsDataPair(binCategoryMap, noVarSel, modelConfig, columnConfigList, rawNsDataMap, cutoff, alg);
    }

    List<Double> features = new ArrayList<Double>();
    for (ColumnConfig column : columnConfigList) {
        if (column == null) {
            continue;
        }

        NSColumn nsName = new NSColumn(column.getColumnName());
        if (column.isFinalSelect()) {
            // try the whole name first, then the simple name in case the user used a wrong namespace
            boolean present = rawNsDataMap.containsKey(nsName)
                    || rawNsDataMap.containsKey(new NSColumn(nsName.getSimpleName()));
            if (!present) {
                throw new IllegalStateException(String.format("Variable Missing in Test Data: %s", nsName));
            }
        }

        if (column.isTarget() || !featureSet.contains(column.getColumnNum())) {
            continue;
        }

        String rawValue = getNSVariableVal(rawNsDataMap, nsName);
        if (!CommonUtils.isTreeModel(alg) || !column.isCategorical()) {
            features.addAll(computeNumericNormResult(modelConfig, cutoff, column, rawValue));
            continue;
        }

        // tree model + categorical column: feed the category index, -1 meaning "not in binCategories"
        String lookupKey = (rawValue == null) ? "" : rawValue;
        Integer categoryIndex = binCategoryMap.get(column.getColumnNum()).get(lookupKey);
        if (categoryIndex == null) {
            features.add(-1d);
        } else {
            features.add(categoryIndex * 1d);
        }
    }

    // List<Double> cannot be turned into double[] via toArray, so unbox element by element
    double[] input = new double[features.size()];
    int pos = 0;
    for (Double feature : features) {
        input[pos++] = feature;
    }
    double[] ideal = { Constants.DEFAULT_IDEAL_VALUE };
    return new BasicMLDataPair(new BasicMLData(input), new BasicMLData(ideal));
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) BasicMLDataPair(org.encog.ml.data.basic.BasicMLDataPair) BasicMLData(org.encog.ml.data.basic.BasicMLData) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 12 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class NormalUtils method assembleNsDataPair.

/**
 * Assemble map data to Encog standard input format.
 * <p>
 * Target and meta columns never contribute to the input. Of the remaining columns, which ones are used
 * depends on {@code noVarSel}:
 * <ul>
 * <li>{@code noVarSel == false}: only columns flagged as final-selected;</li>
 * <li>{@code noVarSel == true}: columns accepted by {@code CommonUtils.isGoodCandidate}.</li>
 * </ul>
 * For a tree model, a categorical column contributes its index from {@code binCategoryMap} (a null raw value
 * is looked up as the empty string, and -1 is used when no index is found). Every other column contributes
 * the value(s) returned by {@code computeNumericNormResult}. The ideal part of the pair always holds the
 * single value {@code Constants.DEFAULT_IDEAL_VALUE}.
 *
 * @param binCategoryMap   categorical map, keyed by column number and then by category value
 * @param noVarSel         if true, use good-candidate columns instead of final-selected columns
 * @param modelConfig      model config instance
 * @param columnConfigList column config list; null entries are skipped
 * @param rawNsDataMap     raw NSColumn data
 * @param cutoff           cut off value
 * @param alg              algorithm used in model
 * @return data pair instance
 * @throws IllegalStateException if a final-selected column is found in {@code rawNsDataMap} under neither
 *                               its full name nor its simple name
 * @throws NullPointerException  if input is null
 * @throws NumberFormatException if column value is not number format.
 */
public static MLDataPair assembleNsDataPair(Map<Integer, Map<String, Integer>> binCategoryMap, boolean noVarSel, ModelConfig modelConfig, List<ColumnConfig> columnConfigList, Map<NSColumn, String> rawNsDataMap, double cutoff, String alg) {
    double[] ideal = { Constants.DEFAULT_IDEAL_VALUE };
    List<Double> inputList = new ArrayList<Double>();
    boolean hasCandidates = CommonUtils.hasCandidateColumns(columnConfigList);
    for (ColumnConfig config : columnConfigList) {
        if (config == null) {
            continue;
        }
        NSColumn key = new NSColumn(config.getColumnName());
        // check whole name, and then check simple name, in case user use wrong namespace
        if (config.isFinalSelect() && !rawNsDataMap.containsKey(key)
                && !rawNsDataMap.containsKey(new NSColumn(key.getSimpleName()))) {
            throw new IllegalStateException(String.format("Variable Missing in Test Data: %s", key));
        }
        if (config.isTarget() || config.isMeta()) {
            continue;
        }

        // the two modes differ only in how a column qualifies; the encoding below is shared
        boolean selected = noVarSel ? CommonUtils.isGoodCandidate(config, hasCandidates) : config.isFinalSelect();
        if (!selected) {
            continue;
        }

        String val = getNSVariableVal(rawNsDataMap, key);
        if (CommonUtils.isTreeModel(alg) && config.isCategorical()) {
            Integer index = binCategoryMap.get(config.getColumnNum()).get(val == null ? "" : val);
            if (index == null) {
                // not in binCategories, should be missing value; -1 as missing value
                inputList.add(-1d);
            } else {
                inputList.add(index * 1d);
            }
        } else {
            inputList.addAll(computeNumericNormResult(modelConfig, cutoff, config, val));
        }
    }
    // List<Double> cannot be converted to double[] through toArray, so copy element by element
    int size = inputList.size();
    double[] input = new double[size];
    for (int i = 0; i < size; i++) {
        input[i] = inputList.get(i);
    }
    return new BasicMLDataPair(new BasicMLData(input), new BasicMLData(ideal));
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) BasicMLDataPair(org.encog.ml.data.basic.BasicMLDataPair) BasicMLData(org.encog.ml.data.basic.BasicMLData) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 13 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class VarSelUpdater method updateColumnConfig.

/**
 * Clears the flag of the given column and assigns it again from the configured column sets.
 * <p>
 * The first matching rule wins, in this order: target, meta, force-remove, force-select, weight, candidate.
 * A column matching none of them keeps a null flag and is only switched to categorical type when it is
 * listed in {@code setCategorialColumns}. Apart from that case and the target column, the column type is
 * left untouched.
 *
 * @param columnConfig the column config to update in place
 */
@Override
public void updateColumnConfig(ColumnConfig columnConfig) {
    final String columnName = columnConfig.getColumnName();

    // TODO check me: Before varselect, user can still change forceselect and force remove files while can user
    // change meta and target columns???
    // Wipe the flag first; it is re-derived below. ColumnType is not reset because it should be set well in
    // stats and cannot be changed later.
    columnConfig.setColumnFlag(null);

    if (NSColumnUtils.isColumnEqual(this.targetColumnName, columnName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Target);
        // empty tags are allowed to support a linear target, which is numeric; otherwise the target is
        // treated as a categorical column
        boolean noTags = CollectionUtils.isEmpty(this.modelConfig.getTags());
        columnConfig.setColumnType(noTags ? ColumnType.N : ColumnType.C);
        return;
    }

    final NSColumn nsColumn = new NSColumn(columnName);

    if (this.setMeta.contains(nsColumn)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Meta);
        return;
    }

    if (this.setForceRemove.contains(nsColumn)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceRemove);
        return;
    }

    if (this.setForceSelect.contains(nsColumn)) {
        // force-select only applies when there is no candidate list, or the column is one of the candidates;
        // otherwise the flag stays null and no later rule is tried
        if (CollectionUtils.isEmpty(this.setCandidates) || this.setCandidates.contains(nsColumn)) {
            columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceSelect);
        }
        return;
    }

    if (NSColumnUtils.isColumnEqual(this.weightColumnName, columnName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Weight);
        return;
    }

    if (this.setCandidates.contains(nsColumn)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Candidate);
        return;
    }

    if (this.setCategorialColumns.contains(nsColumn)) {
        columnConfig.setColumnType(ColumnType.C);
    }
}
Also used : NSColumn(ml.shifu.shifu.column.NSColumn)

Example 14 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class EvalNormUDF method exec.

/**
 * Converts one input record into an output tuple of pass-through and normalized values, optionally followed
 * by a model score.
 * <p>
 * Output layout, following {@code outputNames}: position 0 is copied as-is, position 1 is copied but
 * replaced by "1" when empty (presumably the weight column - confirm against where outputNames is built),
 * positions [2, 2 + validMetaSize) are meta columns copied as-is, and every later position is normalized
 * (preceded by the raw value when {@code isOutputRaw} is set). When {@code isAppendScore} is set, one score
 * is appended; -999.0 is used when no model is loaded or no score is produced.
 *
 * @param input the input record
 * @return the output tuple, or null when the record is a CSV header line or yields an empty data map
 * @throws IOException declared by the signature; NOTE(review): presumably raised by tuple access or model
 *                     loading - confirm
 */
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        Object firstField = input.get(0);
        String firstCol = (firstField == null) ? "" : firstField.toString();
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }

    if (this.isAppendScore && this.modelRunner == null) {
        // modelRunner is created here instead of in the constructor to avoid OOM on the client side: the
        // pig client also instantiates the UDF just to read some metadata
        @SuppressWarnings("deprecation") List<BasicML> loadedModels = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), loadedModels);
        this.modelRunner.setScoreScale(Integer.parseInt(this.scale));
    }

    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }

    Tuple output = TupleFactory.getInstance().newTuple();
    for (int pos = 0; pos < this.outputNames.size(); pos++) {
        String columnName = this.outputNames.get(pos);
        String rawValue = rawDataNsMap.get(new NSColumn(columnName));

        if (pos == 1) {
            // an empty value at position 1 falls back to "1"
            output.append(StringUtils.isEmpty(rawValue) ? "1" : rawValue);
            continue;
        }
        if (pos == 0 || pos < 2 + validMetaSize) {
            // position 0 and the meta columns in [2, 2 + validMetaSize) are passed through untouched
            output.append(rawValue);
            continue;
        }

        ColumnConfig columnConfig = this.columnConfigMap.get(columnName);
        List<Double> normalized = Normalizer.normalize(columnConfig, rawValue, this.modelConfig.getNormalizeStdDevCutOff(), this.modelConfig.getNormalizeType());
        if (this.isOutputRaw) {
            output.append(rawValue);
        }
        for (Double value : normalized) {
            output.append(getOutputValue(value, true));
        }
    }

    if (!this.isAppendScore || this.modelRunner == null) {
        return output;
    }

    CaseScoreResult scoreResult = this.modelRunner.computeNsData(rawDataNsMap);
    boolean noScore = this.modelRunner.getModelsCnt() == 0 || scoreResult == null;
    // appends are kept separate so each value keeps its own boxed type
    if (noScore) {
        output.append(-999.0);
    } else if (this.scIndex < 0) {
        output.append(scoreResult.getAvgScore());
    } else {
        output.append(scoreResult.getScores().get(this.scIndex));
    }
    return output;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) BasicML(org.encog.ml.BasicML) CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) Tuple(org.apache.pig.data.Tuple) ModelRunner(ml.shifu.shifu.core.ModelRunner) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 15 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class CommonUtils method loadCandidateColumns.

/**
 * Wraps every candidate name listed by the model config into an {@link NSColumn} and collects them in a set.
 *
 * @param modelConfig model config providing the candidate name list via {@code getListCandidates()}
 * @return a new mutable set with one NSColumn per listed candidate name; empty when no names are listed
 * @throws IOException declared by the signature; NOTE(review): presumably raised while the candidate list
 *                     is read - confirm
 */
public static Set<NSColumn> loadCandidateColumns(ModelConfig modelConfig) throws IOException {
    List<String> candidateNames = modelConfig.getListCandidates();
    Set<NSColumn> result = new HashSet<NSColumn>();
    for (int i = 0; i < candidateNames.size(); i++) {
        result.add(new NSColumn(candidateNames.get(i)));
    }
    return result;
}
Also used : NSColumn(ml.shifu.shifu.column.NSColumn)

Aggregations

NSColumn (ml.shifu.shifu.column.NSColumn)17 ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)8 CaseScoreResult (ml.shifu.shifu.container.CaseScoreResult)4 Tuple (org.apache.pig.data.Tuple)4 BasicML (org.encog.ml.BasicML)3 ModelRunner (ml.shifu.shifu.core.ModelRunner)2 ModelSpec (ml.shifu.shifu.core.model.ModelSpec)2 BasicMLData (org.encog.ml.data.basic.BasicMLData)2 BasicMLDataPair (org.encog.ml.data.basic.BasicMLDataPair)2 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1 Entry (java.util.Map.Entry)1 SortedMap (java.util.SortedMap)1 ColumnConfigComparator (ml.shifu.shifu.container.obj.ColumnConfig.ColumnConfigComparator)1 NormType (ml.shifu.shifu.container.obj.ModelNormalizeConf.NormType)1 ShifuException (ml.shifu.shifu.exception.ShifuException)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 Path (org.apache.hadoop.fs.Path)1