Search in sources:

Example 6 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class BasicUpdater method updateColumnConfig.

/**
 * Refreshes the flag and the type of one column from the column sets held by this updater.
 *
 * <p>The flag is cleared first and then assigned by the first matching flag rule (a column matching
 * no rule stays unflagged). The type is assigned afterwards by a separate rule chain that always
 * sets a value.
 *
 * @param columnConfig the column configuration to update in place
 */
public void updateColumnConfig(ColumnConfig columnConfig) {
    final String name = columnConfig.getColumnName();
    // every update starts from an unflagged column
    columnConfig.setColumnFlag(null);
    assignColumnFlag(columnConfig, name);
    assignColumnType(columnConfig, name);
}

/**
 * Applies the first matching flag rule, checked in this order: target, meta, force-remove,
 * force-select, weight, candidate.
 */
private void assignColumnFlag(ColumnConfig columnConfig, String name) {
    if (NSColumnUtils.isColumnEqual(this.targetColumnName, name)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Target);
        columnConfig.setColumnType(null);
        return;
    }

    final NSColumn column = new NSColumn(name);
    if (this.setMeta.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Meta);
        columnConfig.setColumnType(null);
        return;
    }
    if (this.setForceRemove.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceRemove);
        return;
    }
    if (this.setForceSelect.contains(column)) {
        // a force-selected column is honoured when there is no candidate list at all,
        // or when the candidate list also names it; otherwise it stays unflagged
        if (CollectionUtils.isEmpty(this.setCandidates) || this.setCandidates.contains(column)) {
            columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceSelect);
        }
        return;
    }
    if (NSColumnUtils.isColumnEqual(this.weightColumnName, name)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Weight);
        columnConfig.setColumnType(null);
        return;
    }
    if (this.setCandidates.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Candidate);
    }
}

/**
 * Applies the first matching type rule, checked in this order: weight, target, hybrid,
 * categorical; anything else is numerical.
 */
private void assignColumnType(ColumnConfig columnConfig, String name) {
    if (NSColumnUtils.isColumnEqual(weightColumnName, name)) {
        // weight column is numerical
        columnConfig.setColumnType(ColumnType.N);
        return;
    }
    if (NSColumnUtils.isColumnEqual(targetColumnName, name)) {
        // empty tags are allowed to support a linear target, which is numerical;
        // otherwise the target is a categorical column
        final boolean noTags = CollectionUtils.isEmpty(this.modelConfig.getTags());
        columnConfig.setColumnType(noTags ? ColumnType.N : ColumnType.C);
        return;
    }

    final NSColumn column = new NSColumn(name);
    if (setHybridColumns.contains(column)) {
        columnConfig.setColumnType(ColumnType.H);
        // the threshold is keyed by the full name in strict namespace mode, by the simple name otherwise
        final boolean strictMode = Environment.getBoolean(Constants.SHIFU_NAMESPACE_STRICT_MODE, false);
        final String thresholdKey = strictMode ? column.getFullColumnName() : column.getSimpleName();
        columnConfig.setHybridThreshold(hybridColumnNames.get(thresholdKey));
    } else if (setCategorialColumns.contains(column)) {
        columnConfig.setColumnType(ColumnType.C);
    } else {
        // meta and other columns are numerical unless the user listed them as categorical
        columnConfig.setColumnType(ColumnType.N);
    }
}
Also used : NSColumn(ml.shifu.shifu.column.NSColumn)

Example 7 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class TrainUpdater method updateColumnConfig.

/**
 * Refreshes the flag of one column for the training step.
 *
 * <p>The flag is cleared first. The target column additionally gets its type set (numerical when
 * no tags are configured, categorical otherwise); meta and force-removed columns are additionally
 * marked as not final-selected.
 *
 * @param columnConfig the column configuration to update in place
 */
@Override
public void updateColumnConfig(ColumnConfig columnConfig) {
    // every update starts from an unflagged column
    columnConfig.setColumnFlag(null);
    final String name = columnConfig.getColumnName();

    if (NSColumnUtils.isColumnEqual(this.targetColumnName, name)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Target);
        // empty tags are allowed to support a linear target, which is numerical;
        // otherwise the target is a categorical column
        final boolean noTags = CollectionUtils.isEmpty(this.modelConfig.getTags());
        columnConfig.setColumnType(noTags ? ColumnType.N : ColumnType.C);
        return;
    }

    final NSColumn column = new NSColumn(name);
    if (this.setMeta.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Meta);
        // clearing is harmless: if nothing is selected yet, nothing is selected afterwards either
        columnConfig.setFinalSelect(false);
    } else if (this.setForceRemove.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceRemove);
        // clearing is harmless: if nothing is selected yet, nothing is selected afterwards either
        columnConfig.setFinalSelect(false);
    } else if (this.setForceSelect.contains(column)) {
        // honoured when there is no candidate list, or when the candidate list also names the column
        if (CollectionUtils.isEmpty(this.setCandidates) || this.setCandidates.contains(column)) {
            columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceSelect);
            // WARN: finalSelect is deliberately NOT set here. With SE variable selection the first
            // step trains a model; marking force-selected columns as final-selected at that point
            // would restrict the selection to those columns only, which is not correct.
            // Known gap: if the user extends the force-select list after variable selection and
            // retrains, the newly added variables are not used unless variable selection is run
            // again. A proper solution is still to be worked out.
        }
    } else if (this.setCandidates.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Candidate);
    }
}
Also used : NSColumn(ml.shifu.shifu.column.NSColumn)

Example 8 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class NormalizeUDF method exec.

/**
 * Converts one raw input record into an output tuple of tag, per-column values and weight.
 *
 * <p>The record is dropped (method returns {@code null}) when the input is null or empty, when the
 * tag field is null, when the tag is not in the configured tag set (non-linear targets only), when
 * the sampler rejects it, when the tag cannot be mapped or parsed, or when the number of input
 * fields does not match the number of configured columns.
 *
 * <p>Output layout: in the non-compact mode values are appended column by column; in compact mode
 * values are collected by normalized column name and then appended in the order of
 * {@code outputCompactColumns}. The evaluated weight is always appended last.
 *
 * @param input the raw record
 * @return the output tuple, or {@code null} when the record is skipped
 * @throws IOException declared by the method signature; presumably raised by tuple field access
 *         (TODO confirm against the Pig Tuple contract)
 * @throws ShifuException when more than {@code MAX_MISMATCH_CNT} records had a field count that
 *         differs from the column configuration
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    final String rawTag = CommonUtils.trimTag(tag.toString());
    // make sure all invalid tag record are filter out
    if (!isLinearTarget && !super.tagSet.contains(rawTag)) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    // if(!isLinearTarget && !this.isForClean) {
    if (!isLinearTarget) {
        // do data sampling. Unselected data or data with invalid tag will be filtered out.
        boolean isNotSampled = DataSampler.isNotSampled(modelConfig.isRegression(), super.tagSet, super.posTagSet, super.negTagSet, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(), rawTag);
        if (isNotSampled) {
            return null;
        }
    }
    // append tuple with tag, normalized value.
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    // only allocated in compact mode; every later use is guarded by isCompactNorm
    Map<String, Object> compactVarMap = null;
    if (this.isCompactNorm) {
        compactVarMap = new HashMap<String, Object>();
    }
    if (!this.isForExpressions) {
        if (input.size() != this.columnConfigList.size()) {
            // count each malformed record exactly once, so that the MAX_MISMATCH_CNT tolerance
            // is measured in records (it was previously incremented twice per record)
            this.mismatchCnt++;
            log.error("the input size - " + input.size() + ", while column size - " + columnConfigList.size());
            // this could make Shifu could skip some malformed data
            if (this.mismatchCnt > MAX_MISMATCH_CNT) {
                throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
            }
            return null;
        }
        for (int i = 0; i < input.size(); i++) {
            ColumnConfig config = columnConfigList.get(i);
            // null fields are treated as empty strings
            String val = (input.get(i) == null) ? "" : input.get(i).toString();
            // load variables for weight calculating.
            if (weightExpr != null) {
                weightContext.set(new NSColumn(config.getColumnName()).getSimpleName(), val);
            }
            // check tag type.
            if (tagColumnNum == i) {
                if (modelConfig.isRegression()) {
                    // binary target: 1 for a positive tag, 0 for a negative tag
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else if (this.isLinearTarget) {
                    // linear target: the tag itself is the numeric value
                    double tagValue = 0.0;
                    try {
                        tagValue = Double.parseDouble(rawTag);
                    } catch (Exception e) {
                        log.error("Tag - " + rawTag + " is invalid(not numerical). Skip record.");
                        // skip this line
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), tagValue);
                    } else {
                        tuple.append(tagValue);
                    }
                } else {
                    // otherwise the target is the index of the first tag group containing the tag
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isForClean) {
                // for RF/GBT model, only clean data, not real do norm data
                if (config.isCategorical()) {
                    Map<String, Integer> map = this.categoricalIndexMap.get(config.getColumnNum());
                    // map should not be null, no need check if map is null, if val not in binCategory, set it to ""
                    tuple.append(((map.get(val) == null || map.get(val) == -1)) ? "" : val);
                } else {
                    Double normVal = 0d;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    appendOutputValue(tuple, normVal, true);
                }
            } else {
                if (this.isCompactNorm) {
                    // only output features and target, weight in compact norm mode
                    if (!config.isMeta() && config.isFinalSelect()) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        // NOTE(review): every value is stored under the same key, so only the last one is kept
                        for (Double normVal : normVals) {
                            String formatVal = getOutputValue(normVal, true);
                            compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                        }
                    } else if (config.isMeta()) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                    } else {
                    // if is compact mode but such column is not final selected, should be empty, as only append
                    // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                    // we need meta column?
                    }
                } else {
                    // it will cause variable fail to normalize
                    if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            appendOutputValue(tuple, normVal, true);
                        }
                    } else {
                        // columns that are not normalized keep a slot: raw value for meta, null otherwise
                        tuple.append(config.isMeta() ? val : null);
                    }
                }
            }
        }
    } else {
        // for segment expansion variables
        int rawSize = input.size();
        for (int i = 0; i < this.columnConfigList.size(); i++) {
            ColumnConfig config = this.columnConfigList.get(i);
            // column positions beyond the raw record wrap around onto the raw fields
            int newIndex = i >= rawSize ? i % rawSize : i;
            String val = (input.get(newIndex) == null) ? "" : input.get(newIndex).toString();
            // for target column
            if (config.isTarget()) {
                if (modelConfig.isRegression()) {
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else {
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isCompactNorm) {
                // only output features and target, weight in compact norm mode
                if (!config.isMeta() && config.isFinalSelect()) {
                    // for multiple classification, binPosRate means rate of such category over all counts,
                    // reuse binPosRate for normalize
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        String formatVal = getOutputValue(normVal, true);
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                    }
                } else if (config.isMeta()) {
                    compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                } else {
                // if is compact mode but such column is not final selected, should be empty, as only append
                // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                // we need meta column?
                }
            } else {
                // for others
                if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        appendOutputValue(tuple, normVal, true);
                    }
                } else {
                    tuple.append(config.isMeta() ? val : null);
                }
            }
        }
    }
    // for compact norm mode, output to tuple at here
    if (this.isCompactNorm) {
        for (int i = 0; i < outputCompactColumns.size(); i++) {
            tuple.append(compactVarMap.get(outputCompactColumns.get(i)));
        }
    }
    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) NormType(ml.shifu.shifu.container.obj.ModelNormalizeConf.NormType) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException) Tuple(org.apache.pig.data.Tuple) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 9 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class SimpleScoreUDF method exec.

/**
 * Scores one record and returns the average model score.
 *
 * <p>The record is first converted into a namespace-aware column map and scored. The score is only
 * returned when the record's trimmed target value is one of the known positive or negative tags.
 *
 * @param input the raw record
 * @return the average score, or {@code null} when scoring yields no result or the tag is unknown
 * @throws IOException declared by the method signature
 */
public Double exec(Tuple input) throws IOException {
    final Map<NSColumn, String> nsData = CommonUtils.convertDataIntoNsMap(input, this.header, 0);
    final CaseScoreResult scoreResult = modelRunner.computeNsData(nsData);
    if (scoreResult == null) {
        log.error("Get null result.");
        return null;
    }

    final NSColumn targetColumn = new NSColumn(targetColumnName);
    final String tag = CommonUtils.trimTag(nsData.get(targetColumn));
    final boolean knownTag = negTags.contains(tag) || posTags.contains(tag);
    if (knownTag) {
        return scoreResult.getAvgScore();
    }

    // invalid record
    log.error("Detected invalid record. Its tag is - " + tag);
    return null;
}
Also used : CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 10 with NSColumn

use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.

the class VarSelectModelProcessor method postProcess4SEVarSelect.

/**
 * Applies the result of sensitivity-based (SE) variable selection to the column configuration.
 *
 * <p>Steps: clear every existing final-select mark and re-select all force-selected columns; read
 * the column ids (one per line) from the first {@code part-r-*} file under the output path; walk
 * those ids in file order and final-select each column that is in the user's candidate list (when
 * one exists) and is neither force-selected nor force-removed; finally load the SE statistics into
 * {@code seStatsMap}.
 *
 * @param source the file system type holding the output
 * @param varSelectMSEOutputPath directory containing the variable selection job output
 * @throws IOException declared by the method signature; presumably raised by the file system calls
 * @throws RuntimeException when no {@code part-r-*} output file exists
 * @throws NumberFormatException when a non-blank line of the output is not an integer column id
 */
private void postProcess4SEVarSelect(SourceType source, String varSelectMSEOutputPath) throws IOException {
    String outputFilePattern = varSelectMSEOutputPath + Path.SEPARATOR + "part-r-*";
    if (!ShifuFileUtils.isFileExists(outputFilePattern, source)) {
        throw new RuntimeException("Var select MSE stats output file not exist.");
    }
    int selectCnt = 0;
    for (ColumnConfig config : super.columnConfigList) {
        // drop the previous selection before applying the new one
        if (config.isFinalSelect()) {
            config.setFinalSelect(false);
        }
        // enable ForceSelect
        if (config.isForceSelect()) {
            config.setFinalSelect(true);
            selectCnt++;
            log.info("Variable {} is selected, since it is in ForceSelect list.", config.getColumnName());
        }
    }
    Set<NSColumn> userCandidateColumns = CommonUtils.loadCandidateColumns(modelConfig);
    List<Scanner> scanners = null;
    try {
        // here only works for 1 reducer
        FileStatus[] globStatus = ShifuFileUtils.getFileSystemBySourceType(source).globStatus(new Path(outputFilePattern));
        if (globStatus == null || globStatus.length == 0) {
            throw new RuntimeException("Var select MSE stats output file not exist.");
        }
        scanners = ShifuFileUtils.getDataScanners(globStatus[0].getPath().toString(), source);
        // total variable count that user want to select
        int targetCnt = 0;
        List<Integer> candidateColumnIdList = new ArrayList<Integer>();
        Scanner scanner = scanners.get(0);
        // Read line by line: hasNextLine() pairs with nextLine(), whereas the token-based hasNext()
        // lets a blank line through as "" and makes Integer.parseInt fail.
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine().trim();
            if (line.isEmpty()) {
                // blank lines carry no column id
                continue;
            }
            ++targetCnt;
            candidateColumnIdList.add(Integer.parseInt(line));
        }
        int i = 0;
        int candidateCount = candidateColumnIdList.size();
        // force-selected variables
        while (selectCnt < targetCnt && i < targetCnt) {
            // NOTE(review): targetCnt and candidateCount are both the number of ids read above, so
            // this guard cannot fire as written; kept in case targetCnt is meant to come from elsewhere.
            if (i >= candidateCount) {
                log.warn("Var select finish due candidate column {} is less than target var count {}", candidateCount, targetCnt);
                break;
            }
            Integer columnId = candidateColumnIdList.get(i++);
            // after supporting segments, the columns will expansion. the columnId may not the position
            // in columnConfigList. It's safe to columnId to search (make sure columnNum == columnId)
            ColumnConfig columnConfig = CommonUtils.getColumnConfig(this.columnConfigList, columnId);
            if (CollectionUtils.isNotEmpty(userCandidateColumns) && !userCandidateColumns.contains(new NSColumn(columnConfig.getColumnName()))) {
                log.info("Variable {} is not in user's candidate list. Skip it.", columnConfig.getColumnName());
            } else if (!columnConfig.isForceSelect() && !columnConfig.isForceRemove()) {
                // force-selected columns were already counted above; force-removed ones are never selected
                columnConfig.setFinalSelect(true);
                selectCnt++;
                log.info("Variable {} is selected.", columnConfig.getColumnName());
            }
        }
        log.info("{} variables are selected.", selectCnt);
        log.info("Sensitivity analysis report is in {}/{}-* file(s) with format 'column_index\tcolumn_name\tmean\trms\tvariance'.", varSelectMSEOutputPath, Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME);
        this.seStatsMap = readSEValuesToMap(varSelectMSEOutputPath + Path.SEPARATOR + Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME + "-*", source);
    } finally {
        // close every scanner that was opened, not only the one that was read
        if (scanners != null) {
            for (Scanner scanner : scanners) {
                if (scanner != null) {
                    scanner.close();
                }
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) NSColumn(ml.shifu.shifu.column.NSColumn)

Aggregations

NSColumn (ml.shifu.shifu.column.NSColumn)17 ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)8 CaseScoreResult (ml.shifu.shifu.container.CaseScoreResult)4 Tuple (org.apache.pig.data.Tuple)4 BasicML (org.encog.ml.BasicML)3 ModelRunner (ml.shifu.shifu.core.ModelRunner)2 ModelSpec (ml.shifu.shifu.core.model.ModelSpec)2 BasicMLData (org.encog.ml.data.basic.BasicMLData)2 BasicMLDataPair (org.encog.ml.data.basic.BasicMLDataPair)2 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1 Entry (java.util.Map.Entry)1 SortedMap (java.util.SortedMap)1 ColumnConfigComparator (ml.shifu.shifu.container.obj.ColumnConfig.ColumnConfigComparator)1 NormType (ml.shifu.shifu.container.obj.ModelNormalizeConf.NormType)1 ShifuException (ml.shifu.shifu.exception.ShifuException)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 Path (org.apache.hadoop.fs.Path)1