Search in sources :

Example 1 with VarSelDesc

use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.

the class VarSelectModelProcessor method runAutoVarFilter.

/**
 * Runs the automatic variable filter and persists what it changed.
 * <p>
 * When the configured post-correlation metric is {@code SE} and no SE statistics have been loaded yet, they are
 * read first from the var-select MSE output path of the data set's source. After that
 * {@link #autoVarSelCondition(List)} is run, and any resulting {@link VarSelDesc} records are written to the
 * variable selection history file on the local file system.
 *
 * @throws IOException
 *             if reading the SE statistics or writing the history file fails
 */
private void runAutoVarFilter() throws IOException {
    boolean seMetricConfigured = this.modelConfig.getVarSelect().getPostCorrelationMetric()
            .equals(PostCorrelationMetric.SE);
    if (seMetricConfigured && this.seStatsMap == null) {
        // lazily load SE statistics; they are only read once per processor instance
        SourceType dataSource = this.modelConfig.getDataSet().getSource();
        String seOutputDir = super.getPathFinder().getVarSelectMSEOutputPath(dataSource);
        String seFilePattern = seOutputDir + Path.SEPARATOR + Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME + "-*";
        this.seStatsMap = readSEValuesToMap(seFilePattern, dataSource);
    }

    List<VarSelDesc> filteredColumns = new ArrayList<VarSelDesc>();
    autoVarSelCondition(filteredColumns);

    // nothing was filtered: leave the history file untouched
    if (CollectionUtils.isNotEmpty(filteredColumns)) {
        String historyPath = this.pathFinder.getVarSelHistory();
        ShifuFileUtils.writeLines(filteredColumns, historyPath, SourceType.LOCAL);
    }
}
Also used : VarSelDesc(ml.shifu.shifu.core.history.VarSelDesc) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType)

Example 2 with VarSelDesc

use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.

the class VarSelectModelProcessor method loadVarSelDescList.

/**
 * Load the variable selection history file into a list of {@link VarSelDesc}.
 * <p>
 * Every line of the file is parsed with {@link VarSelDesc#fromString(String)}; lines for which that call returns
 * {@code null} are skipped.
 *
 * @param varselHistory
 *            - path of the variable selection history file on the local file system
 * @return the parsed descriptors in file order, never {@code null}
 * @throws IOException
 *             if the history file cannot be opened or read
 */
private List<VarSelDesc> loadVarSelDescList(String varselHistory) throws IOException {
    List<String> autoFilterList;
    Reader reader = ShifuFileUtils.getReader(varselHistory, SourceType.LOCAL);
    try {
        autoFilterList = IOUtils.readLines(reader);
    } finally {
        // close in finally so the reader is released even when readLines fails
        IOUtils.closeQuietly(reader);
    }

    List<VarSelDesc> varSelDescList = new ArrayList<VarSelDesc>();
    for (String filterDesc : autoFilterList) {
        VarSelDesc varSelDesc = VarSelDesc.fromString(filterDesc);
        if (varSelDesc != null) {
            varSelDescList.add(varSelDesc);
        }
    }
    return varSelDescList;
}
Also used : VarSelDesc(ml.shifu.shifu.core.history.VarSelDesc) Reader(java.io.Reader) BufferedReader(java.io.BufferedReader)

Example 3 with VarSelDesc

use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.

the class VarSelectModelProcessor method autoVarSelCondition.

/**
 * Applies the automatic variable selection rules and records every column that gets de-selected.
 * <p>
 * Only columns that are not target, not meta, not force-selected and currently final-selected are considered by
 * the first two steps:
 * <ol>
 * <li>columns with a high missing rate are de-selected;</li>
 * <li>columns whose IV or KS is below the configured minimum (0 when not configured) are de-selected;</li>
 * <li>if the local correlation CSV file exists, correlation based selection is run.</li>
 * </ol>
 *
 * @param varSelDescList
 *            output list receiving one {@link VarSelDesc} per de-selection
 * @throws IOException
 *             any IO exception
 */
private void autoVarSelCondition(List<VarSelDesc> varSelDescList) throws IOException {
    // step 1: missing rate
    for (ColumnConfig column : columnConfigList) {
        if (column.isTarget() || column.isMeta() || column.isForceSelect() || !column.isFinalSelect()) {
            // column is not a candidate for auto filtering
            continue;
        }
        if (!isHighMissingRateColumn(column)) {
            continue;
        }
        log.warn("Column {} is with very high missing rate, set final select to false. If not, you can check it manually in ColumnConfig.json", column.getColumnName());
        column.setFinalSelect(false);
        varSelDescList.add(new VarSelDesc(column, VarSelReason.HIGH_MISSING_RATE));
    }

    // step 2: minimal IV and KS thresholds
    for (ColumnConfig column : columnConfigList) {
        if (column.isTarget() || column.isMeta() || column.isForceSelect() || !column.isFinalSelect()) {
            continue;
        }

        float minIvThreshold = (super.modelConfig.getVarSelect().getMinIvThreshold() == null ? 0f : super.modelConfig.getVarSelect().getMinIvThreshold());
        boolean ivTooLow = column.getIv() != null && column.getIv() < minIvThreshold;
        if (ivTooLow) {
            log.warn("IV of column {} is less than minimal IV threshold, set final select to false. If not, you can check it manually in ColumnConfig.json", column.getColumnName());
            column.setFinalSelect(false);
            varSelDescList.add(new VarSelDesc(column, VarSelReason.IV_TOO_LOW));
        }

        // the KS check runs even when the IV check already de-selected the column
        float minKsThreshold = (super.modelConfig.getVarSelect().getMinKsThreshold() == null ? 0f : super.modelConfig.getVarSelect().getMinKsThreshold());
        boolean ksTooLow = column.getKs() != null && column.getKs() < minKsThreshold;
        if (ksTooLow) {
            log.warn("KS of column {} is less than minimal KS threshold, set final select to false. If not, you can check it manually in ColumnConfig.json", column.getColumnName());
            column.setFinalSelect(false);
            varSelDescList.add(new VarSelDesc(column, VarSelReason.KS_TOO_LOW));
        }
    }

    // step 3: correlation, only possible when the local correlation file is present
    boolean correlationFileExists = ShifuFileUtils.isFileExists(pathFinder.getLocalCorrelationCsvPath(), SourceType.LOCAL);
    if (correlationFileExists) {
        varSelectByCorrelation(varSelDescList);
    }
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) VarSelDesc(ml.shifu.shifu.core.history.VarSelDesc)

Example 4 with VarSelDesc

use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.

the class VarSelectModelProcessor method recoverVarselStatusFromHist.

/**
 * Undo the auto-filter decisions recorded in the variable selection history file.
 * <p>
 * For every history record, the column is looked up by the record's column id. If its current final-select flag
 * still equals the status the record set, the flag is restored to the record's old status; otherwise the column
 * is left as it is.
 *
 * @param varselHistory
 *            - variable selection history file
 * @throws IOException
 *             if the history file cannot be read
 */
private void recoverVarselStatusFromHist(String varselHistory) throws IOException {
    for (VarSelDesc historyEntry : loadVarSelDescList(varselHistory)) {
        ColumnConfig column = this.columnConfigList.get(historyEntry.getColumnId());
        if (column.isFinalSelect() != historyEntry.getNewSelStatus()) {
            // status no longer matches what the history entry set, do not override it
            continue;
        }
        log.info("Recover column - {} from {} to {}", historyEntry.getColumnName(), historyEntry.getNewSelStatus(), historyEntry.getOldSelStatus());
        column.setFinalSelect(historyEntry.getOldSelStatus());
    }
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) VarSelDesc(ml.shifu.shifu.core.history.VarSelDesc)

Example 5 with VarSelDesc

use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.

the class VarSelectModelProcessor method varSelectByCorrelation.

/**
 * De-selects one column of every column pair whose absolute correlation exceeds
 * {@code VarSelect#correlationThreshold}, reading the values from the local correlation CSV file.
 * <p>
 * The first two lines of the file are skipped. Every further line is only used if it splits into
 * {@code columnConfigList.size() + 2} fields; its first field is parsed as the index of the row column. Only rows
 * whose column is final-selected or the target are inspected, and each row column is only compared with columns
 * of a larger index that are final-selected or the target.
 * <p>
 * Which column of a pair is dropped:
 * <ul>
 * <li>both force-selected: none;</li>
 * <li>exactly one force-selected: the other one;</li>
 * <li>one is the target: the non-target one;</li>
 * <li>otherwise: decided by {@code checkCorrelationMetric} with the configured post-correlation metric.</li>
 * </ul>
 *
 * @param varSelDescList
 *            output list receiving a {@link VarSelDesc} for de-selected columns
 * @throws IOException
 *             if the correlation file cannot be read
 */
// TODO refactor me please, bad function
private void varSelectByCorrelation(List<VarSelDesc> varSelDescList) throws IOException {
    BufferedReader reader = ShifuFileUtils.getReader(pathFinder.getLocalCorrelationCsvPath(), SourceType.LOCAL);
    int lineNum = 0;
    try {
        String line = null;
        while ((line = reader.readLine()) != null) {
            lineNum += 1;
            if (lineNum <= 2) {
                // skip first 2 lines which are indexes and names
                continue;
            }
            String[] columns = CommonUtils.split(line, ",");
            if (columns == null || columns.length != columnConfigList.size() + 2) {
                // line does not have the expected number of fields, ignore it
                continue;
            }
            int columnIndex = Integer.parseInt(columns[0].trim());
            ColumnConfig config = this.columnConfigList.get(columnIndex);
            // only check final-selected columns and the target column
            if (!config.isFinalSelect() && !config.isTarget()) {
                continue;
            }
            double[] corrArray = getCorrArray(columns);
            for (int i = 0; i < corrArray.length; i++) {
                // only check column larger than current column index and already final selected
                if (config.getColumnNum() >= i) {
                    continue;
                }
                ColumnConfig other = columnConfigList.get(i);
                if (!other.isTarget() && !other.isFinalSelect()) {
                    continue;
                }
                double absCorr = Math.abs(corrArray[i]);
                // * 1.000005d is to avoid some value like 1.0000000002 in correlation value
                if (absCorr > (modelConfig.getVarSelect().getCorrelationThreshold() * 1.000005d)) {
                    if (config.isForceSelect() && other.isForceSelect()) {
                        log.warn("{} and {} has high correlated value but both not to be removed because both are force-selected", columnIndex, i);
                    } else if (config.isForceSelect() && !other.isForceSelect()) {
                        // arguments match the 8 placeholders: correlation, pair indexes, pair names, threshold, dropped column
                        log.warn("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} is not force-selected will not be selected, set finalSelect to false.", absCorr, config.getColumnNum(), i, config.getColumnName(), other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(), other.getColumnNum(), other.getColumnName());
                        other.setFinalSelect(false);
                        varSelDescList.add(new VarSelDesc(other, VarSelReason.HIGH_CORRELATED));
                    } else if (!config.isForceSelect() && other.isForceSelect()) {
                        log.warn("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} is not force-selected will not be selected, set finalSelect to false.", absCorr, config.getColumnNum(), i, config.getColumnName(), other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(), config.getColumnNum(), config.getColumnName());
                        config.setFinalSelect(false);
                        varSelDescList.add(new VarSelDesc(config, VarSelReason.HIGH_CORRELATED));
                    } else if (config.isTarget() && other.isFinalSelect()) {
                        // row column is the target, so the compared column i is the one dropped
                        log.warn("{} and {} has high correlated value while {} is target, {} is set to NOT final-selected no matter it is force-selected or not.", columnIndex, i, columnIndex, i);
                        other.setFinalSelect(false);
                        // NOTE(review): unlike the other branches no VarSelDesc is recorded here, so this
                        // de-selection is not part of the history - confirm whether that is intended
                    } else if (config.isFinalSelect() && other.isTarget()) {
                        // compared column i is the target, so the row column is the one dropped
                        log.warn("{} and {} has high correlated value while {} is target, {} is set to NOT final-selected no matter it is force-selected or not.", columnIndex, i, i, columnIndex);
                        config.setFinalSelect(false);
                        varSelDescList.add(new VarSelDesc(config, VarSelReason.HIGH_CORRELATED));
                    } else {
                        // both columns are not target and all final selected
                        ColumnConfig dropConfig = null;
                        PostCorrelationMetric corrMetric = modelConfig.getVarSelect().getPostCorrelationMetric();
                        if (checkCorrelationMetric(config, other, corrMetric)) {
                            dropConfig = other;
                        } else {
                            dropConfig = config;
                        }
                        // correlation comparison by SE RMS value
                        if ((this.modelConfig.getVarSelectFilterBy().equalsIgnoreCase(Constants.FILTER_BY_SE) || this.modelConfig.getVarSelectFilterBy().equalsIgnoreCase(Constants.FILTER_BY_ST)) && corrMetric == PostCorrelationMetric.SE && this.seStatsMap != null && this.seStatsMap.get(config.getColumnNum()) != null && this.seStatsMap.get(other.getColumnNum()) != null) {
                            log.warn("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} with smaller SE RMS value will not be selected, set finalSelect to false.", absCorr, config.getColumnNum(), i, config.getColumnName(), other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(), dropConfig.getColumnNum(), dropConfig.getColumnName());
                        } else {
                            log.info("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} with smaller {} value will not be selected, set finalSelect to false.", absCorr, config.getColumnNum(), i, config.getColumnName(), other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(), dropConfig.getColumnNum(), dropConfig.getColumnName(), corrMetric);
                        }
                        // de-select column which is dropped in current logic
                        dropConfig.setFinalSelect(false);
                        varSelDescList.add(new VarSelDesc(dropConfig, VarSelReason.HIGH_CORRELATED));
                    }
                }
            }
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) VarSelDesc(ml.shifu.shifu.core.history.VarSelDesc) BufferedReader(java.io.BufferedReader) PostCorrelationMetric(ml.shifu.shifu.container.obj.ModelVarSelectConf.PostCorrelationMetric)

Aggregations

VarSelDesc (ml.shifu.shifu.core.history.VarSelDesc)5 ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)3 BufferedReader (java.io.BufferedReader)2 Reader (java.io.Reader)1 PostCorrelationMetric (ml.shifu.shifu.container.obj.ModelVarSelectConf.PostCorrelationMetric)1 SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType)1