use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.
the class VarSelectModelProcessor method runAutoVarFilter.
/**
 * Runs the automatic variable filters and persists the changes they made.
 * <p>
 * When the configured post-correlation metric is SE and {@code seStatsMap} has not been loaded yet, the SE
 * values are first read from the var-select MSE output path of the data set's source. Afterwards
 * {@code autoVarSelCondition} is applied; if it recorded at least one change, the collected descriptors are
 * written to the local variable selection history file.
 *
 * @throws IOException
 *             if reading the SE values or writing the history file fails
 */
private void runAutoVarFilter() throws IOException {
    // Identity comparison does not fail when no post-correlation metric is configured (null),
    // and matches how the same metric is compared in varSelectByCorrelation.
    boolean seMetricConfigured = this.modelConfig.getVarSelect().getPostCorrelationMetric() == PostCorrelationMetric.SE;
    if (seMetricConfigured && this.seStatsMap == null) {
        SourceType source = this.modelConfig.getDataSet().getSource();
        String varSelectMSEOutputPath = super.getPathFinder().getVarSelectMSEOutputPath(source);
        this.seStatsMap = readSEValuesToMap(
                varSelectMSEOutputPath + Path.SEPARATOR + Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME + "-*", source);
    }

    List<VarSelDesc> varSelDescList = new ArrayList<VarSelDesc>();
    autoVarSelCondition(varSelDescList);

    // Only touch the history file when the filters actually changed something.
    if (CollectionUtils.isNotEmpty(varSelDescList)) {
        String varselHistory = this.pathFinder.getVarSelHistory();
        ShifuFileUtils.writeLines(varSelDescList, varselHistory, SourceType.LOCAL);
    }
}
use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.
the class VarSelectModelProcessor method loadVarSelDescList.
/**
 * Loads the variable selection history file into a list of {@link VarSelDesc}.
 * <p>
 * Each line of the file is parsed with {@code VarSelDesc.fromString}; lines for which it returns {@code null}
 * are skipped.
 *
 * @param varselHistory
 *            - path of the (local) variable selection history file
 * @return the parsed descriptors in file order; empty if no line could be parsed
 * @throws IOException
 *             if the history file cannot be opened or read
 */
private List<VarSelDesc> loadVarSelDescList(String varselHistory) throws IOException {
    List<String> autoFilterList;
    Reader reader = ShifuFileUtils.getReader(varselHistory, SourceType.LOCAL);
    try {
        autoFilterList = IOUtils.readLines(reader);
    } finally {
        // close in finally so the reader is released even when reading fails
        IOUtils.closeQuietly(reader);
    }

    List<VarSelDesc> varSelDescList = new ArrayList<VarSelDesc>();
    for (String filterDesc : autoFilterList) {
        VarSelDesc varSelDesc = VarSelDesc.fromString(filterDesc);
        if (varSelDesc != null) {
            varSelDescList.add(varSelDesc);
        }
    }
    return varSelDescList;
}
use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.
the class VarSelectModelProcessor method autoVarSelCondition.
/**
 * Automatically de-selects variables that fail basic quality checks and records every change.
 * <p>
 * Only columns that are not target, not meta, not force-selected and currently final-selected are examined.
 * The checks run in this order: high missing rate, IV below the minimal IV threshold, KS below the minimal KS
 * threshold and, if the local correlation CSV exists, high correlation. The IV and KS checks are independent,
 * so a column failing both is recorded twice.
 *
 * @param varSelDescList
 *            list receiving one descriptor per de-selection performed here
 * @throws IOException
 *             any IO exception
 */
private void autoVarSelCondition(List<VarSelDesc> varSelDescList) throws IOException {
    // Pass 1: drop columns with a very high missing rate
    for (ColumnConfig column : columnConfigList) {
        boolean notCandidate = column.isTarget() || column.isMeta() || column.isForceSelect()
                || !column.isFinalSelect();
        if (notCandidate || !isHighMissingRateColumn(column)) {
            continue;
        }
        log.warn("Column {} is with very high missing rate, set final select to false. "
                + "If not, you can check it manually in ColumnConfig.json", column.getColumnName());
        column.setFinalSelect(false);
        varSelDescList.add(new VarSelDesc(column, VarSelReason.HIGH_MISSING_RATE));
    }

    // Pass 2: drop columns whose IV or KS is below the configured minimum (0 when not configured)
    for (ColumnConfig column : columnConfigList) {
        if (column.isTarget() || column.isMeta() || column.isForceSelect() || !column.isFinalSelect()) {
            continue;
        }

        float minIvThreshold = 0f;
        if (super.modelConfig.getVarSelect().getMinIvThreshold() != null) {
            minIvThreshold = super.modelConfig.getVarSelect().getMinIvThreshold();
        }
        if (column.getIv() != null && column.getIv() < minIvThreshold) {
            log.warn("IV of column {} is less than minimal IV threshold, set final select to false. "
                    + "If not, you can check it manually in ColumnConfig.json", column.getColumnName());
            column.setFinalSelect(false);
            varSelDescList.add(new VarSelDesc(column, VarSelReason.IV_TOO_LOW));
        }

        float minKsThreshold = 0f;
        if (super.modelConfig.getVarSelect().getMinKsThreshold() != null) {
            minKsThreshold = super.modelConfig.getVarSelect().getMinKsThreshold();
        }
        if (column.getKs() != null && column.getKs() < minKsThreshold) {
            log.warn("KS of column {} is less than minimal KS threshold, set final select to false. "
                    + "If not, you can check it manually in ColumnConfig.json", column.getColumnName());
            column.setFinalSelect(false);
            varSelDescList.add(new VarSelDesc(column, VarSelReason.KS_TOO_LOW));
        }
    }

    // Pass 3: correlation based filtering, only possible once the correlation CSV has been generated
    if (ShifuFileUtils.isFileExists(pathFinder.getLocalCorrelationCsvPath(), SourceType.LOCAL)) {
        varSelectByCorrelation(varSelDescList);
    }
}
use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.
the class VarSelectModelProcessor method recoverVarselStatusFromHist.
/**
 * Rolls back the final-select flags that were changed according to the variable selection history file.
 * <p>
 * For every history entry the column is looked up by its column id. The flag is reset to the entry's old status
 * only if the column still carries the entry's new status; otherwise the column is left as it is.
 *
 * @param varselHistory
 *            - variable selection history file
 * @throws IOException
 *             if the history file cannot be read
 */
private void recoverVarselStatusFromHist(String varselHistory) throws IOException {
    for (VarSelDesc historyEntry : loadVarSelDescList(varselHistory)) {
        ColumnConfig column = this.columnConfigList.get(historyEntry.getColumnId());
        if (column.isFinalSelect() != historyEntry.getNewSelStatus()) {
            // status no longer matches what the history recorded, nothing to undo
            continue;
        }
        log.info("Recover column - {} from {} to {}", historyEntry.getColumnName(),
                historyEntry.getNewSelStatus(), historyEntry.getOldSelStatus());
        column.setFinalSelect(historyEntry.getOldSelStatus());
    }
}
use of ml.shifu.shifu.core.history.VarSelDesc in project shifu by ShifuML.
the class VarSelectModelProcessor method varSelectByCorrelation.
/**
 * De-selects one variable out of every highly correlated column pair listed in the local correlation CSV.
 * <p>
 * The first two lines of the file (indexes and names) are skipped. Any other line is used only when it has
 * {@code columnConfigList.size() + 2} comma separated fields; its first field is the index of the row's column.
 * Rows whose column is neither final-selected nor the target are ignored. For every later column (index greater
 * than the row column's number) that is target or final-selected and whose absolute correlation exceeds
 * {@code correlationThreshold}, the following rules are evaluated in order:
 * <ol>
 * <li>both columns force-selected: nothing is removed;</li>
 * <li>exactly one column force-selected: the other column is removed;</li>
 * <li>one column is the target: the non-target column is removed;</li>
 * <li>otherwise: the column picked through {@code checkCorrelationMetric} is removed.</li>
 * </ol>
 * Every removal sets final-select to false and appends a HIGH_CORRELATED descriptor to
 * {@code varSelDescList}.
 *
 * @param varSelDescList
 *            list receiving one descriptor per de-selection performed here
 * @throws IOException
 *             if the correlation file cannot be opened or read
 */
// TODO refactor me please, bad function
private void varSelectByCorrelation(List<VarSelDesc> varSelDescList) throws IOException {
    BufferedReader reader = ShifuFileUtils.getReader(pathFinder.getLocalCorrelationCsvPath(), SourceType.LOCAL);
    int lineNum = 0;
    try {
        String line = null;
        while ((line = reader.readLine()) != null) {
            lineNum += 1;
            if (lineNum <= 2) {
                // skip first 2 lines which are indexes and names
                continue;
            }

            String[] columns = CommonUtils.split(line, ",");
            if (columns == null || columns.length != columnConfigList.size() + 2) {
                // line does not match the expected layout
                continue;
            }

            int columnIndex = Integer.parseInt(columns[0].trim());
            ColumnConfig config = this.columnConfigList.get(columnIndex);
            // only rows of final-selected columns or of the target are checked
            if (!config.isFinalSelect() && !config.isTarget()) {
                continue;
            }

            double[] corrArray = getCorrArray(columns);
            for (int i = 0; i < corrArray.length; i++) {
                // only check column larger than current column index ...
                if (config.getColumnNum() >= i) {
                    continue;
                }
                // ... and already final selected (or being the target)
                ColumnConfig other = columnConfigList.get(i);
                if (!other.isTarget() && !other.isFinalSelect()) {
                    continue;
                }

                // * 1.000005d is to avoid some value like 1.0000000002 in correlation value.
                // Kept as a positive '>' test so that a NaN correlation is treated as "not correlated".
                boolean highlyCorrelated = Math.abs(corrArray[i]) > (modelConfig.getVarSelect()
                        .getCorrelationThreshold() * 1.000005d);
                if (!highlyCorrelated) {
                    continue;
                }

                if (config.isForceSelect() && other.isForceSelect()) {
                    log.warn("{} and {} has high correlated value but both not to be removed because both are force-selected", columnIndex, i);
                } else if (config.isForceSelect() && !other.isForceSelect()) {
                    // arguments now match the 8 placeholders of the message (value, pair ids, pair names,
                    // threshold, dropped id, dropped name)
                    log.warn("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} is not force-selected will not be selected, set finalSelect to false.",
                            Math.abs(corrArray[i]), config.getColumnNum(), i, config.getColumnName(),
                            other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(),
                            other.getColumnNum(), other.getColumnName());
                    other.setFinalSelect(false);
                    varSelDescList.add(new VarSelDesc(other, VarSelReason.HIGH_CORRELATED));
                } else if (!config.isForceSelect() && other.isForceSelect()) {
                    log.warn("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} is not force-selected will not be selected, set finalSelect to false.",
                            Math.abs(corrArray[i]), config.getColumnNum(), i, config.getColumnName(),
                            other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(),
                            config.getColumnNum(), config.getColumnName());
                    config.setFinalSelect(false);
                    varSelDescList.add(new VarSelDesc(config, VarSelReason.HIGH_CORRELATED));
                } else if (config.isTarget() && other.isFinalSelect()) {
                    // row column is the target, so the other column is the one being dropped
                    log.warn("{} and {} has high correlated value while {} is target, {} is set to NOT final-selected no matter it is force-selected or not.",
                            columnIndex, i, columnIndex, i);
                    other.setFinalSelect(false);
                    // record the change like every other branch does, so it can be recovered from history
                    varSelDescList.add(new VarSelDesc(other, VarSelReason.HIGH_CORRELATED));
                } else if (config.isFinalSelect() && other.isTarget()) {
                    // other column is the target, so the row column is the one being dropped
                    log.warn("{} and {} has high correlated value while {} is target, {} is set to NOT final-selected no matter it is force-selected or not.",
                            columnIndex, i, i, columnIndex);
                    config.setFinalSelect(false);
                    varSelDescList.add(new VarSelDesc(config, VarSelReason.HIGH_CORRELATED));
                } else {
                    // neither force-select nor target decided the pair: let the configured metric pick
                    PostCorrelationMetric corrMetric = modelConfig.getVarSelect().getPostCorrelationMetric();
                    ColumnConfig dropConfig = checkCorrelationMetric(config, other, corrMetric) ? other : config;

                    // correlation comparison by SE RMS value
                    boolean filterBySensitivity = this.modelConfig.getVarSelectFilterBy().equalsIgnoreCase(Constants.FILTER_BY_SE)
                            || this.modelConfig.getVarSelectFilterBy().equalsIgnoreCase(Constants.FILTER_BY_ST);
                    if (filterBySensitivity && corrMetric == PostCorrelationMetric.SE && this.seStatsMap != null
                            && this.seStatsMap.get(config.getColumnNum()) != null
                            && this.seStatsMap.get(other.getColumnNum()) != null) {
                        log.warn("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} with smaller SE RMS value will not be selected, set finalSelect to false.",
                                Math.abs(corrArray[i]), config.getColumnNum(), i, config.getColumnName(),
                                other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(),
                                dropConfig.getColumnNum(), dropConfig.getColumnName());
                    } else {
                        log.info("Absolute correlation value {} in column pair ({}, {}) ({}, {}) are larger than correlationThreshold value {} set in VarSelect#correlationThreshold, column {} name {} with smaller {} value will not be selected, set finalSelect to false.",
                                Math.abs(corrArray[i]), config.getColumnNum(), i, config.getColumnName(),
                                other.getColumnName(), modelConfig.getVarSelect().getCorrelationThreshold(),
                                dropConfig.getColumnNum(), dropConfig.getColumnName(), corrMetric);
                    }

                    // de-select column which is dropped in current logic
                    dropConfig.setFinalSelect(false);
                    varSelDescList.add(new VarSelDesc(dropConfig, VarSelReason.HIGH_CORRELATED));
                }
            }
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }
}
Aggregations