Use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
From class BasicUpdater, method updateColumnConfig:
/**
 * Re-derives the flag and the type of one column from the configured column lists.
 * <p>
 * The flag is cleared first and then assigned by priority: target, meta, force-remove, force-select, weight,
 * candidate. The type is decided afterwards with its own priority: weight, target, hybrid, categorical and
 * numerical for everything else.
 *
 * @param columnConfig
 *            the column to update in place
 */
public void updateColumnConfig(ColumnConfig columnConfig) {
    final String columnName = columnConfig.getColumnName();
    // start from a clean state; a column matching none of the lists keeps a null flag
    columnConfig.setColumnFlag(null);
    assignColumnFlag(columnConfig, columnName);
    assignColumnType(columnConfig, columnName);
}

/**
 * Sets the flag of the column according to the first list it belongs to.
 */
private void assignColumnFlag(ColumnConfig columnConfig, String columnName) {
    if (NSColumnUtils.isColumnEqual(this.targetColumnName, columnName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Target);
        columnConfig.setColumnType(null);
        return;
    }

    final NSColumn column = new NSColumn(columnName);
    if (this.setMeta.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Meta);
        columnConfig.setColumnType(null);
        return;
    }
    if (this.setForceRemove.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceRemove);
        return;
    }
    if (this.setForceSelect.contains(column)) {
        // a force-select column is only honored when there is no candidate list or it is part of that list;
        // otherwise the column stays without flag
        boolean allowedByCandidates = CollectionUtils.isEmpty(this.setCandidates)
                || this.setCandidates.contains(column);
        if (allowedByCandidates) {
            columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceSelect);
        }
        return;
    }
    if (NSColumnUtils.isColumnEqual(this.weightColumnName, columnName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Weight);
        columnConfig.setColumnType(null);
        return;
    }
    if (this.setCandidates.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Candidate);
    }
}

/**
 * Sets the type of the column; every path assigns a type, numerical being the fallback.
 */
private void assignColumnType(ColumnConfig columnConfig, String columnName) {
    if (NSColumnUtils.isColumnEqual(weightColumnName, columnName)) {
        // weight column is numerical
        columnConfig.setColumnType(ColumnType.N);
        return;
    }
    if (NSColumnUtils.isColumnEqual(targetColumnName, columnName)) {
        // empty tags are allowed to support a linear target, which is numerical; otherwise the target is
        // categorical
        boolean noTags = CollectionUtils.isEmpty(this.modelConfig.getTags());
        columnConfig.setColumnType(noTags ? ColumnType.N : ColumnType.C);
        return;
    }

    final NSColumn column = new NSColumn(columnName);
    if (setHybridColumns.contains(column)) {
        columnConfig.setColumnType(ColumnType.H);
        // the threshold is looked up by full name in strict namespace mode, by simple name otherwise
        boolean strictMode = Environment.getBoolean(Constants.SHIFU_NAMESPACE_STRICT_MODE, false);
        String thresholdKey = strictMode ? column.getFullColumnName() : column.getSimpleName();
        columnConfig.setHybridThreshold(hybridColumnNames.get(thresholdKey));
        return;
    }

    // meta and other columns are numerical unless the user lists them in the categorical column configuration
    columnConfig.setColumnType(setCategorialColumns.contains(column) ? ColumnType.C : ColumnType.N);
}
Use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
From class TrainUpdater, method updateColumnConfig:
/**
 * Re-derives the flag of one column before training.
 * <p>
 * The flag is cleared and then assigned by priority: target, meta, force-remove, force-select, candidate. The
 * target column additionally gets its type, meta and force-remove columns are taken out of the final
 * selection.
 *
 * @param columnConfig
 *            the column to update in place
 */
@Override
public void updateColumnConfig(ColumnConfig columnConfig) {
    // always start from an unflagged column
    columnConfig.setColumnFlag(null);

    final String columnName = columnConfig.getColumnName();
    if (NSColumnUtils.isColumnEqual(this.targetColumnName, columnName)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Target);
        // empty tags are allowed to support a linear target, which is numerical; otherwise the target is
        // categorical
        boolean noTags = CollectionUtils.isEmpty(this.modelConfig.getTags());
        columnConfig.setColumnType(noTags ? ColumnType.N : ColumnType.C);
        return;
    }

    final NSColumn column = new NSColumn(columnName);
    if (this.setMeta.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Meta);
        // setting false is harmless: if nothing is selected yet, still nothing is selected afterwards
        columnConfig.setFinalSelect(false);
        return;
    }
    if (this.setForceRemove.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceRemove);
        // setting false is harmless: if nothing is selected yet, still nothing is selected afterwards
        columnConfig.setFinalSelect(false);
        return;
    }
    if (this.setForceSelect.contains(column)) {
        // honored only when there is no candidate list or the column is part of that list
        boolean allowedByCandidates = CollectionUtils.isEmpty(this.setCandidates)
                || this.setCandidates.contains(column);
        if (allowedByCandidates) {
            columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.ForceSelect);
            // WARN: finalSelect must not be set here. With SE variable selection the first step trains a
            // model; if force-selected columns became final-selected at that point, selection would only
            // consider those columns, which is not correct.
            // Known gap: when the user extends the force-select list after variable selection and trains
            // again, the newly added variables are not used unless variable selection is run again. A
            // solution for this is still to be figured out.
        }
        return;
    }
    if (this.setCandidates.contains(column)) {
        columnConfig.setColumnFlag(ColumnConfig.ColumnFlag.Candidate);
    }
}
Use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
From class NormalizeUDF, method exec:
/**
 * Normalizes one raw record.
 * <p>
 * The record is skipped, i.e. {@code null} is returned, when it is empty, when its tag is missing, when the tag
 * is not contained in the tag set (non-linear target) or cannot be encoded, when the record is filtered out by
 * sampling, or when its field count differs from the column configuration. Otherwise the returned tuple holds
 * the encoded target and the values of the emitted columns, followed by the record weight as last field. In
 * compact mode the values are first collected by normalized column name and then written in the order of
 * {@code outputCompactColumns}.
 *
 * @param input
 *            the raw record
 * @return the normalized tuple, or {@code null} when the record is skipped
 * @throws IOException
 *             if a field cannot be read from the input tuple
 * @throws ShifuException
 *             once more than {@code MAX_MISMATCH_CNT} records with a wrong field count have been seen
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    final String rawTag = CommonUtils.trimTag(tag.toString());
    // make sure all records with an invalid tag are filtered out
    if (!isLinearTarget && !super.tagSet.contains(rawTag)) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    if (!isLinearTarget) {
        // do data sampling. Unselected data or data with invalid tag will be filtered out.
        boolean isNotSampled = DataSampler.isNotSampled(modelConfig.isRegression(), super.tagSet, super.posTagSet,
                super.negTagSet, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(),
                rawTag);
        if (isNotSampled) {
            return null;
        }
    }

    // output tuple: target and normalized values, weight at the end
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    Map<String, Object> compactVarMap = null;
    if (this.isCompactNorm) {
        compactVarMap = new HashMap<String, Object>();
    }

    if (!this.isForExpressions) {
        if (input.size() != this.columnConfigList.size()) {
            log.error("the input size - " + input.size() + ", while column size - " + columnConfigList.size());
            // count each malformed record exactly once, so that MAX_MISMATCH_CNT malformed records are
            // tolerated before the job is failed
            this.mismatchCnt++;
            if (this.mismatchCnt > MAX_MISMATCH_CNT) {
                throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
            }
            return null;
        }

        for (int i = 0; i < input.size(); i++) {
            ColumnConfig config = columnConfigList.get(i);
            String val = (input.get(i) == null) ? "" : input.get(i).toString();

            // load variables for weight calculating.
            if (weightExpr != null) {
                weightContext.set(new NSColumn(config.getColumnName()).getSimpleName(), val);
            }

            if (tagColumnNum == i) {
                if (!appendTargetValue(tuple, compactVarMap, config, rawTag, true)) {
                    return null;
                }
                continue;
            }

            if (this.isForClean) {
                // for RF/GBT model, only clean data, no real normalization
                if (config.isCategorical()) {
                    Map<String, Integer> map = this.categoricalIndexMap.get(config.getColumnNum());
                    // map is expected to be non-null here; a value not in binCategory is replaced by ""
                    Integer categoryIndex = map.get(val);
                    tuple.append((categoryIndex == null || categoryIndex == -1) ? "" : val);
                } else {
                    Double normVal = 0d;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    appendOutputValue(tuple, normVal, true);
                }
            } else {
                appendFeatureValue(tuple, compactVarMap, config, val, normType);
            }
        }
    } else {
        // for segment expansion variables: columns beyond the raw record size reuse the raw fields
        int rawSize = input.size();
        for (int i = 0; i < this.columnConfigList.size(); i++) {
            ColumnConfig config = this.columnConfigList.get(i);
            int newIndex = i >= rawSize ? i % rawSize : i;
            String val = (input.get(newIndex) == null) ? "" : input.get(newIndex).toString();

            if (config.isTarget()) {
                // NOTE(review): a linear target is not encoded as a number on this path, the tag is looked up
                // in the tag list instead - confirm whether this is intended
                if (!appendTargetValue(tuple, compactVarMap, config, rawTag, false)) {
                    return null;
                }
                continue;
            }

            appendFeatureValue(tuple, compactVarMap, config, val, normType);
        }
    }

    // for compact norm mode, output to tuple at here
    if (this.isCompactNorm) {
        for (int i = 0; i < outputCompactColumns.size(); i++) {
            tuple.append(compactVarMap.get(outputCompactColumns.get(i)));
        }
    }

    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}

/**
 * Encodes the tag and writes it for the target column, into the compact map in compact mode or into the
 * tuple otherwise.
 *
 * @return {@code false} when the tag cannot be encoded and the record has to be skipped
 */
private boolean appendTargetValue(Tuple tuple, Map<String, Object> compactVarMap, ColumnConfig config,
        String rawTag, boolean linearTargetAllowed) {
    Object target = encodeTargetValue(rawTag, linearTargetAllowed);
    if (target == null) {
        return false;
    }
    if (this.isCompactNorm) {
        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), target);
    } else {
        tuple.append(target);
    }
    return true;
}

/**
 * Encodes the tag: 1/0 for positive/negative tags when {@code modelConfig.isRegression()} is true, the parsed
 * number for a linear target (only if allowed by the caller), otherwise the index of the first entry of
 * {@code tags} containing the tag.
 *
 * @return the encoded value as Integer or Double, or {@code null} (after logging) when the tag is invalid
 */
private Object encodeTargetValue(String rawTag, boolean linearTargetAllowed) {
    if (modelConfig.isRegression()) {
        if (super.posTagSet.contains(rawTag)) {
            return 1;
        }
        if (super.negTagSet.contains(rawTag)) {
            return 0;
        }
        log.error("Invalid data! The target value is not listed - " + rawTag);
        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
        return null;
    }

    if (linearTargetAllowed && this.isLinearTarget) {
        try {
            return Double.parseDouble(rawTag);
        } catch (Exception e) {
            log.error("Tag - " + rawTag + " is invalid(not numerical). Skip record.");
            return null;
        }
    }

    for (int j = 0; j < tags.size(); j++) {
        if (tags.get(j).contains(rawTag)) {
            return j;
        }
    }
    log.error("Invalid data! The target value is not listed - " + rawTag);
    warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
    return null;
}

/**
 * Writes the value(s) of a non-target column, into the compact map in compact mode or into the tuple
 * otherwise.
 */
private void appendFeatureValue(Tuple tuple, Map<String, Object> compactVarMap, ColumnConfig config, String val,
        NormType normType) throws IOException {
    if (this.isCompactNorm) {
        // only output features and target, weight in compact norm mode
        if (!config.isMeta() && config.isFinalSelect()) {
            // for multiple classification, binPosRate means rate of such category over all counts,
            // reuse binPosRate for normalize
            List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType,
                    this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
            // NOTE: all values are stored under the same key, so only the last normalized value is kept
            for (Double normVal : normVals) {
                String formatVal = getOutputValue(normVal, true);
                compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
            }
        } else if (config.isMeta()) {
            compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
        }
        // a column that is neither meta nor final-selected is not emitted in compact mode.
        // TODO, do we need meta column?
        return;
    }

    if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
        // for multiple classification, binPosRate means rate of such category over all counts,
        // reuse binPosRate for normalize
        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType,
                this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
        for (Double normVal : normVals) {
            appendOutputValue(tuple, normVal, true);
        }
    } else {
        // keep the raw value for meta columns, a null placeholder for everything else
        tuple.append(config.isMeta() ? val : null);
    }
}
Use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
From class SimpleScoreUDF, method exec:
/**
 * Scores one record and returns the average score.
 *
 * @param input
 *            the raw record; it is converted into a column-to-value map using the header of this UDF
 * @return the average score, or {@code null} when no score result is produced or when the tag of the record is
 *         neither in the negative nor in the positive tags
 * @throws IOException
 *             declared for the calls made while converting and scoring the record
 */
public Double exec(Tuple input) throws IOException {
    final Map<NSColumn, String> record = CommonUtils.convertDataIntoNsMap(input, this.header, 0);
    final CaseScoreResult scoreResult = modelRunner.computeNsData(record);
    if (scoreResult == null) {
        log.error("Get null result.");
        return null;
    }

    final String tag = CommonUtils.trimTag(record.get(new NSColumn(targetColumnName)));
    final boolean knownTag = negTags.contains(tag) || posTags.contains(tag);
    if (knownTag) {
        return scoreResult.getAvgScore();
    }

    // invalid record
    log.error("Detected invalid record. Its tag is - " + tag);
    return null;
}
Use of ml.shifu.shifu.column.NSColumn in project shifu by ShifuML.
From class VarSelectModelProcessor, method postProcess4SEVarSelect:
/**
 * Applies the outcome of SE (sensitivity) variable selection to the column configuration.
 * <p>
 * The previous final selection is reset, force-selected columns are selected again, and then column ids are
 * read, one per line, from the first {@code part-r-*} file under the output path. The number of ids read is
 * the total number of variables to select; ids are taken in file order until that number is reached. Columns
 * outside the user's candidate list, force-selected and force-removed columns are skipped. Finally the SE
 * statistics are loaded into {@code seStatsMap}.
 *
 * @param source
 *            the source type of the file system holding the output
 * @param varSelectMSEOutputPath
 *            the output folder of the variable selection job
 * @throws IOException
 *             if the output files cannot be accessed
 * @throws RuntimeException
 *             if no output file exists or it cannot be opened
 */
private void postProcess4SEVarSelect(SourceType source, String varSelectMSEOutputPath) throws IOException {
    String outputFilePattern = varSelectMSEOutputPath + Path.SEPARATOR + "part-r-*";
    if (!ShifuFileUtils.isFileExists(outputFilePattern, source)) {
        throw new RuntimeException("Var select MSE stats output file not exist.");
    }

    int selectCnt = 0;
    for (ColumnConfig config : super.columnConfigList) {
        // drop the previous selection
        if (config.isFinalSelect()) {
            config.setFinalSelect(false);
        }
        // enable ForceSelect
        if (config.isForceSelect()) {
            config.setFinalSelect(true);
            selectCnt++;
            log.info("Variable {} is selected, since it is in ForceSelect list.", config.getColumnName());
        }
    }

    Set<NSColumn> userCandidateColumns = CommonUtils.loadCandidateColumns(modelConfig);
    List<Scanner> scanners = null;
    try {
        // here only works for 1 reducer
        FileStatus[] globStatus = ShifuFileUtils.getFileSystemBySourceType(source)
                .globStatus(new Path(outputFilePattern));
        if (globStatus == null || globStatus.length == 0) {
            throw new RuntimeException("Var select MSE stats output file not exist.");
        }
        scanners = ShifuFileUtils.getDataScanners(globStatus[0].getPath().toString(), source);
        if (CollectionUtils.isEmpty(scanners)) {
            throw new RuntimeException("Var select MSE stats output file not exist.");
        }

        // read line by line and ignore blank lines, so that a stray empty line does not break the parsing
        List<Integer> candidateColumnIdList = new ArrayList<Integer>();
        Scanner idScanner = scanners.get(0);
        while (idScanner.hasNextLine()) {
            String line = idScanner.nextLine().trim();
            if (line.isEmpty()) {
                continue;
            }
            candidateColumnIdList.add(Integer.parseInt(line));
        }

        // total variable count that user want to select, force-selected variables included
        int targetCnt = candidateColumnIdList.size();
        for (int i = 0; i < targetCnt && selectCnt < targetCnt; i++) {
            Integer columnId = candidateColumnIdList.get(i);
            // after supporting segments, the columns are expanded and the columnId may not be the position
            // in columnConfigList, so the column is searched by id (columnNum == columnId)
            ColumnConfig columnConfig = CommonUtils.getColumnConfig(this.columnConfigList, columnId);
            if (columnConfig == null) {
                // presumably returned for an id that is not part of the column configuration - verify against
                // CommonUtils.getColumnConfig
                log.warn("Column id {} is not found in column configuration. Skip it.", columnId);
                continue;
            }
            if (CollectionUtils.isNotEmpty(userCandidateColumns)
                    && !userCandidateColumns.contains(new NSColumn(columnConfig.getColumnName()))) {
                log.info("Variable {} is not in user's candidate list. Skip it.", columnConfig.getColumnName());
            } else if (!columnConfig.isForceSelect() && !columnConfig.isForceRemove()) {
                columnConfig.setFinalSelect(true);
                selectCnt++;
                log.info("Variable {} is selected.", columnConfig.getColumnName());
            }
        }

        if (selectCnt < targetCnt) {
            // all candidate columns are consumed but skipped columns left the selection short
            log.warn("Var select finish with {} selected variables while target var count is {}, "
                    + "since no more candidate column is left.", selectCnt, targetCnt);
        }

        log.info("{} variables are selected.", selectCnt);
        log.info("Sensitivity analysis report is in {}/{}-* file(s) with format 'column_index\tcolumn_name\tmean\trms\tvariance'.", varSelectMSEOutputPath, Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME);
        this.seStatsMap = readSEValuesToMap(
                varSelectMSEOutputPath + Path.SEPARATOR + Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME + "-*", source);
    } finally {
        if (scanners != null) {
            for (Scanner scanner : scanners) {
                if (scanner != null) {
                    scanner.close();
                }
            }
        }
    }
}
Aggregations