Usage of ml.shifu.shifu.container.obj.ColumnConfig in the Shifu project (ShifuML):
the exec method of the NormalizeParquetUDF class.
/**
 * Normalizes one raw record into a Pig tuple shaped as [tag, normalized values..., weight].
 * Records with a null or invalid tag, or not selected by data sampling, return null so that
 * Pig filters them out of the normalized data set.
 *
 * @param input raw record; the field at {@code tagColumnNum} is the target tag
 * @return normalized tuple, or {@code null} when the record is skipped
 * @throws IOException if reading a tuple field fails
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    // do data sampling. Unselected data or data with invalid tag will be filtered out.
    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    final String rawTag = CommonUtils.trimTag(tag.toString());
    boolean isNotSampled = DataSampler.isNotSampled(posTags, negTags, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(), rawTag);
    if (isNotSampled) {
        return null;
    }
    // append tuple with tag, normalized value.
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    for (int i = 0; i < input.size(); i++) {
        ColumnConfig config = columnConfigList.get(i);
        String val = (input.get(i) == null) ? "" : input.get(i).toString();
        // load variables for weight calculating.
        if (weightExpr != null) {
            weightContext.set(config.getColumnName(), val);
        }
        // check tag type.
        if (tagColumnNum == i) {
            String tagType = tagTypeCheck(posTags, negTags, rawTag);
            if (tagType == null) {
                log.error("Invalid data! The target value is not listed - " + rawTag);
                return null;
            }
            tuple.append(Integer.parseInt(tagType));
            continue;
        }
        // append normalize data.
        if (!CommonUtils.isGoodCandidate(config, super.hasCandidates)) {
            tuple.append((Double) null);
        } else {
            if (CommonUtils.isTreeModel(this.alg)) {
                if (config.isCategorical()) {
                    // Tree models consume categorical values raw; no normalization.
                    // BUG FIX: previously a spurious 0d was also appended after the raw
                    // value, emitting two fields for one column and shifting every
                    // following column by one position.
                    tuple.append(val);
                } else {
                    Double normVal;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    tuple.append(normVal);
                }
            } else {
                // Non-tree models: one column may normalize to several values (e.g. one-hot).
                List<Double> normVals = Normalizer.normalize(config, val, cutoff, normType);
                for (Double normVal : normVals) {
                    tuple.append(normVal);
                }
            }
        }
    }
    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}
Usage of ml.shifu.shifu.container.obj.ColumnConfig in the Shifu project (ShifuML):
the exec method of the NormalizeUDF class.
/**
 * Normalizes one raw record into a Pig tuple of [tag, normalized values..., weight].
 * Supports regression, linear-target and multi-class targets; compact-norm mode collects
 * values into a map and emits them in {@code outputCompactColumns} order; clean mode
 * (RF/GBT) passes values through with only missing/invalid handling; expression mode
 * handles segment-expansion where the input repeats for each segment.
 * Returns {@code null} to drop invalid or unsampled records.
 *
 * @param input raw record; the field at {@code tagColumnNum} is the target tag
 * @return normalized tuple, or {@code null} when the record is skipped
 * @throws IOException if reading a tuple field fails
 * @throws ShifuException if too many records mismatch the column config size
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    final String rawTag = CommonUtils.trimTag(tag.toString());
    // make sure all invalid tag record are filter out
    if (!isLinearTarget && !super.tagSet.contains(rawTag)) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    // if(!isLinearTarget && !this.isForClean) {
    if (!isLinearTarget) {
        // do data sampling. Unselected data or data with invalid tag will be filtered out.
        boolean isNotSampled = DataSampler.isNotSampled(modelConfig.isRegression(), super.tagSet, super.posTagSet, super.negTagSet, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(), rawTag);
        if (isNotSampled) {
            return null;
        }
    }
    // append tuple with tag, normalized value.
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    Map<String, Object> compactVarMap = null;
    if (this.isCompactNorm) {
        compactVarMap = new HashMap<String, Object>();
    }
    if (!this.isForExpressions) {
        if (input.size() != this.columnConfigList.size()) {
            // BUG FIX: mismatchCnt was incremented twice per malformed record (before and
            // after the log call), so MAX_MISMATCH_CNT tripped after only half the
            // intended number of bad records. Count each mismatch exactly once.
            this.mismatchCnt++;
            log.error("the input size - " + input.size() + ", while column size - " + columnConfigList.size());
            // this could make Shifu could skip some malformed data
            if (this.mismatchCnt > MAX_MISMATCH_CNT) {
                throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
            }
            return null;
        }
        for (int i = 0; i < input.size(); i++) {
            ColumnConfig config = columnConfigList.get(i);
            String val = (input.get(i) == null) ? "" : input.get(i).toString();
            // load variables for weight calculating.
            if (weightExpr != null) {
                weightContext.set(new NSColumn(config.getColumnName()).getSimpleName(), val);
            }
            // check tag type.
            if (tagColumnNum == i) {
                if (modelConfig.isRegression()) {
                    // binary target: 1 for positive tags, 0 for negative tags
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else if (this.isLinearTarget) {
                    // linear target: tag itself must be numeric
                    double tagValue = 0.0;
                    try {
                        tagValue = Double.parseDouble(rawTag);
                    } catch (Exception e) {
                        log.error("Tag - " + rawTag + " is invalid(not numerical). Skip record.");
                        // skip this line
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), tagValue);
                    } else {
                        tuple.append(tagValue);
                    }
                } else {
                    // multi-class target: emit the index of the tag group containing rawTag
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isForClean) {
                // for RF/GBT model, only clean data, not real do norm data
                if (config.isCategorical()) {
                    Map<String, Integer> map = this.categoricalIndexMap.get(config.getColumnNum());
                    // map should not be null, no need check if map is null, if val not in binCategory, set it to ""
                    tuple.append(((map.get(val) == null || map.get(val) == -1)) ? "" : val);
                } else {
                    Double normVal = 0d;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    appendOutputValue(tuple, normVal, true);
                }
            } else {
                if (this.isCompactNorm) {
                    // only output features and target, weight in compact norm mode
                    if (!config.isMeta() && config.isFinalSelect()) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            String formatVal = getOutputValue(normVal, true);
                            compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                        }
                    } else if (config.isMeta()) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                    } else {
                        // if is compact mode but such column is not final selected, should be empty, as only append
                        // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                        // we need meta column?
                    }
                } else {
                    // it will cause variable fail to normalize
                    if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            appendOutputValue(tuple, normVal, true);
                        }
                    } else {
                        tuple.append(config.isMeta() ? val : null);
                    }
                }
            }
        }
    } else {
        // for segment expansion variables: the raw input repeats for each expanded segment
        int rawSize = input.size();
        for (int i = 0; i < this.columnConfigList.size(); i++) {
            ColumnConfig config = this.columnConfigList.get(i);
            int newIndex = i >= rawSize ? i % rawSize : i;
            String val = (input.get(newIndex) == null) ? "" : input.get(newIndex).toString();
            // for target column
            if (config.isTarget()) {
                if (modelConfig.isRegression()) {
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else {
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isCompactNorm) {
                // only output features and target, weight in compact norm mode
                if (!config.isMeta() && config.isFinalSelect()) {
                    // for multiple classification, binPosRate means rate of such category over all counts,
                    // reuse binPosRate for normalize
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        String formatVal = getOutputValue(normVal, true);
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                    }
                } else if (config.isMeta()) {
                    compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                } else {
                    // if is compact mode but such column is not final selected, should be empty, as only append
                    // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                    // we need meta column?
                }
            } else {
                // for others
                if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        appendOutputValue(tuple, normVal, true);
                    }
                } else {
                    tuple.append(config.isMeta() ? val : null);
                }
            }
        }
    }
    // for compact norm mode, output to tuple at here
    if (this.isCompactNorm) {
        for (int i = 0; i < outputCompactColumns.size(); i++) {
            tuple.append(compactVarMap.get(outputCompactColumns.get(i)));
        }
    }
    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}
Usage of ml.shifu.shifu.container.obj.ColumnConfig in the Shifu project (ShifuML):
the loadColumnConfigList method of the CommonUtils class.
/**
 * Loads the column configuration list from a ColumnConfig.json file.
 *
 * @param path
 *            file path of ColumnConfig.json
 * @param sourceType
 *            source type: hdfs or local
 * @param nullSampleValues
 *            when true, sample values are reset to null to reduce memory usage; this matters
 *            especially in Pig UDFs where a large ColumnConfig.json can otherwise cause OOM.
 * @return column config list
 * @throws IOException
 *             if any IO exception in parsing json.
 * @throws IllegalArgumentException
 *             if {@code path} is null or empty, if sourceType is null.
 */
public static List<ColumnConfig> loadColumnConfigList(String path, SourceType sourceType, boolean nullSampleValues) throws IOException {
    ColumnConfig[] loadedConfigs = loadJSON(path, sourceType, ColumnConfig[].class);
    List<ColumnConfig> result = new ArrayList<ColumnConfig>(loadedConfigs.length);
    for (ColumnConfig cc : loadedConfigs) {
        // drop sample values when requested, to save memory
        if (nullSampleValues) {
            cc.setSampleValues(null);
        }
        // for categorical columns, build a category -> bin-index map for fast lookup
        if (cc.isCategorical() && cc.getColumnBinning() != null && cc.getColumnBinning().getBinCategory() != null) {
            List<String> binCategories = cc.getColumnBinning().getBinCategory();
            Map<String, Integer> cateIndexMap = new HashMap<String, Integer>();
            int binIndex = 0;
            for (String category : binCategories) {
                if (category.contains(Constants.CATEGORICAL_GROUP_VAL_DELIMITER)) {
                    // a merged category is flattened so every member maps to the same bin;
                    // split helper from this project avoids a guava dependency
                    String[] members = ml.shifu.shifu.core.dtrain.StringUtils.split(category, Constants.CATEGORICAL_GROUP_VAL_DELIMITER);
                    for (String member : members) {
                        cateIndexMap.put(member, binIndex);
                    }
                } else {
                    cateIndexMap.put(category, binIndex);
                }
                binIndex++;
            }
            cc.getColumnBinning().setBinCateMap(cateIndexMap);
        }
        result.add(cc);
    }
    return result;
}
Usage of ml.shifu.shifu.container.obj.ColumnConfig in the Shifu project (ShifuML):
the exec method of the BinningDataMergeUDF class.
/*
 * (non-Javadoc)
 *
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
/**
 * Merges partial binning results for one column into a single serialized bin string.
 * Input is (columnId, bag of partial binning strings); output is (columnId, mergedBinStr),
 * or null when the input is malformed or no valid partial binning exists.
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    // BUG FIX: also require two fields, consistent with BinningDataUDF.exec; a
    // one-field tuple previously threw IndexOutOfBoundsException on input.get(1).
    if (input == null || input.size() < 2) {
        return null;
    }
    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    if (columnId == null || databag == null) {
        return null;
    }
    // segment-expanded column ids wrap around the real column config list
    int corrColumnId = columnId;
    if (corrColumnId >= super.columnConfigList.size()) {
        corrColumnId = corrColumnId % super.columnConfigList.size();
    }
    ColumnConfig columnConfig = super.columnConfigList.get(corrColumnId);
    AbstractBinning<?> binning = null;
    AbstractBinning<?> backupBinning = null;
    log.info("Start merging bin info for columnId - " + columnId + ", the bag size is - " + databag.size());
    Iterator<Tuple> iterator = databag.iterator();
    while (iterator.hasNext()) {
        Tuple element = iterator.next();
        if (element == null || element.size() < 2) {
            continue;
        }
        String objValStr = (String) element.get(1);
        String hybridCateValStr = null;
        long start = System.currentTimeMillis();
        // for hybrid, split into numerical part and backup categorical part
        if (columnConfig.isHybrid()) {
            String[] splits = CommonUtils.split(objValStr, Constants.HYBRID_BIN_STR_DILIMETER);
            objValStr = splits[0];
            hybridCateValStr = splits[1];
        }
        AbstractBinning<?> partialBinning = AbstractBinning.constructBinningFromStr(modelConfig, columnConfig, objValStr);
        AbstractBinning<?> partialBackupBinning = null;
        if (columnConfig.isHybrid()) {
            partialBackupBinning = new CategoricalBinning();
            partialBackupBinning.stringToObj(hybridCateValStr);
        }
        log.info("constructBinningFromStr: " + (System.currentTimeMillis() - start) + "ms");
        start = System.currentTimeMillis();
        if (binning == null) {
            // first valid partial binning seeds the merge
            binning = partialBinning;
            if (columnConfig.isHybrid()) {
                backupBinning = partialBackupBinning;
            }
        } else {
            binning.mergeBin(partialBinning);
            if (columnConfig.isHybrid()) {
                backupBinning.mergeBin(partialBackupBinning);
            }
        }
        log.info("mergeBin: " + (System.currentTimeMillis() - start) + "ms");
    }
    // BUG FIX: an empty/invalid bag left binning null and the code below threw NPE
    // on binning.getDataBin(); skip such columns instead.
    if (binning == null) {
        log.warn("No valid bin info found for columnId - " + columnId + ", skip it.");
        return null;
    }
    Tuple output = TupleFactory.getInstance().newTuple(2);
    output.set(0, columnId);
    List<?> binFields = binning.getDataBin();
    // it will consume too much memory when join them together, that will cause OOM exception
    if (columnConfig.isCategorical() && binFields.size() > this.maxCategorySize) {
        log.warn(columnId + " " + columnConfig.getColumnName() + " is over maximal categorical size: " + this.maxCategorySize);
        output.set(1, "");
    } else {
        if (columnConfig.isHybrid()) {
            String finalBinStr = StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            finalBinStr += Constants.HYBRID_BIN_STR_DILIMETER + StringUtils.join(backupBinning.getDataBin(), CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            output.set(1, finalBinStr);
        } else {
            output.set(1, StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
        }
    }
    log.info("Finish merging bin info for columnId - " + columnId);
    return output;
}
Usage of ml.shifu.shifu.container.obj.ColumnConfig in the Shifu project (ShifuML):
the exec method of the BinningDataUDF class.
/*
 * (non-Javadoc)
 *
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
/**
 * Computes bin information for one column from its bag of raw values.
 * Input is (columnId, bag of values); output is (columnId, serialized bin string),
 * where an oversized categorical bin list is emitted as "".
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
        return null;
    }
    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    ColumnConfig columnConfig = super.columnConfigList.get(columnId);

    // pick the binning strategy for this column
    AbstractBinning<?> binning;
    if (columnConfig.isCategorical()) {
        binning = new CategoricalBinning(-1, super.modelConfig.getMissingOrInvalidValues(), this.maxCategorySize);
    } else if (super.modelConfig.getBinningMethod().equals(BinningMethod.EqualInterval)) {
        binning = new EqualIntervalBinning(modelConfig.getStats().getMaxNumBin());
    } else {
        int maxNumBin = modelConfig.getStats().getMaxNumBin();
        switch(this.modelConfig.getBinningAlgorithm()) {
            case Native:
                log.info("Invoke Native binning method, memory cosuming!!");
                // always merge bins
                binning = new NativeBinning(maxNumBin, true);
                break;
            case SPDT:
            case SPDTI:
                log.info("Invoke SPDT(Streaming Parallel Decision Tree) binning method, ");
                binning = new EqualPopulationBinning(maxNumBin);
                break;
            case MunroPat:
            case MunroPatI:
                log.info("Invoke Munro & Paterson selecting algorithm");
                binning = new MunroPatBinning(maxNumBin);
                break;
            default:
                log.info("Default: Invoke Munro & Paterson selecting algorithm");
                binning = new MunroPatBinning(maxNumBin);
                break;
        }
    }

    // feed every non-null value in the bag into the binning
    Iterator<Tuple> bagIterator = databag.iterator();
    while (bagIterator.hasNext()) {
        Tuple record = bagIterator.next();
        if (record == null || record.size() < 2) {
            continue;
        }
        Object rawVal = record.get(1);
        if (rawVal != null) {
            binning.addData(rawVal.toString());
        }
    }

    Tuple result = TupleFactory.getInstance().newTuple(2);
    result.set(0, columnId);
    // Do check here. It's because if there are too many value for categorical variable,
    // it will consume too much memory when join them together, that will cause OOM exception
    List<?> bins = binning.getDataBin();
    result.set(1, bins.size() > this.maxCategorySize ? "" : StringUtils.join(bins, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
    log.info("Finish merging bin info for columnId - " + columnId);
    return result;
}
Aggregations