Use of ml.shifu.shifu.core.binning.CategoricalBinning in project shifu by ShifuML.
Class BinningDataMergeUDF, method exec.
/**
 * Merges the partial binning results of one column into a single bin description.
 *
 * <p>
 * Field 0 of {@code input} is read as the {@link Integer} column id and field 1 as a {@link DataBag}. Every
 * tuple of the bag with at least two fields carries, at index 1, a serialized partial binning string. Each
 * string is turned back into a binning object via {@code AbstractBinning.constructBinningFromStr} and merged
 * into the first one seen. For hybrid columns the string is split on
 * {@code Constants.HYBRID_BIN_STR_DILIMETER}; the second part is loaded into a {@link CategoricalBinning}
 * that is merged separately and appended to the output after the same delimiter.
 *
 * @param input
 *            tuple of (column id, bag of partial binning tuples)
 * @return a two-field tuple of (column id, joined bin string); the bin string is empty when a categorical
 *         column has more than {@code maxCategorySize} bins. Returns {@code null} when the input is
 *         {@code null}, has fewer than two fields, holds a {@code null} column id or bag, or when the bag
 *         contains no usable partial binning.
 * @throws IOException
 *             if reading a field from a tuple fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
        return null;
    }

    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    if (columnId == null || databag == null) {
        // nothing to merge; avoids NPE on unboxing the id or on iterating the bag
        return null;
    }

    // Ids at or beyond the config list size are wrapped back onto an existing column config
    // (presumably ids of expanded/segment columns - confirm against the caller).
    int corrColumnId = columnId;
    if (corrColumnId >= super.columnConfigList.size()) {
        corrColumnId = corrColumnId % super.columnConfigList.size();
    }
    ColumnConfig columnConfig = super.columnConfigList.get(corrColumnId);
    final boolean isHybrid = columnConfig.isHybrid();

    AbstractBinning<?> binning = null;
    AbstractBinning<?> backupBinning = null;
    log.info("Start merging bin info for columnId - " + columnId + ", the bag size is - " + databag.size());

    Iterator<Tuple> iterator = databag.iterator();
    while (iterator.hasNext()) {
        Tuple element = iterator.next();
        if (element == null || element.size() < 2) {
            continue;
        }

        String objValStr = (String) element.get(1);
        String hybridCateValStr = null;
        long start = System.currentTimeMillis();

        // for hybrid, split into the primary binning part and the categorical backup part
        if (isHybrid) {
            String[] splits = CommonUtils.split(objValStr, Constants.HYBRID_BIN_STR_DILIMETER);
            objValStr = splits[0];
            hybridCateValStr = splits[1];
        }

        AbstractBinning<?> partialBinning = AbstractBinning.constructBinningFromStr(modelConfig, columnConfig,
                objValStr);
        AbstractBinning<?> partialBackupBinning = null;
        if (isHybrid) {
            partialBackupBinning = new CategoricalBinning();
            partialBackupBinning.stringToObj(hybridCateValStr);
        }
        log.info("constructBinningFromStr: " + (System.currentTimeMillis() - start) + "ms");

        start = System.currentTimeMillis();
        if (binning == null) {
            // the first usable partial result becomes the accumulator
            binning = partialBinning;
            if (isHybrid) {
                backupBinning = partialBackupBinning;
            }
        } else {
            binning.mergeBin(partialBinning);
            if (isHybrid) {
                backupBinning.mergeBin(partialBackupBinning);
            }
        }
        log.info("mergeBin: " + (System.currentTimeMillis() - start) + "ms");
    }

    if (binning == null) {
        // Empty bag, or every element was skipped: there is no accumulator to read bins from.
        log.warn("No valid partial bin info found for columnId - " + columnId + ", nothing to merge");
        return null;
    }

    Tuple output = TupleFactory.getInstance().newTuple(2);
    output.set(0, columnId);

    List<?> binFields = binning.getDataBin();
    // it will consume too much memory when join them together, that will cause OOM exception
    if (columnConfig.isCategorical() && binFields.size() > this.maxCategorySize) {
        log.warn(columnId + " " + columnConfig.getColumnName() + " is over maximal categorical size: "
                + this.maxCategorySize);
        output.set(1, "");
    } else {
        if (isHybrid) {
            String finalBinStr = StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            finalBinStr += Constants.HYBRID_BIN_STR_DILIMETER
                    + StringUtils.join(backupBinning.getDataBin(), CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            output.set(1, finalBinStr);
        } else {
            output.set(1, StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
        }
    }

    log.info("Finish merging bin info for columnId - " + columnId);
    return output;
}
Use of ml.shifu.shifu.core.binning.CategoricalBinning in project shifu by ShifuML.
Class BinningDataUDF, method exec.
/**
 * Builds the bins of one column from the raw values collected in a bag.
 *
 * <p>
 * Field 0 of {@code input} is read as the {@link Integer} column id and field 1 as a {@link DataBag}. The
 * binning implementation is chosen from the column and model configuration: {@link CategoricalBinning} for
 * categorical columns, {@code EqualIntervalBinning} when the binning method is
 * {@code BinningMethod.EqualInterval}, otherwise the implementation matching the configured binning
 * algorithm. Every non-null value found at index 1 of a bag tuple is added to the binning as a string.
 *
 * @param input
 *            tuple of (column id, bag of value tuples)
 * @return a two-field tuple of (column id, joined bin string); the bin string is empty when a categorical
 *         column has more than {@code maxCategorySize} bins. Returns {@code null} when the input is
 *         {@code null}, has fewer than two fields, or holds a {@code null} column id or bag.
 * @throws IOException
 *             if reading a field from a tuple fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
        return null;
    }

    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    if (columnId == null || databag == null) {
        // avoids NPE on unboxing the id or on iterating the bag
        return null;
    }

    ColumnConfig columnConfig = super.columnConfigList.get(columnId);

    AbstractBinning<?> binning = null;
    if (columnConfig.isCategorical()) {
        binning = new CategoricalBinning(-1, super.modelConfig.getMissingOrInvalidValues(),
                this.maxCategorySize);
    } else {
        if (super.modelConfig.getBinningMethod().equals(BinningMethod.EqualInterval)) {
            binning = new EqualIntervalBinning(modelConfig.getStats().getMaxNumBin());
        } else {
            switch(this.modelConfig.getBinningAlgorithm()) {
                case Native:
                    log.info("Invoke Native binning method, memory cosuming!!");
                    // always merge bins
                    binning = new NativeBinning(modelConfig.getStats().getMaxNumBin(), true);
                    break;
                case SPDT:
                case SPDTI:
                    log.info("Invoke SPDT(Streaming Parallel Decision Tree) binning method, ");
                    binning = new EqualPopulationBinning(modelConfig.getStats().getMaxNumBin());
                    break;
                case MunroPat:
                case MunroPatI:
                    log.info("Invoke Munro & Paterson selecting algorithm");
                    binning = new MunroPatBinning(modelConfig.getStats().getMaxNumBin());
                    break;
                default:
                    log.info("Default: Invoke Munro & Paterson selecting algorithm");
                    binning = new MunroPatBinning(modelConfig.getStats().getMaxNumBin());
                    break;
            }
        }
    }

    Iterator<Tuple> iterator = databag.iterator();
    while (iterator.hasNext()) {
        Tuple element = iterator.next();
        if (element == null || element.size() < 2) {
            continue;
        }
        Object value = element.get(1);
        if (value != null) {
            binning.addData(value.toString());
        }
    }

    Tuple output = TupleFactory.getInstance().newTuple(2);
    output.set(0, columnId);

    // Do check here. It's because if there are too many value for categorical variable,
    // it will consume too much memory when join them together, that will cause OOM exception.
    // The limit is a category limit, so it is only applied to categorical columns; bins of
    // other column types must not be dropped because of it.
    List<?> dataBin = binning.getDataBin();
    if (columnConfig.isCategorical() && dataBin.size() > this.maxCategorySize) {
        output.set(1, "");
    } else {
        output.set(1, StringUtils.join(dataBin, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
    }

    log.info("Finish merging bin info for columnId - " + columnId);
    return output;
}
Use of ml.shifu.shifu.core.binning.CategoricalBinning in project shifu by ShifuML.
Class BinningPartialDataUDF, method exec.
/**
 * Builds a partial binning for the column found in a bag and returns it in serialized form.
 *
 * <p>
 * Field 0 of {@code input} is read as a {@link DataBag}. While the instance's {@code columnId} is negative,
 * the first usable tuple initializes it from the tuple's field 0 (wrapped by the size of the column config
 * list when it is out of range) together with {@code columnConfig}, {@code binning} and, for hybrid columns,
 * {@code backUpbinning}. Every non-null value at index 1 of a tuple is added to the binning; for hybrid
 * columns, values that are missing, not numeric, or below the hybrid threshold are also added to the
 * categorical backup binning. {@code cleanUp()} is invoked before the result is returned.
 *
 * @param input
 *            tuple whose first field is the bag of (column id, value, ...) tuples
 * @return the serialized binning, followed for hybrid columns by
 *         {@code Constants.HYBRID_BIN_STR_DILIMETER} and the serialized backup binning; {@code null} when
 *         the input is {@code null} or empty, the bag is {@code null}, or no binning was created
 * @throws IOException
 *             if reading a field from a tuple fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public String exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1) {
        return null;
    }

    DataBag databag = (DataBag) input.get(0);
    if (databag == null) {
        return null;
    }

    Iterator<Tuple> iterator = databag.iterator();
    while (iterator.hasNext()) {
        Tuple element = iterator.next();
        // both the column id (index 0) and the value (index 1) are needed
        if (element == null || element.size() < 2) {
            continue;
        }

        if (columnId < 0) {
            // lazy initialization from the first usable element
            columnId = (Integer) element.get(0);
            if (columnId >= super.columnConfigList.size()) {
                columnId = columnId % super.columnConfigList.size();
            }
            columnConfig = super.columnConfigList.get(columnId);

            if (columnConfig.isCategorical()) {
                binning = new CategoricalBinning(-1, modelConfig.getMissingOrInvalidValues(),
                        this.maxCategorySize);
            } else {
                if (super.modelConfig.getBinningMethod().equals(BinningMethod.EqualInterval)) {
                    binning = new EqualIntervalBinning(
                            modelConfig.getStats().getMaxNumBin() > 0 ? modelConfig.getStats().getMaxNumBin()
                                    : 1024,
                            modelConfig.getMissingOrInvalidValues());
                } else {
                    binning = new EqualPopulationBinning(
                            modelConfig.getStats().getMaxNumBin() > 0 ? modelConfig.getStats().getMaxNumBin()
                                    : 1024,
                            modelConfig.getMissingOrInvalidValues());
                }
            }

            if (columnConfig.isHybrid()) {
                this.backUpbinning = new CategoricalBinning(-1, modelConfig.getMissingOrInvalidValues(),
                        this.maxCategorySize);
            }
        }

        Object value = element.get(1);
        if (value != null) {
            String valStr = value.toString();
            if (isWeightBinningMethod() && binning instanceof EqualPopulationBinning) {
                ((EqualPopulationBinning) binning).addData(valStr,
                        (Double) element.get(AddColumnNumUDF.COLUMN_WEIGHT_INDX));
            } else {
                binning.addData(valStr);
            }

            if (this.columnConfig.isHybrid()) {
                // missing value and not number value go to categorical binning
                double douVal = BinUtils.parseNumber(valStr);
                Double hybridThreshold = this.columnConfig.getHybridThreshold();
                if (hybridThreshold == null) {
                    hybridThreshold = Double.NEGATIVE_INFINITY;
                }
                // douVal < hybridThreshold which will also be set to category
                boolean isCategory = Double.isNaN(douVal) || douVal < hybridThreshold;
                if (douVal < hybridThreshold) {
                    // message is built by concatenation, so no "{}" placeholder belongs in it
                    log.warn("douVal " + douVal + ", threshold " + hybridThreshold + ", column "
                            + columnConfig.getColumnName());
                }
                if (binning.isMissingVal(valStr) || isCategory) {
                    this.backUpbinning.addData(valStr);
                }
            }
        }
    }

    String binningObjStr = ((binning == null) ? null : binning.objToString());
    // columnConfig is only set together with binning; when the bag had no usable element both may
    // still be unset, so guard before asking whether the column is hybrid.
    if (binning != null && this.columnConfig != null && this.columnConfig.isHybrid()) {
        binningObjStr += Constants.HYBRID_BIN_STR_DILIMETER + this.backUpbinning.objToString();
    }
    cleanUp();
    return binningObjStr;
}
Aggregations