Use of ml.shifu.shifu.core.BasicStatsCalculator in project shifu by ShifuML.
From the class StatsCalculateWorker, method calculateColumnStats.
/**
 * Bins the values of one column and writes the binning result, the basic
 * statistics and the KS/IV/WOE metrics back onto its {@link ColumnConfig}.
 *
 * <p>If {@code valueObjList} is null or empty an error is logged and the
 * column configuration is left untouched.
 *
 * @param columnConfig
 *            configuration of the column being analysed; updated in place
 * @param valueObjList
 *            the values collected for this column
 */
private void calculateColumnStats(ColumnConfig columnConfig, List<ValueObject> valueObjList) {
    if (CollectionUtils.isEmpty(valueObjList)) {
        log.error("No values for column : {}, please check!", columnConfig.getColumnName());
        return;
    }

    // Requested binning type: numerical wins, then categorical, otherwise let the binning decide.
    final BinningDataType requestedType = columnConfig.isNumerical()
            ? BinningDataType.Numerical
            : (columnConfig.isCategorical() ? BinningDataType.Categorical : BinningDataType.Auto);

    // --- Binning ---
    final Binning binner = new Binning(modelConfig.getPosTags(), modelConfig.getNegTags(), requestedType,
            valueObjList);
    log.info("posTags - {}, negTags - {}, first example tag - {}", modelConfig.getPosTags(),
            modelConfig.getNegTags(), valueObjList.get(0).getTag());
    binner.setMaxNumOfBins(modelConfig.getBinningExpectedNum());
    binner.setBinningMethod(modelConfig.getBinningMethod());
    binner.setAutoTypeThreshold(modelConfig.getAutoTypeThreshold());
    binner.setMergeEnabled(Boolean.TRUE);
    binner.doBinning();

    // --- Basic stats, computed over the value list as updated by the binning step ---
    final BasicStatsCalculator basicStats = new BasicStatsCalculator(binner.getUpdatedVoList(),
            modelConfig.getNumericalValueThreshold());

    // --- KS / IV / WOE, once from raw bin counts and once from weighted bin counts ---
    final ColumnMetrics countMetrics = ColumnStatsCalculator.calculateColumnMetrics(binner.getBinCountNeg(),
            binner.getBinCountPos());
    final ColumnMetrics weightedMetrics = ColumnStatsCalculator.calculateColumnMetrics(
            binner.getBinWeightedNeg(), binner.getBinWeightedPos());

    // The binning step reports the type it actually used; anything non-numerical is stored as categorical.
    final BinningDataType resolvedType = binner.getUpdatedDataType();
    if (resolvedType.equals(BinningDataType.Numerical)) {
        columnConfig.setColumnType(ColumnType.N);
        columnConfig.setBinBoundary(binner.getBinBoundary());
    } else {
        columnConfig.setColumnType(ColumnType.C);
        columnConfig.setBinCategory(binner.getBinCategory());
    }

    // Per-bin counts and positive rate.
    columnConfig.setBinCountNeg(binner.getBinCountNeg());
    columnConfig.setBinCountPos(binner.getBinCountPos());
    columnConfig.setBinPosCaseRate(binner.getBinPosCaseRate());

    // Basic statistics.
    columnConfig.setMax(basicStats.getMax());
    columnConfig.setMin(basicStats.getMin());
    columnConfig.setMean(basicStats.getMean());
    columnConfig.setStdDev(basicStats.getStdDev());
    columnConfig.setMedian(basicStats.getMedian());

    // Weighted per-bin counts.
    columnConfig.setBinWeightedNeg(binner.getBinWeightedNeg());
    columnConfig.setBinWeightedPos(binner.getBinWeightedPos());

    // Metrics may be unavailable (null); in that case the corresponding fields are simply not set.
    if (countMetrics != null) {
        columnConfig.setKs(countMetrics.getKs());
        columnConfig.setIv(countMetrics.getIv());
        columnConfig.getColumnStats().setWoe(countMetrics.getWoe());
        columnConfig.getColumnBinning().setBinCountWoe(countMetrics.getBinningWoe());
    }
    if (weightedMetrics != null) {
        columnConfig.getColumnStats().setWeightedKs(weightedMetrics.getKs());
        columnConfig.getColumnStats().setWeightedIv(weightedMetrics.getIv());
        columnConfig.getColumnStats().setWeightedWoe(weightedMetrics.getWoe());
        columnConfig.getColumnBinning().setBinWeightedWoe(weightedMetrics.getBinningWoe());
    }
    // columnConfig.setMissingCnt(cnt)
}
Use of ml.shifu.shifu.core.BasicStatsCalculator in project shifu by ShifuML.
From the class CalculateStatsUDF, method exec.
/**
 * Computes binning, basic statistics and KS/IV for a single column.
 *
 * <p>The input is read as {@code (columnNum, bag)}. Every tuple in the bag is read as
 * {@code (value, tag, weight)}:
 * <ul>
 * <li>records whose tag is {@code null} are counted as missing and skipped;</li>
 * <li>in numerical mode, values that cannot be parsed as a double or that exceed
 * {@code valueThreshold} are counted as missing and skipped;</li>
 * <li>in categorical/auto mode, empty values are counted as missing but are still kept
 * for binning.</li>
 * </ul>
 *
 * <p>The output tuple holds, in order: column number, bin categories or bin boundaries,
 * negative bin counts, positive bin counts, bin average scores, bin positive rates, KS, IV,
 * max, min, mean, standard deviation, column type ({@code "N"} or {@code "C"}), median,
 * missing count, total count, missing ratio, weighted negative bin counts and weighted
 * positive bin counts. The KS and IV fields are {@code null} when the column metrics could
 * not be calculated.
 *
 * @param input
 *            tuple of {@code (columnNum, bag of (value, tag, weight))}
 * @return the assembled statistics tuple, or {@code null} when the input is null/empty or
 *         fewer than 10 usable records remain after filtering
 * @throws IOException
 *             propagated from reading fields of the input tuples
 */
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }

    TupleFactory tupleFactory = TupleFactory.getInstance();
    Integer columnNum = (Integer) input.get(0);
    DataBag bag = (DataBag) input.get(1);

    // Decide how the column is binned. Disabling categorical support forces numerical binning.
    BinningDataType dataType;
    if (modelConfig.isCategoricalDisabled()) {
        dataType = BinningDataType.Numerical;
    } else {
        if (columnConfigList.get(columnNum).isCategorical()) {
            dataType = BinningDataType.Categorical;
        } else if (columnConfigList.get(columnNum).isNumerical()) {
            dataType = BinningDataType.Numerical;
        } else if (modelConfig.isBinningAutoTypeEnabled()) {
            // if type is Auto, and the auto type enable is true
            dataType = BinningDataType.Auto;
        } else {
            // if type is Auto, but the auto type enable is false
            dataType = BinningDataType.Numerical;
        }
    }

    List<ValueObject> voList = new ArrayList<ValueObject>();
    Iterator<Tuple> iterator = bag.iterator();
    log.debug("****** The element count in bag is : " + bag.size());

    long total = 0L;
    long missing = 0L;
    while (iterator.hasNext()) {
        total++;
        Tuple t = iterator.next();

        // A record without a tag cannot contribute to positive/negative counts.
        if (t.get(1) == null) {
            missing++;
            continue;
        }

        ValueObject vo = new ValueObject();
        String valueStr = ((t.get(0) == null) ? "" : t.get(0).toString());

        if (dataType.equals(BinningDataType.Numerical)) {
            Double value = null;
            try {
                value = Double.valueOf(valueStr);
            } catch (NumberFormatException e) {
                // Deliberately not logged: if there are too many log lines, it will cause
                // ReduceTask - `java.lang.OutOfMemoryError: Java heap space`
                // log.warn("Incorrect data, not numerical - " + valueStr);
                missing++;
                continue;
            }

            if (value > valueThreshold) {
                log.warn("Exceed Threshold: " + value + " / " + valueThreshold);
                missing++;
                continue;
            }
            vo.setValue(value);
        } else {
            // Categorical or Auto: an empty value is counted as missing but is still binned.
            if (StringUtils.isEmpty(valueStr)) {
                missing++;
            }
            vo.setRaw(valueStr);
        }

        // do not need to catch exception, see AddColumnNumUDF which have already normalized the weight value
        vo.setWeight(Double.valueOf(t.get(2).toString()));
        vo.setTag(CommonUtils.trimTag(t.get(1).toString()));
        // vo.setScore(Double.valueOf(t.get(2).toString()));
        voList.add(vo);
    }

    // Too few usable records to produce meaningful statistics.
    if (voList.size() < 10) {
        return null;
    }

    // Calculate Binning
    Binning binning = new Binning(modelConfig.getPosTags(), modelConfig.getNegTags(), dataType, voList);
    binning.setMaxNumOfBins(modelConfig.getBinningExpectedNum());
    binning.setBinningMethod(modelConfig.getBinningMethod());
    binning.setAutoTypeThreshold(modelConfig.getBinningAutoTypeThreshold());
    binning.setMergeEnabled(modelConfig.isBinningMergeEnabled());
    binning.doBinning();

    // Calculate Basic Stats
    BasicStatsCalculator basicStatsCalculator = new BasicStatsCalculator(binning.getUpdatedVoList(),
            this.valueThreshold);
    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binning.getBinCountNeg(),
            binning.getBinCountPos());

    // Assemble the results
    Tuple tuple = tupleFactory.newTuple();
    tuple.append(columnNum);
    if (binning.getUpdatedDataType().equals(BinningDataType.Categorical)) {
        tuple.append("[" + StringUtils.join(binning.getBinCategory(), CATEGORY_VAL_SEPARATOR) + "]");
    } else {
        tuple.append(binning.getBinBoundary().toString());
    }
    tuple.append(binning.getBinCountNeg().toString());
    tuple.append(binning.getBinCountPos().toString());
    // tuple.append(null);
    tuple.append(binning.getBinAvgScore().toString());
    tuple.append(binning.getBinPosCaseRate().toString());

    // Other callers of calculateColumnMetrics treat a null result as possible. Guard here too,
    // so a column without computable metrics yields null KS/IV fields (keeping every later field
    // at its usual position) instead of failing the whole task with a NullPointerException.
    tuple.append(columnCountMetrics == null ? null : df.format(columnCountMetrics.getKs()));
    tuple.append(columnCountMetrics == null ? null : df.format(columnCountMetrics.getIv()));

    tuple.append(df.format(basicStatsCalculator.getMax()));
    tuple.append(df.format(basicStatsCalculator.getMin()));
    tuple.append(df.format(basicStatsCalculator.getMean()));
    tuple.append(df.format(basicStatsCalculator.getStdDev()));
    if (binning.getUpdatedDataType().equals(BinningDataType.Numerical)) {
        tuple.append("N");
    } else {
        tuple.append("C");
    }
    tuple.append(df.format(basicStatsCalculator.getMedian()));
    tuple.append(df.format(missing));
    tuple.append(df.format(total));
    // total > 0 here: at least 10 records were accepted above.
    tuple.append(df.format((double) missing / total));
    tuple.append(binning.getBinWeightedNeg().toString());
    tuple.append(binning.getBinWeightedPos().toString());
    return tuple;
}
Aggregations