Search in sources :

Example 1 with Binning

use of ml.shifu.shifu.core.Binning in project shifu by ShifuML.

In the class StatsCalculateWorker, method calculateColumnStats.

/**
 * Bins the values of one column, derives its basic statistics and its KS / IV / WOE metrics
 * (both count-based and weight-based), and stores the results on the column configuration.
 *
 * @param columnConfig the column configuration that receives the computed bins and statistics
 * @param valueObjList the values collected for this column; when it is empty an error is logged
 *                     and nothing is computed
 */
private void calculateColumnStats(ColumnConfig columnConfig, List<ValueObject> valueObjList) {
    // Guard clause: without values there is nothing to bin.
    if (CollectionUtils.isEmpty(valueObjList)) {
        log.error("No values for column : {}, please check!", columnConfig.getColumnName());
        return;
    }

    // Seed the binning type from the configured column type; anything else is left to Auto.
    BinningDataType requestedType = columnConfig.isNumerical()
            ? BinningDataType.Numerical
            : (columnConfig.isCategorical() ? BinningDataType.Categorical : BinningDataType.Auto);

    // Configure and run the binning.
    Binning binner = new Binning(modelConfig.getPosTags(), modelConfig.getNegTags(), requestedType, valueObjList);
    log.info("posTags - {}, negTags - {}, first example tag - {}", modelConfig.getPosTags(), modelConfig.getNegTags(), valueObjList.get(0).getTag());
    binner.setMaxNumOfBins(modelConfig.getBinningExpectedNum());
    binner.setBinningMethod(modelConfig.getBinningMethod());
    binner.setAutoTypeThreshold(modelConfig.getAutoTypeThreshold());
    binner.setMergeEnabled(Boolean.TRUE);
    binner.doBinning();

    // Basic statistics are computed from the value list as updated by the binning.
    BasicStatsCalculator basicStats = new BasicStatsCalculator(binner.getUpdatedVoList(), modelConfig.getNumericalValueThreshold());

    // KS / IV are derived from the per-bin negative and positive figures.
    ColumnMetrics countMetrics = ColumnStatsCalculator.calculateColumnMetrics(binner.getBinCountNeg(), binner.getBinCountPos());
    ColumnMetrics weightedMetrics = ColumnStatsCalculator.calculateColumnMetrics(binner.getBinWeightedNeg(), binner.getBinWeightedPos());

    // The binning reports the data type it ended up with; every non-numerical result is stored as categorical.
    BinningDataType resolvedType = binner.getUpdatedDataType();
    if (!resolvedType.equals(BinningDataType.Numerical)) {
        columnConfig.setColumnType(ColumnType.C);
        columnConfig.setBinCategory(binner.getBinCategory());
    } else {
        columnConfig.setColumnType(ColumnType.N);
        columnConfig.setBinBoundary(binner.getBinBoundary());
    }

    // Per-bin counts and positive rate.
    columnConfig.setBinCountNeg(binner.getBinCountNeg());
    columnConfig.setBinCountPos(binner.getBinCountPos());
    columnConfig.setBinPosCaseRate(binner.getBinPosCaseRate());

    // Basic statistics.
    columnConfig.setMax(basicStats.getMax());
    columnConfig.setMin(basicStats.getMin());
    columnConfig.setMean(basicStats.getMean());
    columnConfig.setStdDev(basicStats.getStdDev());
    columnConfig.setMedian(basicStats.getMedian());

    // Per-bin weighted figures.
    columnConfig.setBinWeightedNeg(binner.getBinWeightedNeg());
    columnConfig.setBinWeightedPos(binner.getBinWeightedPos());

    // Metrics may be unavailable (null); in that case the corresponding fields are left untouched.
    if (countMetrics != null) {
        columnConfig.setKs(countMetrics.getKs());
        columnConfig.setIv(countMetrics.getIv());
        columnConfig.getColumnStats().setWoe(countMetrics.getWoe());
        columnConfig.getColumnBinning().setBinCountWoe(countMetrics.getBinningWoe());
    }
    if (weightedMetrics != null) {
        columnConfig.getColumnStats().setWeightedKs(weightedMetrics.getKs());
        columnConfig.getColumnStats().setWeightedIv(weightedMetrics.getIv());
        columnConfig.getColumnStats().setWeightedWoe(weightedMetrics.getWoe());
        columnConfig.getColumnBinning().setBinWeightedWoe(weightedMetrics.getBinningWoe());
    }
    // Note: the missing count is not populated by this method.
}
Also used : Binning(ml.shifu.shifu.core.Binning) BinningDataType(ml.shifu.shifu.core.Binning.BinningDataType) BasicStatsCalculator(ml.shifu.shifu.core.BasicStatsCalculator) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics)

Example 2 with Binning

use of ml.shifu.shifu.core.Binning in project shifu by ShifuML.

In the class CalculateStatsUDF, method exec.

/**
 * Computes binning, KS / IV and basic statistics for a single column.
 *
 * <p>Input layout as read by this method: field 0 is the column number ({@code Integer}) and
 * field 1 is a {@code DataBag}. Each tuple in the bag carries the raw value (field 0), the tag
 * (field 1) and the weight (field 2).
 *
 * <p>Records with a null tag are counted as missing and skipped. For numerical binning, values
 * that cannot be parsed, that exceed {@code valueThreshold}, or that are NaN / infinite are also
 * counted as missing and skipped. For categorical / auto binning, empty values are counted as
 * missing but are still handed to the binning.
 *
 * @param input the tuple described above
 * @return a tuple holding, in order: column number, bin boundaries (or bin categories), negative
 *         bin counts, positive bin counts, bin average scores, bin positive rates, KS, IV, max,
 *         min, mean, standard deviation, column type ("N" or "C"), median, missing count, total
 *         count, missing ratio, weighted negative bin counts, weighted positive bin counts;
 *         or {@code null} when the input is null / empty or fewer than 10 usable records remain
 * @throws IOException propagated from the Pig tuple accessors
 */
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Integer columnNum = (Integer) input.get(0);
    DataBag bag = (DataBag) input.get(1);
    BinningDataType dataType;
    if (modelConfig.isCategoricalDisabled()) {
        dataType = BinningDataType.Numerical;
    } else {
        if (columnConfigList.get(columnNum).isCategorical()) {
            dataType = BinningDataType.Categorical;
        } else if (columnConfigList.get(columnNum).isNumerical()) {
            dataType = BinningDataType.Numerical;
        } else if (modelConfig.isBinningAutoTypeEnabled()) {
            // if type is Auto, and the auto type enable is true
            dataType = BinningDataType.Auto;
        } else {
            // if type is Auto, but the auto type enable is false
            dataType = BinningDataType.Numerical;
        }
    }
    List<ValueObject> voList = new ArrayList<ValueObject>();
    log.debug("****** The element count in bag is : " + bag.size());
    long total = 0L;
    long missing = 0L;
    for (Tuple t : bag) {
        total++;
        // A record without a tag cannot be attributed to positive or negative; count it as missing.
        if (t.get(1) == null) {
            missing++;
            continue;
        }
        ValueObject vo = new ValueObject();
        String valueStr = ((t.get(0) == null) ? "" : t.get(0).toString());
        if (dataType.equals(BinningDataType.Numerical)) {
            double value;
            try {
                value = Double.parseDouble(valueStr);
            } catch (NumberFormatException ignored) {
                // Deliberately not logged: too many log lines here cause the ReduceTask to fail with
                // `java.lang.OutOfMemoryError: Java heap space`.
                missing++;
                continue;
            }
            if (value > valueThreshold) {
                log.warn("Exceed Threshold: " + value + " / " + valueThreshold);
                missing++;
                continue;
            }
            // Double parsing accepts the literals "NaN" and "-Infinity", and neither is caught by the
            // threshold comparison above. Letting them through would feed non-finite values into the
            // binning and the basic statistics, so treat them as missing.
            if (Double.isNaN(value) || Double.isInfinite(value)) {
                missing++;
                continue;
            }
            vo.setValue(value);
        } else {
            // Categorical or Auto: an empty value is counted as missing but still passed to the binning.
            if (StringUtils.isEmpty(valueStr)) {
                missing++;
            }
            vo.setRaw(valueStr);
        }
        // do not need to catch exception, see AddColumnNumUDF which have already normalized the weight value
        vo.setWeight(Double.valueOf(t.get(2).toString()));
        vo.setTag(CommonUtils.trimTag(t.get(1).toString()));
        voList.add(vo);
    }
    // Too few usable records: emit nothing for this column.
    if (voList.size() < 10) {
        return null;
    }
    // Calculate Binning
    Binning binning = new Binning(modelConfig.getPosTags(), modelConfig.getNegTags(), dataType, voList);
    binning.setMaxNumOfBins(modelConfig.getBinningExpectedNum());
    binning.setBinningMethod(modelConfig.getBinningMethod());
    binning.setAutoTypeThreshold(modelConfig.getBinningAutoTypeThreshold());
    binning.setMergeEnabled(modelConfig.isBinningMergeEnabled());
    binning.doBinning();
    // Calculate Basic Stats
    BasicStatsCalculator basicStatsCalculator = new BasicStatsCalculator(binning.getUpdatedVoList(), this.valueThreshold);
    // NOTE(review): other callers null-check the result of calculateColumnMetrics; here it is
    // dereferenced unconditionally below - confirm it cannot be null on this path.
    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binning.getBinCountNeg(), binning.getBinCountPos());
    // Assemble the results; the order of the appended fields is the output contract.
    TupleFactory tupleFactory = TupleFactory.getInstance();
    Tuple tuple = tupleFactory.newTuple();
    tuple.append(columnNum);
    if (binning.getUpdatedDataType().equals(BinningDataType.Categorical)) {
        tuple.append("[" + StringUtils.join(binning.getBinCategory(), CATEGORY_VAL_SEPARATOR) + "]");
    } else {
        tuple.append(binning.getBinBoundary().toString());
    }
    tuple.append(binning.getBinCountNeg().toString());
    tuple.append(binning.getBinCountPos().toString());
    tuple.append(binning.getBinAvgScore().toString());
    tuple.append(binning.getBinPosCaseRate().toString());
    tuple.append(df.format(columnCountMetrics.getKs()));
    tuple.append(df.format(columnCountMetrics.getIv()));
    tuple.append(df.format(basicStatsCalculator.getMax()));
    tuple.append(df.format(basicStatsCalculator.getMin()));
    tuple.append(df.format(basicStatsCalculator.getMean()));
    tuple.append(df.format(basicStatsCalculator.getStdDev()));
    if (binning.getUpdatedDataType().equals(BinningDataType.Numerical)) {
        tuple.append("N");
    } else {
        tuple.append("C");
    }
    tuple.append(df.format(basicStatsCalculator.getMedian()));
    tuple.append(df.format(missing));
    tuple.append(df.format(total));
    // total is at least 10 here because voList.size() >= 10, so the division is safe.
    tuple.append(df.format((double) missing / total));
    tuple.append(binning.getBinWeightedNeg().toString());
    tuple.append(binning.getBinWeightedPos().toString());
    return tuple;
}
Also used : DataBag(org.apache.pig.data.DataBag) TupleFactory(org.apache.pig.data.TupleFactory) ArrayList(java.util.ArrayList) BasicStatsCalculator(ml.shifu.shifu.core.BasicStatsCalculator) Binning(ml.shifu.shifu.core.Binning) BinningDataType(ml.shifu.shifu.core.Binning.BinningDataType) ValueObject(ml.shifu.shifu.container.ValueObject) Tuple(org.apache.pig.data.Tuple) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics)

Aggregations

BasicStatsCalculator (ml.shifu.shifu.core.BasicStatsCalculator) 2, Binning (ml.shifu.shifu.core.Binning) 2, BinningDataType (ml.shifu.shifu.core.Binning.BinningDataType) 2, ColumnMetrics (ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics) 2, ArrayList (java.util.ArrayList) 1, ValueObject (ml.shifu.shifu.container.ValueObject) 1, DataBag (org.apache.pig.data.DataBag) 1, Tuple (org.apache.pig.data.Tuple) 1, TupleFactory (org.apache.pig.data.TupleFactory) 1