Search in sources :

Example 1 with ColumnMetrics

use of ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics in project shifu by ShifuML.

the class UpdateBinningInfoReducer method reduce.

/**
 * Merges all per-mapper binning statistics of one column and writes a single delimited stats record for it.
 *
 * <p>
 * Counts, sums, min/max, bin counts/weights, frequent items and the HyperLogLog sketch of every non-empty
 * {@link BinningInfoWritable} are accumulated. For numerical columns the 25th/50th/75th percentiles are estimated
 * from the merged bin counts; for categorical columns the bins may be re-binned and min/max/sums are recomputed
 * from the per-bin positive rates. Nothing is written when no record carried stats or when the bin
 * boundaries/categories are invalid.
 *
 * @param key
 *            column index; values not smaller than the column config list size are wrapped with a modulo
 * @param values
 *            per-mapper statistics of the column
 * @param context
 *            reducer context the output record is written to
 * @throws IOException
 *             if building the cardinality sketch or writing the output fails
 * @throws InterruptedException
 *             if writing the output is interrupted
 */
@Override
protected void reduce(IntWritable key, Iterable<BinningInfoWritable> values, Context context) throws IOException, InterruptedException {
    long start = System.currentTimeMillis();
    double sum = 0d;
    double squaredSum = 0d;
    double tripleSum = 0d;
    double quarticSum = 0d;
    double p25th = 0d;
    double median = 0d;
    double p75th = 0d;
    long count = 0L, missingCount = 0L;
    // Double.MIN_VALUE is the smallest POSITIVE double, so it cannot seed a running maximum: a column with only
    // negative values would never update it. -Double.MAX_VALUE is the lowest finite double.
    double min = Double.MAX_VALUE, max = -Double.MAX_VALUE;
    List<Double> binBoundaryList = null;
    List<String> binCategories = null;
    long[] binCountPos = null;
    long[] binCountNeg = null;
    double[] binWeightPos = null;
    double[] binWeightNeg = null;
    long[] binCountTotal = null;
    int columnConfigIndex = key.get() >= this.columnConfigList.size() ? key.get() % this.columnConfigList.size() : key.get();
    ColumnConfig columnConfig = this.columnConfigList.get(columnConfigIndex);
    HyperLogLogPlus hyperLogLogPlus = null;
    Set<String> fis = new HashSet<String>();
    long totalCount = 0, invalidCount = 0, validNumCount = 0;
    int binSize = 0;
    for (BinningInfoWritable info : values) {
        if (info.isEmpty()) {
            // mapper has no stats, skip it
            continue;
        }
        CountAndFrequentItemsWritable cfiw = info.getCfiw();
        totalCount += cfiw.getCount();
        invalidCount += cfiw.getInvalidCount();
        validNumCount += cfiw.getValidNumCount();
        fis.addAll(cfiw.getFrequetItems());
        if (hyperLogLogPlus == null) {
            hyperLogLogPlus = HyperLogLogPlus.Builder.build(cfiw.getHyperBytes());
        } else {
            try {
                hyperLogLogPlus = (HyperLogLogPlus) hyperLogLogPlus.merge(HyperLogLogPlus.Builder.build(cfiw.getHyperBytes()));
            } catch (CardinalityMergeException e) {
                throw new RuntimeException(e);
            }
        }
        // The first record with stats defines the bins; the accumulators are sized from it exactly once.
        boolean binsJustDefined = false;
        if (columnConfig.isHybrid() && binBoundaryList == null && binCategories == null) {
            binBoundaryList = info.getBinBoundaries();
            binCategories = info.getBinCategories();
            binSize = binBoundaryList.size() + binCategories.size();
            binsJustDefined = true;
        } else if (columnConfig.isNumerical() && binBoundaryList == null) {
            binBoundaryList = info.getBinBoundaries();
            binSize = binBoundaryList.size();
            binsJustDefined = true;
        } else if (columnConfig.isCategorical() && binCategories == null) {
            binCategories = info.getBinCategories();
            binSize = binCategories.size();
            binsJustDefined = true;
        }
        if (binsJustDefined) {
            // one slot more than the number of bins, as in the per-record arrays read below
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
            binCountTotal = new long[binSize + 1];
        }
        count += info.getTotalCount();
        missingCount += info.getMissingCount();
        // for numeric, such sums are OK, for categorical, such values are all 0, should be updated by using
        // binCountPos and binCountNeg
        sum += info.getSum();
        squaredSum += info.getSquaredSum();
        tripleSum += info.getTripleSum();
        quarticSum += info.getQuarticSum();
        if (Double.compare(max, info.getMax()) < 0) {
            max = info.getMax();
        }
        if (Double.compare(min, info.getMin()) > 0) {
            min = info.getMin();
        }
        for (int i = 0; i < (binSize + 1); i++) {
            binCountPos[i] += info.getBinCountPos()[i];
            binCountNeg[i] += info.getBinCountNeg()[i];
            binWeightPos[i] += info.getBinWeightPos()[i];
            binWeightNeg[i] += info.getBinWeightNeg()[i];
            binCountTotal[i] += info.getBinCountPos()[i];
            binCountTotal[i] += info.getBinCountNeg()[i];
        }
    }
    if (hyperLogLogPlus == null) {
        // Every record was empty: there are no bins and no sketch, so everything below would fail with a
        // NullPointerException. Skip the column like the other invalid-column cases do.
        LOG.warn("Column {} {} has no stats from any mapper, skip it.", key.get(), columnConfig.getColumnName());
        return;
    }
    if (columnConfig.isNumerical()) {
        long p25Count = count / 4;
        long medianCount = p25Count * 2;
        long p75Count = p25Count * 3;
        p25th = min;
        median = min;
        p75th = min;
        // long, not int: bin counts are long and a compound += into an int would silently narrow
        long currentCount = 0L;
        for (int i = 0; i < binBoundaryList.size(); i++) {
            double left = getCutoffBoundary(binBoundaryList.get(i), max, min);
            double right = ((i == binBoundaryList.size() - 1) ? max : getCutoffBoundary(binBoundaryList.get(i + 1), max, min));
            // when the target rank falls into this bin, interpolate linearly between the bin edges
            if (p25Count >= currentCount && p25Count < currentCount + binCountTotal[i]) {
                p25th = ((p25Count - currentCount) / (double) binCountTotal[i]) * (right - left) + left;
            }
            if (medianCount >= currentCount && medianCount < currentCount + binCountTotal[i]) {
                median = ((medianCount - currentCount) / (double) binCountTotal[i]) * (right - left) + left;
            }
            if (p75Count >= currentCount && p75Count < currentCount + binCountTotal[i]) {
                p75th = ((p75Count - currentCount) / (double) binCountTotal[i]) * (right - left) + left;
                // when get 75 percentile stop it
                break;
            }
            currentCount += binCountTotal[i];
        }
        LOG.info("Coloumn num is {}, p25 value is {}, median value is {}, p75 value is {}", columnConfig.getColumnNum(), p25th, median, p75th);
    }
    LOG.info("Coloumn num is {}, columnType value is {}, cateMaxNumBin is {}, binCategory size is {}", columnConfig.getColumnNum(), columnConfig.getColumnType(), modelConfig.getStats().getCateMaxNumBin(), (CollectionUtils.isNotEmpty(columnConfig.getBinCategory()) ? columnConfig.getBinCategory().size() : 0));
    // To merge categorical binning
    if (columnConfig.isCategorical() && modelConfig.getStats().getCateMaxNumBin() > 0 && CollectionUtils.isNotEmpty(binCategories) && binCategories.size() > modelConfig.getStats().getCateMaxNumBin()) {
        // only category size large then expected max bin number
        CateBinningStats cateBinningStats = rebinCategoricalValues(new CateBinningStats(binCategories, binCountPos, binCountNeg, binWeightPos, binWeightNeg));
        LOG.info("For variable - {}, {} bins is rebined to {} bins", columnConfig.getColumnName(), binCategories.size(), cateBinningStats.binCategories.size());
        binCategories = cateBinningStats.binCategories;
        binCountPos = cateBinningStats.binCountPos;
        binCountNeg = cateBinningStats.binCountNeg;
        binWeightPos = cateBinningStats.binWeightPos;
        binWeightNeg = cateBinningStats.binWeightNeg;
    }
    double[] binPosRate;
    if (modelConfig.isRegression()) {
        binPosRate = computePosRate(binCountPos, binCountNeg);
    } else {
        // for multiple classfication, use rate of categories to compute a value
        binPosRate = computeRateForMultiClassfication(binCountPos);
    }
    String binBounString = null;
    if (columnConfig.isHybrid()) {
        if (binCategories.size() > this.maxCateSize) {
            // the size now has its own placeholder; it used to be passed but never printed
            LOG.warn("Column {} {} with invalid bin category size {}.", key.get(), columnConfig.getColumnName(), binCategories.size());
            return;
        }
        binBounString = binBoundaryList.toString();
        binBounString += Constants.HYBRID_BIN_STR_DILIMETER + Base64Utils.base64Encode("[" + StringUtils.join(binCategories, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR) + "]");
    } else if (columnConfig.isCategorical()) {
        if (binCategories.size() > this.maxCateSize) {
            LOG.warn("Column {} {} with invalid bin category size {}.", key.get(), columnConfig.getColumnName(), binCategories.size());
            return;
        }
        binBounString = Base64Utils.base64Encode("[" + StringUtils.join(binCategories, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR) + "]");
        // recompute such value for categorical variables
        min = Double.MAX_VALUE;
        max = -Double.MAX_VALUE;
        sum = 0d;
        squaredSum = 0d;
        for (int i = 0; i < binPosRate.length; i++) {
            if (!Double.isNaN(binPosRate[i])) {
                if (Double.compare(max, binPosRate[i]) < 0) {
                    max = binPosRate[i];
                }
                if (Double.compare(min, binPosRate[i]) > 0) {
                    min = binPosRate[i];
                }
                // each bin contributes its positive rate once per record in the bin
                long binCount = binCountPos[i] + binCountNeg[i];
                sum += binPosRate[i] * binCount;
                double squaredVal = binPosRate[i] * binPosRate[i];
                squaredSum += squaredVal * binCount;
                tripleSum += squaredVal * binPosRate[i] * binCount;
                quarticSum += squaredVal * squaredVal * binCount;
            }
        }
    } else {
        if (binBoundaryList.size() == 0) {
            LOG.warn("Column {} {} with invalid bin boundary size {}.", key.get(), columnConfig.getColumnName(), binBoundaryList.size());
            return;
        }
        binBounString = binBoundaryList.toString();
    }
    // KS/IV/WOE are only computed for regression; otherwise both stay null and placeholders are emitted below
    ColumnMetrics columnCountMetrics = null;
    ColumnMetrics columnWeightMetrics = null;
    if (modelConfig.isRegression()) {
        columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binCountNeg, binCountPos);
        columnWeightMetrics = ColumnStatsCalculator.calculateColumnMetrics(binWeightNeg, binWeightPos);
    }
    // To make it be consistent with SPDT, missingCount is excluded to compute mean, stddev ...
    long realCount = this.statsExcludeMissingValue ? (count - missingCount) : count;
    double mean = sum / realCount;
    double stdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / realCount + EPS) / (realCount - 1)));
    double aStdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / realCount + EPS) / realCount));
    double skewness = ColumnStatsCalculator.computeSkewness(realCount, mean, aStdDev, sum, squaredSum, tripleSum);
    double kurtosis = ColumnStatsCalculator.computeKurtosis(realCount, mean, aStdDev, sum, squaredSum, tripleSum, quarticSum);
    // The field order below is the output record layout; do not reorder.
    sb.append(key.get()).append(Constants.DEFAULT_DELIMITER)
            .append(binBounString).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binCountNeg)).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binCountPos)).append(Constants.DEFAULT_DELIMITER)
            // always an empty array in this position
            .append(Arrays.toString(new double[0])).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binPosRate)).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics == null ? "" : df.format(columnCountMetrics.getKs())).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics == null ? "" : df.format(columnCountMetrics.getIv())).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(max)).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(min)).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(mean)).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(stdDev)).append(Constants.DEFAULT_DELIMITER)
            .append(columnConfig.getColumnType().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(median).append(Constants.DEFAULT_DELIMITER)
            .append(missingCount).append(Constants.DEFAULT_DELIMITER)
            .append(count).append(Constants.DEFAULT_DELIMITER)
            // missing rate
            .append(missingCount * 1.0d / count).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightNeg)).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightPos)).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics == null ? "" : columnCountMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics == null ? "" : columnWeightMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics == null ? "" : columnWeightMetrics.getKs()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics == null ? "" : columnWeightMetrics.getIv()).append(Constants.DEFAULT_DELIMITER)
            // bin count WOE
            .append(columnCountMetrics == null ? Arrays.toString(new double[binSize + 1]) : columnCountMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            // bin weighted WOE
            .append(columnWeightMetrics == null ? Arrays.toString(new double[binSize + 1]) : columnWeightMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            // skewness
            .append(skewness).append(Constants.DEFAULT_DELIMITER)
            // kurtosis
            .append(kurtosis).append(Constants.DEFAULT_DELIMITER)
            // total count
            .append(totalCount).append(Constants.DEFAULT_DELIMITER)
            // invalid count
            .append(invalidCount).append(Constants.DEFAULT_DELIMITER)
            // valid num count
            .append(validNumCount).append(Constants.DEFAULT_DELIMITER)
            // cardinality
            .append(hyperLogLogPlus.cardinality()).append(Constants.DEFAULT_DELIMITER)
            // frequent items
            .append(Base64Utils.base64Encode(limitedFrequentItems(fis))).append(Constants.DEFAULT_DELIMITER)
            // the 25 percentile value
            .append(p25th).append(Constants.DEFAULT_DELIMITER)
            // the 75 percentile value
            .append(p75th);
    outputValue.set(sb.toString());
    context.write(NullWritable.get(), outputValue);
    // reset the shared builder for the next key
    sb.delete(0, sb.length());
    LOG.debug("Time:{}", (System.currentTimeMillis() - start));
}
Also used : CountAndFrequentItemsWritable(ml.shifu.shifu.core.autotype.CountAndFrequentItemsWritable) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) CardinalityMergeException(com.clearspring.analytics.stream.cardinality.CardinalityMergeException) HyperLogLogPlus(com.clearspring.analytics.stream.cardinality.HyperLogLogPlus) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics)

Example 2 with ColumnMetrics

use of ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics in project shifu by ShifuML.

the class CalculateNewStatsUDF method exec.

/**
 * Runs the variable stats for one column and packs the resulting statistics into a tuple.
 *
 * <p>
 * Field 0 of the input is the column id, field 1 the data bag of the column and field 3 the binning info string.
 * The stats are read back from the column config after {@code runVarStats} has run (presumably that call
 * populates them - confirm against {@code AbstractVarStats}).
 *
 * @param input
 *            the input tuple, may be null
 * @return the stats tuple, or null when the input is null, the bins are invalid (no categories, more categories
 *         than {@code maxCategorySize}, or exactly one bin boundary) or no KS/IV metrics could be computed
 * @throws IOException
 *             if a field of the input tuple cannot be read
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null) {
        return null;
    }
    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    String binningDataInfo = (String) input.get(3);
    log.info("start to process column id - " + columnId.toString());
    ColumnConfig columnConfig = super.columnConfigList.get(columnId);
    AbstractVarStats varstats = AbstractVarStats.getVarStatsInst(modelConfig, columnConfig, valueThreshold);
    varstats.runVarStats(binningDataInfo, databag);
    log.info("after to process column id - " + columnId.toString());
    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(columnConfig.getBinCountNeg(), columnConfig.getBinCountPos());
    ColumnMetrics columnWeightMetrics = ColumnStatsCalculator.calculateColumnMetrics(columnConfig.getBinWeightedNeg(), columnConfig.getBinWeightedPos());
    if (columnCountMetrics == null || columnWeightMetrics == null) {
        // Other callers of calculateColumnMetrics treat a null result as possible; dereferencing it below would
        // fail with a NullPointerException, so skip the column like the other invalid-column cases do.
        log.warn("no KS/IV metrics could be computed for column id - " + columnId.toString() + ", skip it");
        return null;
    }
    // Assemble the results
    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(columnId);
    if (columnConfig.isCategorical()) {
        if (columnConfig.getBinCategory().size() == 0 || columnConfig.getBinCategory().size() > this.maxCategorySize) {
            return null;
        }
        String binCategory = "[" + StringUtils.join(columnConfig.getBinCategory(), CalculateStatsUDF.CATEGORY_VAL_SEPARATOR) + "]";
        tuple.append(Base64Utils.base64Encode(binCategory));
    } else {
        if (columnConfig.getBinBoundary().size() == 1) {
            return null;
        }
        tuple.append(columnConfig.getBinBoundary().toString());
    }
    // The append order below is the tuple layout; do not reorder.
    tuple.append(columnConfig.getBinCountNeg().toString());
    tuple.append(columnConfig.getBinCountPos().toString());
    tuple.append(columnConfig.getBinAvgScore().toString());
    tuple.append(columnConfig.getBinPosRate().toString());
    tuple.append(df.format(columnCountMetrics.getKs()));
    tuple.append(df.format(columnCountMetrics.getIv()));
    tuple.append(df.format(columnConfig.getColumnStats().getMax()));
    tuple.append(df.format(columnConfig.getColumnStats().getMin()));
    tuple.append(df.format(columnConfig.getColumnStats().getMean()));
    tuple.append(df.format(columnConfig.getColumnStats().getStdDev()));
    // column type flag: "C" for categorical, "N" otherwise
    if (columnConfig.isCategorical()) {
        tuple.append("C");
    } else {
        tuple.append("N");
    }
    tuple.append(df.format(columnConfig.getColumnStats().getMedian()));
    tuple.append(columnConfig.getMissingCount());
    tuple.append(columnConfig.getTotalCount());
    tuple.append(df.format(columnConfig.getMissingPercentage()));
    tuple.append(columnConfig.getBinWeightedNeg().toString());
    tuple.append(columnConfig.getBinWeightedPos().toString());
    tuple.append(columnCountMetrics.getWoe());
    tuple.append(columnWeightMetrics.getWoe());
    tuple.append(df.format(columnWeightMetrics.getKs()));
    tuple.append(df.format(columnWeightMetrics.getIv()));
    tuple.append(columnCountMetrics.getBinningWoe().toString());
    tuple.append(columnWeightMetrics.getBinningWoe().toString());
    tuple.append(columnConfig.getColumnStats().getSkewness());
    tuple.append(columnConfig.getColumnStats().getKurtosis());
    return tuple;
}
Also used : DataBag(org.apache.pig.data.DataBag) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) AbstractVarStats(ml.shifu.shifu.udf.stats.AbstractVarStats) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics) Tuple(org.apache.pig.data.Tuple)

Example 3 with ColumnMetrics

use of ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics in project shifu by ShifuML.

the class StatsCalculateWorker method calculateColumnStats.

/**
 * Bins the values of one column, derives its basic stats and count/weight based metrics, and stores all of them
 * on the given column config.
 *
 * @param columnConfig
 *            config of the column; receives the resulting column type, bins and stats
 * @param valueObjList
 *            values of the column; when {@code CollectionUtils.isEmpty} reports it empty, an error is logged and
 *            nothing else happens
 */
private void calculateColumnStats(ColumnConfig columnConfig, List<ValueObject> valueObjList) {
    if (CollectionUtils.isEmpty(valueObjList)) {
        log.error("No values for column : {}, please check!", columnConfig.getColumnName());
        return;
    }

    // Requested binning type: follow the configured column type, fall back to Auto.
    BinningDataType requestedType = columnConfig.isNumerical() ? BinningDataType.Numerical
            : (columnConfig.isCategorical() ? BinningDataType.Categorical : BinningDataType.Auto);

    // Configure and run the binning.
    Binning binner = new Binning(modelConfig.getPosTags(), modelConfig.getNegTags(), requestedType, valueObjList);
    log.info("posTags - {}, negTags - {}, first example tag - {}", modelConfig.getPosTags(), modelConfig.getNegTags(), valueObjList.get(0).getTag());
    binner.setMaxNumOfBins(modelConfig.getBinningExpectedNum());
    binner.setBinningMethod(modelConfig.getBinningMethod());
    binner.setAutoTypeThreshold(modelConfig.getAutoTypeThreshold());
    binner.setMergeEnabled(Boolean.TRUE);
    binner.doBinning();

    // Basic stats come from the value list as updated by the binning.
    BasicStatsCalculator basicStats = new BasicStatsCalculator(binner.getUpdatedVoList(), modelConfig.getNumericalValueThreshold());

    // KS / IV / WOE, once from the bin counts and once from the bin weights.
    ColumnMetrics countMetrics = ColumnStatsCalculator.calculateColumnMetrics(binner.getBinCountNeg(), binner.getBinCountPos());
    ColumnMetrics weightMetrics = ColumnStatsCalculator.calculateColumnMetrics(binner.getBinWeightedNeg(), binner.getBinWeightedPos());

    // The binning may have settled on a different type than requested; store the type it reports.
    boolean numericalResult = binner.getUpdatedDataType().equals(BinningDataType.Numerical);
    if (numericalResult) {
        columnConfig.setColumnType(ColumnType.N);
        columnConfig.setBinBoundary(binner.getBinBoundary());
    } else {
        columnConfig.setColumnType(ColumnType.C);
        columnConfig.setBinCategory(binner.getBinCategory());
    }

    // Bin level results.
    columnConfig.setBinCountNeg(binner.getBinCountNeg());
    columnConfig.setBinCountPos(binner.getBinCountPos());
    columnConfig.setBinPosCaseRate(binner.getBinPosCaseRate());

    // Column level basic stats.
    columnConfig.setMax(basicStats.getMax());
    columnConfig.setMin(basicStats.getMin());
    columnConfig.setMean(basicStats.getMean());
    columnConfig.setStdDev(basicStats.getStdDev());
    columnConfig.setMedian(basicStats.getMedian());

    columnConfig.setBinWeightedNeg(binner.getBinWeightedNeg());
    columnConfig.setBinWeightedPos(binner.getBinWeightedPos());

    // Either metrics object may be null; in that case the matching fields are left untouched.
    if (countMetrics != null) {
        columnConfig.setKs(countMetrics.getKs());
        columnConfig.setIv(countMetrics.getIv());
        columnConfig.getColumnStats().setWoe(countMetrics.getWoe());
        columnConfig.getColumnBinning().setBinCountWoe(countMetrics.getBinningWoe());
    }
    if (weightMetrics != null) {
        columnConfig.getColumnStats().setWeightedKs(weightMetrics.getKs());
        columnConfig.getColumnStats().setWeightedIv(weightMetrics.getIv());
        columnConfig.getColumnStats().setWeightedWoe(weightMetrics.getWoe());
        columnConfig.getColumnBinning().setBinWeightedWoe(weightMetrics.getBinningWoe());
    }
    // NOTE: the missing count of the column is not set here.
}
Also used : Binning(ml.shifu.shifu.core.Binning) BinningDataType(ml.shifu.shifu.core.Binning.BinningDataType) BasicStatsCalculator(ml.shifu.shifu.core.BasicStatsCalculator) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics)

Example 4 with ColumnMetrics

use of ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics in project shifu by ShifuML.

the class KSIVCalculatorTest method test.

/**
 * Checks the formatted IV and KS computed from two fixed lists of bin counts against known values.
 */
@Test
public void test() {
    // first argument is passed in the "neg" position, second in the "pos" position of calculateColumnMetrics
    List<Integer> firstCounts = Arrays.asList(1, 2, 3, 4, 5, 6);
    List<Integer> secondCounts = Arrays.asList(2, 2, 5, 5, 5, 5);

    ColumnMetrics metrics = ColumnStatsCalculator.calculateColumnMetrics(firstCounts, secondCounts);

    Assert.assertEquals(df.format(metrics.getIv()), "0.08");
    Assert.assertEquals(df.format(metrics.getKs()), "10.71");
}
Also used : ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics) Test(org.testng.annotations.Test)

Example 5 with ColumnMetrics

use of ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics in project shifu by ShifuML.

the class CalculateStatsUDF method exec.

/**
 * Bins the values of one column and packs the binning result and basic stats into a tuple.
 *
 * <p>
 * Field 0 of the input is the column number, field 1 the bag of records of that column. Of each record, field 0
 * is read as the raw value, field 1 as the tag and field 2 as the weight.
 *
 * @param input
 *            the input tuple, may be null or empty
 * @return the stats tuple, or null when the input is null/empty, fewer than 10 usable records remain, or no
 *         KS/IV metrics could be computed
 * @throws IOException
 *             if a field of a tuple cannot be read
 */
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    TupleFactory tupleFactory = TupleFactory.getInstance();
    Integer columnNum = (Integer) input.get(0);
    DataBag bag = (DataBag) input.get(1);
    BinningDataType dataType;
    if (modelConfig.isCategoricalDisabled()) {
        dataType = BinningDataType.Numerical;
    } else {
        if (columnConfigList.get(columnNum).isCategorical()) {
            dataType = BinningDataType.Categorical;
        } else if (columnConfigList.get(columnNum).isNumerical()) {
            dataType = BinningDataType.Numerical;
        } else if (modelConfig.isBinningAutoTypeEnabled()) {
            // if type is Auto, and the auto type enable is true
            dataType = BinningDataType.Auto;
        } else {
            // if type is Auto, but the auto type enable is false
            dataType = BinningDataType.Numerical;
        }
    }
    // loop-invariant: decided once, used for every record
    boolean numerical = dataType.equals(BinningDataType.Numerical);
    List<ValueObject> voList = new ArrayList<ValueObject>();
    Iterator<Tuple> iterator = bag.iterator();
    log.debug("****** The element count in bag is : " + bag.size());
    long total = 0L;
    long missing = 0L;
    while (iterator.hasNext()) {
        total++;
        Tuple t = iterator.next();
        if (t.get(1) == null) {
            // no tag: counted as missing and dropped
            missing++;
            continue;
        }
        ValueObject vo = new ValueObject();
        String valueStr = ((t.get(0) == null) ? "" : t.get(0).toString());
        if (numerical) {
            Double value = null;
            try {
                value = Double.valueOf(valueStr);
            } catch (NumberFormatException e) {
                // Deliberately not logged per record: too many log lines caused
                // `java.lang.OutOfMemoryError: Java heap space` in the ReduceTask.
                missing++;
                continue;
            }
            if (value > valueThreshold) {
                log.warn("Exceed Threshold: " + value + " / " + valueThreshold);
                missing++;
                continue;
            }
            vo.setValue(value);
        } else {
            // Categorical or Auto: an empty value is counted as missing but still kept for binning
            if (StringUtils.isEmpty(valueStr)) {
                missing++;
            }
            vo.setRaw(valueStr);
        }
        // do not need to catch exception, see AddColumnNumUDF which have already normalized the weight value
        vo.setWeight(Double.valueOf(t.get(2).toString()));
        vo.setTag(CommonUtils.trimTag(t.get(1).toString()));
        voList.add(vo);
    }
    if (voList.size() < 10) {
        return null;
    }
    // Calculate Binning
    Binning binning = new Binning(modelConfig.getPosTags(), modelConfig.getNegTags(), dataType, voList);
    binning.setMaxNumOfBins(modelConfig.getBinningExpectedNum());
    binning.setBinningMethod(modelConfig.getBinningMethod());
    binning.setAutoTypeThreshold(modelConfig.getBinningAutoTypeThreshold());
    binning.setMergeEnabled(modelConfig.isBinningMergeEnabled());
    binning.doBinning();
    // Calculate Basic Stats
    BasicStatsCalculator basicStatsCalculator = new BasicStatsCalculator(binning.getUpdatedVoList(), this.valueThreshold);
    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binning.getBinCountNeg(), binning.getBinCountPos());
    if (columnCountMetrics == null) {
        // Other callers of calculateColumnMetrics treat a null result as possible; dereferencing it below would
        // fail with a NullPointerException, so skip the column like the too-few-records case does.
        log.warn("no KS/IV metrics could be computed for column - " + columnNum + ", skip it");
        return null;
    }
    // Assemble the results; the append order below is the tuple layout, do not reorder.
    Tuple tuple = tupleFactory.newTuple();
    tuple.append(columnNum);
    if (binning.getUpdatedDataType().equals(BinningDataType.Categorical)) {
        tuple.append("[" + StringUtils.join(binning.getBinCategory(), CATEGORY_VAL_SEPARATOR) + "]");
    } else {
        tuple.append(binning.getBinBoundary().toString());
    }
    tuple.append(binning.getBinCountNeg().toString());
    tuple.append(binning.getBinCountPos().toString());
    tuple.append(binning.getBinAvgScore().toString());
    tuple.append(binning.getBinPosCaseRate().toString());
    tuple.append(df.format(columnCountMetrics.getKs()));
    tuple.append(df.format(columnCountMetrics.getIv()));
    tuple.append(df.format(basicStatsCalculator.getMax()));
    tuple.append(df.format(basicStatsCalculator.getMin()));
    tuple.append(df.format(basicStatsCalculator.getMean()));
    tuple.append(df.format(basicStatsCalculator.getStdDev()));
    // column type flag: "N" for numerical, "C" otherwise
    if (binning.getUpdatedDataType().equals(BinningDataType.Numerical)) {
        tuple.append("N");
    } else {
        tuple.append("C");
    }
    tuple.append(df.format(basicStatsCalculator.getMedian()));
    tuple.append(df.format(missing));
    tuple.append(df.format(total));
    // missing rate; total is at least 10 here because at least 10 records were kept
    tuple.append(df.format((double) missing / total));
    tuple.append(binning.getBinWeightedNeg().toString());
    tuple.append(binning.getBinWeightedPos().toString());
    return tuple;
}
Also used : DataBag(org.apache.pig.data.DataBag) TupleFactory(org.apache.pig.data.TupleFactory) ArrayList(java.util.ArrayList) BasicStatsCalculator(ml.shifu.shifu.core.BasicStatsCalculator) Binning(ml.shifu.shifu.core.Binning) BinningDataType(ml.shifu.shifu.core.Binning.BinningDataType) ValueObject(ml.shifu.shifu.container.ValueObject) Tuple(org.apache.pig.data.Tuple) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics)

Aggregations

ColumnMetrics (ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics)5 ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)2 BasicStatsCalculator (ml.shifu.shifu.core.BasicStatsCalculator)2 Binning (ml.shifu.shifu.core.Binning)2 BinningDataType (ml.shifu.shifu.core.Binning.BinningDataType)2 DataBag (org.apache.pig.data.DataBag)2 Tuple (org.apache.pig.data.Tuple)2 CardinalityMergeException (com.clearspring.analytics.stream.cardinality.CardinalityMergeException)1 HyperLogLogPlus (com.clearspring.analytics.stream.cardinality.HyperLogLogPlus)1 ArrayList (java.util.ArrayList)1 ValueObject (ml.shifu.shifu.container.ValueObject)1 CountAndFrequentItemsWritable (ml.shifu.shifu.core.autotype.CountAndFrequentItemsWritable)1 AbstractVarStats (ml.shifu.shifu.udf.stats.AbstractVarStats)1 TupleFactory (org.apache.pig.data.TupleFactory)1 Test (org.testng.annotations.Test)1