Search in sources :

Example 1 with AbstractVarStats

Use of ml.shifu.shifu.udf.stats.AbstractVarStats in the shifu project by ShifuML.

The exec method of the class CalculateNewStatsUDF.

/**
 * Runs the variable statistics for a single column and packs the outcome into a tuple.
 *
 * <p>The input tuple is read positionally: field 0 is the column id, field 1 is the bag of
 * records for that column and field 3 is the binning information string. The statistics run
 * is handed the column's {@code ColumnConfig}, from which every reported value is then read.
 *
 * <p>Fields appended to the result, in order: column id; bin categories (joined, bracketed and
 * Base64-encoded) or bin boundaries; negative bin counts; positive bin counts; bin average
 * scores; bin positive rates; count-based KS and IV; max, min, mean and standard deviation;
 * the type marker ("C" or "N"); median; missing count; total count; missing percentage;
 * weighted negative and positive bin values; count-based and weight-based WOE; weight-based
 * KS and IV; count-based and weight-based binning WOE; skewness; kurtosis.
 *
 * @param input tuple carrying the column id, the data bag and the binning info; may be null
 * @return the assembled statistics tuple, or {@code null} when {@code input} is null, when a
 *         categorical column has no categories or more than {@code maxCategorySize} of them,
 *         or when a non-categorical column has exactly one bin boundary
 * @throws IOException declared by the overridden method; see {@code org.apache.pig.EvalFunc#exec}
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null) {
        return null;
    }

    final Integer columnNum = (Integer) input.get(0);
    final DataBag records = (DataBag) input.get(1);
    // NOTE(review): field 2 is never read; the binning info comes from index 3 -
    // confirm this matches the schema produced by the calling Pig script.
    final String binningInfo = (String) input.get(3);
    final String columnLabel = columnNum.toString();

    log.info("start to process column id - " + columnLabel);
    final ColumnConfig config = super.columnConfigList.get(columnNum);
    AbstractVarStats.getVarStatsInst(modelConfig, config, valueThreshold).runVarStats(binningInfo, records);
    log.info("after to process column id - " + columnLabel);

    // One metrics object from the raw bin counts, one from the weighted bin values.
    final ColumnMetrics countMetrics = ColumnStatsCalculator.calculateColumnMetrics(
            config.getBinCountNeg(), config.getBinCountPos());
    final ColumnMetrics weightMetrics = ColumnStatsCalculator.calculateColumnMetrics(
            config.getBinWeightedNeg(), config.getBinWeightedPos());

    final Tuple result = TupleFactory.getInstance().newTuple();
    result.append(columnNum);

    final boolean categorical = config.isCategorical();
    if (!categorical) {
        // A boundary list holding a single entry yields no output for this column.
        if (config.getBinBoundary().size() == 1) {
            return null;
        }
        result.append(config.getBinBoundary().toString());
    } else {
        // Drop columns with no categories at all or with more than the configured maximum.
        final int categoryCount = config.getBinCategory().size();
        if (categoryCount == 0 || categoryCount > this.maxCategorySize) {
            return null;
        }
        final String joinedCategories = StringUtils.join(config.getBinCategory(),
                CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
        result.append(Base64Utils.base64Encode("[" + joinedCategories + "]"));
    }

    // Per-bin counts, scores and rates.
    result.append(config.getBinCountNeg().toString());
    result.append(config.getBinCountPos().toString());
    result.append(config.getBinAvgScore().toString());
    result.append(config.getBinPosRate().toString());

    // Count-based KS / IV followed by the basic column statistics.
    result.append(df.format(countMetrics.getKs()));
    result.append(df.format(countMetrics.getIv()));
    result.append(df.format(config.getColumnStats().getMax()));
    result.append(df.format(config.getColumnStats().getMin()));
    result.append(df.format(config.getColumnStats().getMean()));
    result.append(df.format(config.getColumnStats().getStdDev()));

    // Type marker: "C" for categorical columns, "N" otherwise.
    result.append(categorical ? "C" : "N");

    result.append(df.format(config.getColumnStats().getMedian()));
    result.append(config.getMissingCount());
    result.append(config.getTotalCount());
    result.append(df.format(config.getMissingPercentage()));

    // Weighted bin values and the WOE / KS / IV figures derived from both metric sets.
    result.append(config.getBinWeightedNeg().toString());
    result.append(config.getBinWeightedPos().toString());
    result.append(countMetrics.getWoe());
    result.append(weightMetrics.getWoe());
    result.append(df.format(weightMetrics.getKs()));
    result.append(df.format(weightMetrics.getIv()));
    result.append(countMetrics.getBinningWoe().toString());
    result.append(weightMetrics.getBinningWoe().toString());

    result.append(config.getColumnStats().getSkewness());
    result.append(config.getColumnStats().getKurtosis());
    return result;
}
Also used : DataBag(org.apache.pig.data.DataBag) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) AbstractVarStats(ml.shifu.shifu.udf.stats.AbstractVarStats) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics) Tuple(org.apache.pig.data.Tuple)

Aggregations

ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 1, ColumnMetrics (ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics): 1, AbstractVarStats (ml.shifu.shifu.udf.stats.AbstractVarStats): 1, DataBag (org.apache.pig.data.DataBag): 1, Tuple (org.apache.pig.data.Tuple): 1