Search in sources :

Example 1 with CategoricalBinning

use of ml.shifu.shifu.core.binning.CategoricalBinning in project shifu by ShifuML.

the class BinningDataMergeUDF method exec.

/**
 * Merges the serialized partial binnings of one column into a single bin list.
 *
 * <p>Field 0 of {@code input} is the column id and field 1 is a bag of tuples whose
 * field 1 holds one serialized partial binning. For hybrid columns that string carries
 * two parts separated by {@code Constants.HYBRID_BIN_STR_DILIMETER}: the primary
 * binning followed by a categorical backup binning, which are merged independently.
 *
 * @param input tuple of (column id, bag of partial binning tuples)
 * @return a two-field tuple of (column id, joined bin string); the bin string is empty
 *         when a categorical column exceeds {@code maxCategorySize} or when the bag
 *         held no usable partial binning; {@code null} when {@code input} is
 *         {@code null} or has fewer than two fields
 * @throws IOException if a hybrid column's binning string lacks its categorical part,
 *         or if reading the tuples fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
        return null;
    }
    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    // Ids at or beyond the end of the config list wrap around onto an existing config;
    // the original id is still what gets written to the output tuple.
    int corrColumnId = columnId;
    if (corrColumnId >= super.columnConfigList.size()) {
        corrColumnId = corrColumnId % super.columnConfigList.size();
    }
    ColumnConfig columnConfig = super.columnConfigList.get(corrColumnId);
    // The column type cannot change while merging, so evaluate it once.
    boolean isHybrid = columnConfig.isHybrid();
    AbstractBinning<?> binning = null;
    AbstractBinning<?> backupBinning = null;
    log.info("Start merging bin info for columnId - " + columnId + ", the bag size is - " + databag.size());
    for (Tuple element : databag) {
        if (element == null || element.size() < 2) {
            continue;
        }
        String objValStr = (String) element.get(1);
        String hybridCateValStr = null;
        long start = System.currentTimeMillis();
        // for hybrid, split into the primary part and the categorical backup part
        if (isHybrid) {
            String[] splits = CommonUtils.split(objValStr, Constants.HYBRID_BIN_STR_DILIMETER);
            if (splits == null || splits.length < 2) {
                // Fail with a descriptive message instead of a bare index/null error.
                throw new IOException("Invalid hybrid binning string for columnId - " + columnId
                        + ", expected two parts separated by " + Constants.HYBRID_BIN_STR_DILIMETER);
            }
            objValStr = splits[0];
            hybridCateValStr = splits[1];
        }
        AbstractBinning<?> partialBinning = AbstractBinning.constructBinningFromStr(modelConfig, columnConfig, objValStr);
        AbstractBinning<?> partialBackupBinning = null;
        if (isHybrid) {
            partialBackupBinning = new CategoricalBinning();
            partialBackupBinning.stringToObj(hybridCateValStr);
        }
        log.info("constructBinningFromStr: " + (System.currentTimeMillis() - start) + "ms");
        start = System.currentTimeMillis();
        if (binning == null) {
            // The first partial result seeds the accumulators.
            binning = partialBinning;
            if (isHybrid) {
                backupBinning = partialBackupBinning;
            }
        } else {
            binning.mergeBin(partialBinning);
            if (isHybrid) {
                backupBinning.mergeBin(partialBackupBinning);
            }
        }
        log.info("mergeBin: " + (System.currentTimeMillis() - start) + "ms");
    }
    Tuple output = TupleFactory.getInstance().newTuple(2);
    output.set(0, columnId);
    if (binning == null) {
        // Empty bag, or every element was skipped: there is nothing to join, so emit
        // the same empty bin string used below for "no bin info available".
        log.warn("No bin info to merge for columnId - " + columnId);
        output.set(1, "");
        return output;
    }
    List<?> binFields = binning.getDataBin();
    // Do check here. If there are too many values for a categorical variable,
    // it will consume too much memory when join them together, that will cause OOM exception
    if (columnConfig.isCategorical() && binFields.size() > this.maxCategorySize) {
        log.warn(columnId + " " + columnConfig.getColumnName() + " is over maximal categorical size: " + this.maxCategorySize);
        output.set(1, "");
    } else {
        if (isHybrid) {
            String finalBinStr = StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            finalBinStr += Constants.HYBRID_BIN_STR_DILIMETER + StringUtils.join(backupBinning.getDataBin(), CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            output.set(1, finalBinStr);
        } else {
            output.set(1, StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
        }
    }
    log.info("Finish merging bin info for columnId - " + columnId);
    return output;
}
Also used : DataBag(org.apache.pig.data.DataBag) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) CategoricalBinning(ml.shifu.shifu.core.binning.CategoricalBinning) Tuple(org.apache.pig.data.Tuple)

Example 2 with CategoricalBinning

use of ml.shifu.shifu.core.binning.CategoricalBinning in project shifu by ShifuML.

the class BinningDataUDF method exec.

/**
 * Bins every value of one column and returns the resulting bin list as a string.
 *
 * <p>Field 0 of {@code input} is the column id and field 1 is a bag of tuples whose
 * field 1 holds one raw value. Categorical columns use {@code CategoricalBinning};
 * numeric columns use equal-interval binning when that method is configured, otherwise
 * the algorithm selected by {@code modelConfig.getBinningAlgorithm()}.
 *
 * @param input tuple of (column id, bag of value tuples)
 * @return a two-field tuple of (column id, joined bin string); the bin string is empty
 *         when a categorical column exceeds {@code maxCategorySize}; {@code null} when
 *         {@code input} is {@code null} or has fewer than two fields
 * @throws IOException if reading the tuples fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
        return null;
    }
    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    // Ids at or beyond the end of the config list wrap around onto an existing config,
    // rather than failing the lookup; the original id is still written to the output.
    int corrColumnId = columnId;
    if (corrColumnId >= super.columnConfigList.size()) {
        corrColumnId = corrColumnId % super.columnConfigList.size();
    }
    ColumnConfig columnConfig = super.columnConfigList.get(corrColumnId);
    AbstractBinning<?> binning = null;
    if (columnConfig.isCategorical()) {
        binning = new CategoricalBinning(-1, super.modelConfig.getMissingOrInvalidValues(), this.maxCategorySize);
    } else {
        // Constant-first comparison so an unset binning method falls through to the
        // algorithm switch instead of throwing here.
        if (BinningMethod.EqualInterval.equals(super.modelConfig.getBinningMethod())) {
            binning = new EqualIntervalBinning(modelConfig.getStats().getMaxNumBin());
        } else {
            switch(this.modelConfig.getBinningAlgorithm()) {
                case Native:
                    log.info("Invoke Native binning method, memory cosuming!!");
                    // always merge bins
                    binning = new NativeBinning(modelConfig.getStats().getMaxNumBin(), true);
                    break;
                case SPDT:
                case SPDTI:
                    log.info("Invoke SPDT(Streaming Parallel Decision Tree) binning method, ");
                    binning = new EqualPopulationBinning(modelConfig.getStats().getMaxNumBin());
                    break;
                case MunroPat:
                case MunroPatI:
                    log.info("Invoke Munro & Paterson selecting algorithm");
                    binning = new MunroPatBinning(modelConfig.getStats().getMaxNumBin());
                    break;
                default:
                    log.info("Default: Invoke Munro & Paterson selecting algorithm");
                    binning = new MunroPatBinning(modelConfig.getStats().getMaxNumBin());
                    break;
            }
        }
    }
    for (Tuple element : databag) {
        if (element == null || element.size() < 2) {
            continue;
        }
        Object value = element.get(1);
        if (value != null) {
            binning.addData(value.toString());
        }
    }
    Tuple output = TupleFactory.getInstance().newTuple(2);
    output.set(0, columnId);
    // Do check here. It's because if there are too many value for categorical variable,
    // it will consume too much memory when join them together, that will cause OOM exception.
    // The limit is a category limit, so it is only applied to categorical columns.
    List<?> dataBin = binning.getDataBin();
    if (columnConfig.isCategorical() && dataBin.size() > this.maxCategorySize) {
        log.warn(columnId + " " + columnConfig.getColumnName() + " is over maximal categorical size: " + this.maxCategorySize);
        output.set(1, "");
    } else {
        output.set(1, StringUtils.join(dataBin, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
    }
    log.info("Finish merging bin info for columnId - " + columnId);
    return output;
}
Also used : EqualIntervalBinning(ml.shifu.shifu.core.binning.EqualIntervalBinning) DataBag(org.apache.pig.data.DataBag) NativeBinning(ml.shifu.shifu.core.binning.NativeBinning) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) EqualPopulationBinning(ml.shifu.shifu.core.binning.EqualPopulationBinning) MunroPatBinning(ml.shifu.shifu.core.binning.MunroPatBinning) CategoricalBinning(ml.shifu.shifu.core.binning.CategoricalBinning) Tuple(org.apache.pig.data.Tuple)

Example 3 with CategoricalBinning

use of ml.shifu.shifu.core.binning.CategoricalBinning in project shifu by ShifuML.

the class BinningPartialDataUDF method exec.

/**
 * Builds a partial binning for the column found in the input bag and serializes it.
 *
 * <p>Field 0 of {@code input} is a bag of tuples; field 0 of each tuple is the column
 * id and field 1 is the raw value. The first usable tuple selects the column and the
 * binning implementation. For hybrid columns, values that are missing, non-numeric, or
 * below the hybrid threshold are additionally fed to a categorical backup binning, and
 * the result is the primary binning string followed by
 * {@code Constants.HYBRID_BIN_STR_DILIMETER} and the backup binning string.
 *
 * <p>Per-call state is kept in instance fields and is always reset through
 * {@code cleanUp()} before returning, including when an exception is thrown.
 *
 * @param input tuple whose field 0 is the bag of (column id, value, ...) tuples
 * @return the serialized binning, or {@code null} when {@code input} is {@code null}
 *         or empty, the bag is {@code null}, or the bag held no usable tuple
 * @throws IOException if reading the tuples fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public String exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1) {
        return null;
    }
    DataBag databag = (DataBag) input.get(0);
    if (databag == null) {
        return null;
    }
    try {
        for (Tuple element : databag) {
            if (element == null || element.size() < 2) {
                continue;
            }
            if (columnId < 0) {
                // First usable tuple: resolve the column and create its binning(s).
                columnId = (Integer) element.get(0);
                if (columnId >= super.columnConfigList.size()) {
                    columnId = columnId % super.columnConfigList.size();
                }
                columnConfig = super.columnConfigList.get(columnId);
                if (columnConfig.isCategorical()) {
                    binning = new CategoricalBinning(-1, modelConfig.getMissingOrInvalidValues(), this.maxCategorySize);
                } else {
                    // Fall back to 1024 bins when no positive maximum is configured.
                    int maxNumBin = modelConfig.getStats().getMaxNumBin() > 0 ? modelConfig.getStats().getMaxNumBin() : 1024;
                    if (BinningMethod.EqualInterval.equals(super.modelConfig.getBinningMethod())) {
                        binning = new EqualIntervalBinning(maxNumBin, modelConfig.getMissingOrInvalidValues());
                    } else {
                        binning = new EqualPopulationBinning(maxNumBin, modelConfig.getMissingOrInvalidValues());
                    }
                }
                if (columnConfig.isHybrid()) {
                    this.backUpbinning = new CategoricalBinning(-1, modelConfig.getMissingOrInvalidValues(), this.maxCategorySize);
                }
            }
            Object value = element.get(1);
            if (value == null) {
                continue;
            }
            String valStr = value.toString();
            if (isWeightBinningMethod() && binning instanceof EqualPopulationBinning) {
                ((EqualPopulationBinning) binning).addData(valStr, (Double) element.get(AddColumnNumUDF.COLUMN_WEIGHT_INDX));
            } else {
                binning.addData(valStr);
            }
            if (this.columnConfig.isHybrid()) {
                // missing value and not number value go to categorical binning
                double douVal = BinUtils.parseNumber(valStr);
                Double hybridThreshold = this.columnConfig.getHybridThreshold();
                if (hybridThreshold == null) {
                    hybridThreshold = Double.NEGATIVE_INFINITY;
                }
                // douVal < hybridThreshold which will also be set to category
                boolean belowThreshold = douVal < hybridThreshold;
                if (belowThreshold) {
                    log.warn("douVal " + douVal + ", threshold " + hybridThreshold + ", column " + columnConfig.getColumnName());
                }
                if (binning.isMissingVal(valStr) || Double.isNaN(douVal) || belowThreshold) {
                    this.backUpbinning.addData(valStr);
                }
            }
        }
        if (binning == null) {
            // No usable tuple initialised a column, so there is nothing to serialize.
            return null;
        }
        String binningObjStr = binning.objToString();
        if (this.columnConfig.isHybrid()) {
            binningObjStr += Constants.HYBRID_BIN_STR_DILIMETER + this.backUpbinning.objToString();
        }
        return binningObjStr;
    } finally {
        // Always reset per-call state so a failure cannot leak one column's binning
        // into the next invocation.
        cleanUp();
    }
}
Also used : EqualIntervalBinning(ml.shifu.shifu.core.binning.EqualIntervalBinning) DataBag(org.apache.pig.data.DataBag) EqualPopulationBinning(ml.shifu.shifu.core.binning.EqualPopulationBinning) CategoricalBinning(ml.shifu.shifu.core.binning.CategoricalBinning) Tuple(org.apache.pig.data.Tuple)

Aggregations

CategoricalBinning (ml.shifu.shifu.core.binning.CategoricalBinning): 3, DataBag (org.apache.pig.data.DataBag): 3, Tuple (org.apache.pig.data.Tuple): 3, ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 2, EqualIntervalBinning (ml.shifu.shifu.core.binning.EqualIntervalBinning): 2, EqualPopulationBinning (ml.shifu.shifu.core.binning.EqualPopulationBinning): 2, MunroPatBinning (ml.shifu.shifu.core.binning.MunroPatBinning): 1, NativeBinning (ml.shifu.shifu.core.binning.NativeBinning): 1