Search in sources :

Example 51 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class NormalizeParquetUDF method exec.

/**
 * Converts one raw input record into an output tuple made of the tag, the per-column values and the weight.
 *
 * <p>
 * The record is dropped ({@code null} is returned) when the input is null or empty, when the tag is null,
 * when the sampler rejects the record, or when the tag cannot be mapped by {@code tagTypeCheck}.
 *
 * <p>
 * Per column the output is:
 * <ul>
 * <li>the parsed tag type for the tag column;</li>
 * <li>{@code null} for a column that is not a good candidate;</li>
 * <li>for tree models: the raw string for categorical columns, otherwise the parsed double (or the column's
 * default missing value when parsing fails);</li>
 * <li>for other models: every value returned by {@code Normalizer.normalize}.</li>
 * </ul>
 *
 * @param input
 *            raw record, read position by position against {@code columnConfigList}
 * @return the output tuple, or {@code null} if the record is skipped
 * @throws IOException
 *             if reading a field from the input tuple fails
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    // do data sampling. Unselected data or data with invalid tag will be filtered out.
    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    final String rawTag = CommonUtils.trimTag(tag.toString());
    boolean isNotSampled = DataSampler.isNotSampled(posTags, negTags, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(), rawTag);
    if (isNotSampled) {
        return null;
    }
    // append tuple with tag, normalized value.
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    for (int i = 0; i < input.size(); i++) {
        ColumnConfig config = columnConfigList.get(i);
        String val = (input.get(i) == null) ? "" : input.get(i).toString();
        // load variables for weight calculating.
        if (weightExpr != null) {
            weightContext.set(config.getColumnName(), val);
        }
        // check tag type.
        if (tagColumnNum == i) {
            String tagType = tagTypeCheck(posTags, negTags, rawTag);
            if (tagType == null) {
                log.error("Invalid data! The target value is not listed - " + rawTag);
                return null;
            }
            tuple.append(Integer.parseInt(tagType));
            continue;
        }
        // append normalize data.
        if (!CommonUtils.isGoodCandidate(config, super.hasCandidates)) {
            tuple.append((Double) null);
        } else {
            if (CommonUtils.isTreeModel(this.alg)) {
                if (config.isCategorical()) {
                    // categorical columns are passed through as the raw string and must produce exactly one
                    // field; appending a numeric placeholder as well would shift every following column
                    tuple.append(val);
                } else {
                    Double normVal;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    tuple.append(normVal);
                }
            } else {
                List<Double> normVals = Normalizer.normalize(config, val, cutoff, normType);
                for (Double normVal : normVals) {
                    tuple.append(normVal);
                }
            }
        }
    }
    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) NormType(ml.shifu.shifu.container.obj.ModelNormalizeConf.NormType) Tuple(org.apache.pig.data.Tuple) IOException(java.io.IOException)

Example 52 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class NormalizeUDF method exec.

/**
 * Normalizes (or, in clean mode, only cleans) one raw record and returns it as a tuple of tag, column values
 * and weight.
 *
 * <p>
 * The record is skipped ({@code null} is returned) when:
 * <ul>
 * <li>the input is null or empty, or the tag field is null;</li>
 * <li>the target is not linear and the tag is not contained in {@code tagSet};</li>
 * <li>the target is not linear and the sampler rejects the record;</li>
 * <li>the number of input fields differs from the number of column configs (non-expression mode);</li>
 * <li>the tag cannot be mapped to a class, or a linear tag is not numeric.</li>
 * </ul>
 *
 * <p>
 * In compact norm mode values are first collected in a map keyed by normalized column name and then emitted
 * in the order of {@code outputCompactColumns}; otherwise values are appended in column order. The evaluated
 * weight is always appended last.
 *
 * @param input
 *            raw record
 * @return the output tuple, or {@code null} if the record is skipped
 * @throws IOException
 *             if reading a field from the input tuple fails
 * @throws ShifuException
 *             if more than {@code MAX_MISMATCH_CNT} records had a field count different from the column config
 *             size
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    final String rawTag = CommonUtils.trimTag(tag.toString());
    // make sure all invalid tag record are filter out
    if (!isLinearTarget && !super.tagSet.contains(rawTag)) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    // if(!isLinearTarget && !this.isForClean) {
    if (!isLinearTarget) {
        // do data sampling. Unselected data or data with invalid tag will be filtered out.
        boolean isNotSampled = DataSampler.isNotSampled(modelConfig.isRegression(), super.tagSet, super.posTagSet, super.negTagSet, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(), rawTag);
        if (isNotSampled) {
            return null;
        }
    }
    // append tuple with tag, normalized value.
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    // only allocated in compact norm mode; every later use is guarded by the same flag
    Map<String, Object> compactVarMap = null;
    if (this.isCompactNorm) {
        compactVarMap = new HashMap<String, Object>();
    }
    if (!this.isForExpressions) {
        if (input.size() != this.columnConfigList.size()) {
            // count each malformed record exactly once so that the tolerance really is MAX_MISMATCH_CNT records
            this.mismatchCnt++;
            log.error("the input size - " + input.size() + ", while column size - " + columnConfigList.size());
            // this could make Shifu could skip some malformed data
            if (this.mismatchCnt > MAX_MISMATCH_CNT) {
                throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
            }
            return null;
        }
        for (int i = 0; i < input.size(); i++) {
            ColumnConfig config = columnConfigList.get(i);
            String val = (input.get(i) == null) ? "" : input.get(i).toString();
            // load variables for weight calculating.
            if (weightExpr != null) {
                weightContext.set(new NSColumn(config.getColumnName()).getSimpleName(), val);
            }
            // check tag type.
            if (tagColumnNum == i) {
                if (modelConfig.isRegression()) {
                    // binary target: 1 for a positive tag, 0 for a negative tag
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else if (this.isLinearTarget) {
                    // linear target: the tag itself is the numeric value
                    double tagValue = 0.0;
                    try {
                        tagValue = Double.parseDouble(rawTag);
                    } catch (Exception e) {
                        log.error("Tag - " + rawTag + " is invalid(not numerical). Skip record.");
                        // skip this line
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), tagValue);
                    } else {
                        tuple.append(tagValue);
                    }
                } else {
                    // multi-class target: output the index of the first tag group containing the tag
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isForClean) {
                // for RF/GBT model, only clean data, not real do norm data
                if (config.isCategorical()) {
                    Map<String, Integer> map = this.categoricalIndexMap.get(config.getColumnNum());
                    // map should not be null, no need check if map is null, if val not in binCategory, set it to ""
                    tuple.append(((map.get(val) == null || map.get(val) == -1)) ? "" : val);
                } else {
                    Double normVal = 0d;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    appendOutputValue(tuple, normVal, true);
                }
            } else {
                if (this.isCompactNorm) {
                    // only output features and target, weight in compact norm mode
                    if (!config.isMeta() && config.isFinalSelect()) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            String formatVal = getOutputValue(normVal, true);
                            compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                        }
                    } else if (config.isMeta()) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                    } else {
                    // if is compact mode but such column is not final selected, should be empty, as only append
                    // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                    // we need meta column?
                    }
                } else {
                    // it will cause variable fail to normalize
                    if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            appendOutputValue(tuple, normVal, true);
                        }
                    } else {
                        tuple.append(config.isMeta() ? val : null);
                    }
                }
            }
        }
    } else {
        // for segment expansion variables
        int rawSize = input.size();
        for (int i = 0; i < this.columnConfigList.size(); i++) {
            ColumnConfig config = this.columnConfigList.get(i);
            // column configs beyond the raw record width wrap around onto the raw fields
            int newIndex = i >= rawSize ? i % rawSize : i;
            String val = (input.get(newIndex) == null) ? "" : input.get(newIndex).toString();
            // for target column
            if (config.isTarget()) {
                if (modelConfig.isRegression()) {
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else {
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isCompactNorm) {
                // only output features and target, weight in compact norm mode
                if (!config.isMeta() && config.isFinalSelect()) {
                    // for multiple classification, binPosRate means rate of such category over all counts,
                    // reuse binPosRate for normalize
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        String formatVal = getOutputValue(normVal, true);
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                    }
                } else if (config.isMeta()) {
                    compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                } else {
                // if is compact mode but such column is not final selected, should be empty, as only append
                // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                // we need meta column?
                }
            } else {
                // for others
                if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        appendOutputValue(tuple, normVal, true);
                    }
                } else {
                    tuple.append(config.isMeta() ? val : null);
                }
            }
        }
    }
    // for compact norm mode, output to tuple at here
    if (this.isCompactNorm) {
        for (int i = 0; i < outputCompactColumns.size(); i++) {
            tuple.append(compactVarMap.get(outputCompactColumns.get(i)));
        }
    }
    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) NormType(ml.shifu.shifu.container.obj.ModelNormalizeConf.NormType) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException) Tuple(org.apache.pig.data.Tuple) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 53 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class CommonUtils method loadColumnConfigList.

/**
 * Reads the column configuration JSON and returns its entries as a list, building the category lookup map
 * of every categorical column that has bin categories along the way.
 *
 * @param path
 *            file path
 * @param sourceType
 *            source type: hdfs or local
 * @param nullSampleValues
 *            when {@code true} the sample values of every column are reset to null to save memory (useful in
 *            Pig UDFs, where a large ColumnConfig.json can otherwise cause OOM)
 * @return column config list
 * @throws IOException
 *             if any IO exception in parsing json.
 * @throws IllegalArgumentException
 *             if {@code path} is null or empty, if sourceType is null.
 */
public static List<ColumnConfig> loadColumnConfigList(String path, SourceType sourceType, boolean nullSampleValues) throws IOException {
    ColumnConfig[] loaded = loadJSON(path, sourceType, ColumnConfig[].class);
    List<ColumnConfig> result = new ArrayList<ColumnConfig>();
    for (ColumnConfig current : loaded) {
        if (nullSampleValues) {
            // drop sample values, they are not needed downstream and only cost memory
            current.setSampleValues(null);
        }
        boolean hasBinCategories = current.isCategorical() && current.getColumnBinning() != null && current.getColumnBinning().getBinCategory() != null;
        if (hasBinCategories) {
            // value -> bin index map for fast lookup
            Map<String, Integer> valueToBin = new HashMap<String, Integer>();
            int binIndex = 0;
            for (String binCategory : current.getColumnBinning().getBinCategory()) {
                String[] members;
                if (binCategory.contains(Constants.CATEGORICAL_GROUP_VAL_DELIMITER)) {
                    // a merged bin is flattened: every member value maps to the same bin index
                    // (project split helper is used to avoid depending on guava)
                    members = ml.shifu.shifu.core.dtrain.StringUtils.split(binCategory, Constants.CATEGORICAL_GROUP_VAL_DELIMITER);
                } else {
                    members = new String[] { binCategory };
                }
                for (String member : members) {
                    valueToBin.put(member, binIndex);
                }
                binIndex++;
            }
            current.getColumnBinning().setBinCateMap(valueToBin);
        }
        result.add(current);
    }
    return result;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig)

Example 54 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class BinningDataMergeUDF method exec.

/**
 * Merges all partial binning strings collected for one column into a single bin string.
 *
 * <p>
 * The input is expected to carry the column id at position 0 and a bag of tuples at position 1; the partial
 * binning string is read from position 1 of each bag element. For hybrid columns the string is split into a
 * numeric part and a categorical part which are merged separately and joined again in the output.
 *
 * <p>
 * The second output field is the empty string when no partial binning could be read from the bag, or when a
 * categorical column ends up with more than {@code maxCategorySize} bins.
 *
 * @param input
 *            tuple of (columnId, bag of partial binning tuples)
 * @return tuple of (columnId, merged bin string), or {@code null} if the input is null, has fewer than two
 *         fields, or its column id or bag is null
 * @throws IOException
 *             if reading from the input tuple or setting the output tuple fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    // same shape check as BinningDataUDF: both fields are read unconditionally below
    if (input == null || input.size() < 2) {
        return null;
    }
    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    if (columnId == null || databag == null) {
        // nothing to merge; avoids NPE on unboxing the id or on databag.size()
        return null;
    }
    // ids beyond the config list wrap around onto the existing column configs
    int corrColumnId = columnId;
    if (corrColumnId >= super.columnConfigList.size()) {
        corrColumnId = corrColumnId % super.columnConfigList.size();
    }
    ColumnConfig columnConfig = super.columnConfigList.get(corrColumnId);
    AbstractBinning<?> binning = null;
    AbstractBinning<?> backupBinning = null;
    log.info("Start merging bin info for columnId - " + columnId + ", the bag size is - " + databag.size());
    Iterator<Tuple> iterator = databag.iterator();
    while (iterator.hasNext()) {
        Tuple element = iterator.next();
        if (element == null || element.size() < 2) {
            continue;
        }
        String objValStr = (String) element.get(1);
        String hybridCateValStr = null;
        long start = System.currentTimeMillis();
        // for hybrid, split
        if (columnConfig.isHybrid()) {
            String[] splits = CommonUtils.split(objValStr, Constants.HYBRID_BIN_STR_DILIMETER);
            objValStr = splits[0];
            hybridCateValStr = splits[1];
        }
        AbstractBinning<?> partialBinning = AbstractBinning.constructBinningFromStr(modelConfig, columnConfig, objValStr);
        AbstractBinning<?> partialBackupBinning = null;
        if (columnConfig.isHybrid()) {
            partialBackupBinning = new CategoricalBinning();
            partialBackupBinning.stringToObj(hybridCateValStr);
        }
        log.info("constructBinningFromStr: " + (System.currentTimeMillis() - start) + "ms");
        start = System.currentTimeMillis();
        if (binning == null) {
            // first usable element seeds the accumulators
            binning = partialBinning;
            if (columnConfig.isHybrid()) {
                backupBinning = partialBackupBinning;
            }
        } else {
            binning.mergeBin(partialBinning);
            if (columnConfig.isHybrid()) {
                backupBinning.mergeBin(partialBackupBinning);
            }
        }
        log.info("mergeBin: " + (System.currentTimeMillis() - start) + "ms");
    }
    Tuple output = TupleFactory.getInstance().newTuple(2);
    output.set(0, columnId);
    if (binning == null) {
        // empty bag or only malformed elements: emit an empty bin string instead of failing with NPE
        log.warn("No bin info found for columnId - " + columnId + ", output empty bin info.");
        output.set(1, "");
        return output;
    }
    List<?> binFields = binning.getDataBin();
    // it will consume too much memory when join them together, that will cause OOM exception
    if (columnConfig.isCategorical() && binFields.size() > this.maxCategorySize) {
        log.warn(columnId + " " + columnConfig.getColumnName() + " is over maximal categorical size: " + this.maxCategorySize);
        output.set(1, "");
    } else {
        if (columnConfig.isHybrid()) {
            String finalBinStr = StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            finalBinStr += Constants.HYBRID_BIN_STR_DILIMETER + StringUtils.join(backupBinning.getDataBin(), CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
            output.set(1, finalBinStr);
        } else {
            output.set(1, StringUtils.join(binFields, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
        }
    }
    log.info("Finish merging bin info for columnId - " + columnId);
    return output;
}
Also used : DataBag(org.apache.pig.data.DataBag) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) CategoricalBinning(ml.shifu.shifu.core.binning.CategoricalBinning) Tuple(org.apache.pig.data.Tuple)

Example 55 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class BinningDataUDF method exec.

/**
 * Builds the bin string of one column from the raw values collected for it.
 *
 * <p>
 * The input is expected to carry the column id at position 0 and a bag of tuples at position 1; the value is
 * read from position 1 of each bag element and null values are ignored. Categorical columns use
 * {@code CategoricalBinning}; other columns use the binning selected by the model config's binning method and
 * binning algorithm.
 *
 * <p>
 * The second output field is the empty string when a categorical column yields more than
 * {@code maxCategorySize} bins.
 *
 * @param input
 *            tuple of (columnId, bag of value tuples)
 * @return tuple of (columnId, bin string), or {@code null} if the input is null, has fewer than two fields,
 *         or its column id or bag is null
 * @throws IOException
 *             if reading from the input tuple or setting the output tuple fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
        return null;
    }
    Integer columnId = (Integer) input.get(0);
    DataBag databag = (DataBag) input.get(1);
    if (columnId == null || databag == null) {
        // nothing to bin; avoids NPE on unboxing the id or on databag.iterator()
        return null;
    }
    ColumnConfig columnConfig = super.columnConfigList.get(columnId);
    AbstractBinning<?> binning = null;
    if (columnConfig.isCategorical()) {
        binning = new CategoricalBinning(-1, super.modelConfig.getMissingOrInvalidValues(), this.maxCategorySize);
    } else {
        if (super.modelConfig.getBinningMethod().equals(BinningMethod.EqualInterval)) {
            binning = new EqualIntervalBinning(modelConfig.getStats().getMaxNumBin());
        } else {
            switch(this.modelConfig.getBinningAlgorithm()) {
                case Native:
                    log.info("Invoke Native binning method, memory cosuming!!");
                    // always merge bins
                    binning = new NativeBinning(modelConfig.getStats().getMaxNumBin(), true);
                    break;
                case SPDT:
                case SPDTI:
                    log.info("Invoke SPDT(Streaming Parallel Decision Tree) binning method, ");
                    binning = new EqualPopulationBinning(modelConfig.getStats().getMaxNumBin());
                    break;
                case MunroPat:
                case MunroPatI:
                    log.info("Invoke Munro & Paterson selecting algorithm");
                    binning = new MunroPatBinning(modelConfig.getStats().getMaxNumBin());
                    break;
                default:
                    log.info("Default: Invoke Munro & Paterson selecting algorithm");
                    binning = new MunroPatBinning(modelConfig.getStats().getMaxNumBin());
                    break;
            }
        }
    }
    Iterator<Tuple> iterator = databag.iterator();
    while (iterator.hasNext()) {
        Tuple element = iterator.next();
        if (element == null || element.size() < 2) {
            continue;
        }
        Object value = element.get(1);
        if (value != null) {
            binning.addData(value.toString());
        }
    }
    Tuple output = TupleFactory.getInstance().newTuple(2);
    output.set(0, columnId);
    // Do check here. It's because if there are too many value for categorical variable,
    // it will consume too much memory when join them together, that will cause OOM exception.
    // The limit is a category limit, so it is only applied to categorical columns (same rule as
    // BinningDataMergeUDF); numeric bins are always emitted.
    List<?> dataBin = binning.getDataBin();
    if (columnConfig.isCategorical() && dataBin.size() > this.maxCategorySize) {
        output.set(1, "");
    } else {
        output.set(1, StringUtils.join(dataBin, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
    }
    log.info("Finish merging bin info for columnId - " + columnId);
    return output;
}
Also used : EqualIntervalBinning(ml.shifu.shifu.core.binning.EqualIntervalBinning) DataBag(org.apache.pig.data.DataBag) NativeBinning(ml.shifu.shifu.core.binning.NativeBinning) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) EqualPopulationBinning(ml.shifu.shifu.core.binning.EqualPopulationBinning) MunroPatBinning(ml.shifu.shifu.core.binning.MunroPatBinning) CategoricalBinning(ml.shifu.shifu.core.binning.CategoricalBinning) Tuple(org.apache.pig.data.Tuple)

Aggregations

ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)131 ArrayList (java.util.ArrayList)36 Test (org.testng.annotations.Test)17 IOException (java.io.IOException)16 HashMap (java.util.HashMap)12 Tuple (org.apache.pig.data.Tuple)10 File (java.io.File)8 NSColumn (ml.shifu.shifu.column.NSColumn)8 ModelConfig (ml.shifu.shifu.container.obj.ModelConfig)8 ShifuException (ml.shifu.shifu.exception.ShifuException)8 Path (org.apache.hadoop.fs.Path)8 List (java.util.List)7 Scanner (java.util.Scanner)7 DataBag (org.apache.pig.data.DataBag)7 SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType)5 BasicFloatNetwork (ml.shifu.shifu.core.dtrain.dataset.BasicFloatNetwork)5 TrainingDataSet (ml.shifu.shifu.core.dvarsel.dataset.TrainingDataSet)5 BasicMLData (org.encog.ml.data.basic.BasicMLData)5 BufferedWriter (java.io.BufferedWriter)3 FileInputStream (java.io.FileInputStream)3