Search in sources :

Example 46 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class BinaryWDLSerializer method save.

/**
 * Writes a Wide-and-Deep model and the column statistics it depends on to {@code output}
 * as a GZIP-compressed binary stream.
 *
 * <p>Fields are written in this order: format version, reserved fields (two floats, one
 * double, one UTF string), normalization type string, number of column-stats entries, each
 * column-stats entry, and finally the model itself.
 *
 * @param modelConfig      model configuration; supplies the norm type and the std-dev cutoff
 * @param columnConfigList column configurations; only columns whose number is a key of the
 *                         mapping returned by {@code getIndexNameMapping} are written
 * @param wideAndDeep      model to persist
 * @param fs               file system used to create the output file
 * @param output           destination path
 * @throws IOException if creating, writing, or closing the output stream fails
 */
public static void save(ModelConfig modelConfig, List<ColumnConfig> columnConfigList, WideAndDeep wideAndDeep, FileSystem fs, Path output) throws IOException {
    // try-with-resources instead of a quiet close in finally: closing the GZIP stream flushes
    // the remaining compressed data and the trailer, so a failure at close time means the
    // file is incomplete and must be reported to the caller rather than ignored.
    try (DataOutputStream fos = new DataOutputStream(new GZIPOutputStream(fs.create(output)))) {
        // version
        fos.writeInt(CommonConstants.WDL_FORMAT_VERSION);
        // Reserved two float field, one double field and one string field
        fos.writeFloat(0.0f);
        fos.writeFloat(0.0f);
        fos.writeDouble(0.0d);
        fos.writeUTF("Reserved field");
        // write normStr
        String normStr = modelConfig.getNormalize().getNormType().toString();
        StringUtils.writeString(fos, normStr);
        // compute columns needed
        Map<Integer, String> columnIndexNameMapping = getIndexNameMapping(columnConfigList);
        // collect stats for the needed columns first, because the entry count is written
        // before the entries themselves
        List<NNColumnStats> csList = new ArrayList<>();
        for (ColumnConfig cc : columnConfigList) {
            if (columnIndexNameMapping.containsKey(cc.getColumnNum())) {
                NNColumnStats cs = new NNColumnStats();
                cs.setCutoff(modelConfig.getNormalizeStdDevCutOff());
                cs.setColumnType(cc.getColumnType());
                cs.setMean(cc.getMean());
                cs.setStddev(cc.getStdDev());
                cs.setColumnNum(cc.getColumnNum());
                cs.setColumnName(cc.getColumnName());
                cs.setBinCategories(cc.getBinCategory());
                cs.setBinBoundaries(cc.getBinBoundary());
                cs.setBinPosRates(cc.getBinPosRate());
                cs.setBinCountWoes(cc.getBinCountWoe());
                cs.setBinWeightWoes(cc.getBinWeightedWoe());
                // TODO cache such computation
                // index 0 is used as the mean, index 1 as the standard deviation
                double[] meanAndStdDev = Normalizer.calculateWoeMeanAndStdDev(cc, false);
                cs.setWoeMean(meanAndStdDev[0]);
                cs.setWoeStddev(meanAndStdDev[1]);
                double[] weightMeanAndStdDev = Normalizer.calculateWoeMeanAndStdDev(cc, true);
                cs.setWoeWgtMean(weightMeanAndStdDev[0]);
                cs.setWoeWgtStddev(weightMeanAndStdDev[1]);
                csList.add(cs);
            }
        }
        fos.writeInt(csList.size());
        for (NNColumnStats cs : csList) {
            cs.write(fos);
        }
        // persist WideAndDeep Model
        wideAndDeep.write(fos);
    }
}
Also used : NNColumnStats(ml.shifu.shifu.core.dtrain.nn.NNColumnStats) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) GZIPOutputStream(java.util.zip.GZIPOutputStream) DataOutputStream(java.io.DataOutputStream) ArrayList(java.util.ArrayList)

Example 47 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class CalculateNewStatsUDF method exec.

/**
 * Runs variable statistics for one column and packs the results into an output tuple.
 *
 * <p>Reads the column id from field 0, the data bag from field 1 and the binning info string
 * from field 3 of {@code input}. Returns {@code null} when {@code input} is null, when a
 * categorical column has no categories or more than {@code maxCategorySize} categories, or
 * when a non-categorical column has exactly one bin boundary.
 *
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (null == input) {
        return null;
    }

    Integer colId = (Integer) input.get(0);
    DataBag records = (DataBag) input.get(1);
    String binningInfo = (String) input.get(3);

    log.info("start to process column id - " + colId.toString());
    ColumnConfig config = super.columnConfigList.get(colId);
    AbstractVarStats statsRunner = AbstractVarStats.getVarStatsInst(modelConfig, config, valueThreshold);
    statsRunner.runVarStats(binningInfo, records);
    log.info("after to process column id - " + colId.toString());

    // Metrics from raw bin counts and from weighted bin counts
    ColumnMetrics countMetrics = ColumnStatsCalculator.calculateColumnMetrics(config.getBinCountNeg(), config.getBinCountPos());
    ColumnMetrics weightMetrics = ColumnStatsCalculator.calculateColumnMetrics(config.getBinWeightedNeg(), config.getBinWeightedPos());

    // Build the output tuple; the order of the appends defines the output layout
    Tuple result = TupleFactory.getInstance().newTuple();
    result.append(colId);

    if (!config.isCategorical()) {
        // non-categorical column: skip when there is exactly one bin boundary
        if (config.getBinBoundary().size() == 1) {
            return null;
        }
        result.append(config.getBinBoundary().toString());
    } else {
        // categorical column: skip when there are no categories or too many of them
        int categoryCount = config.getBinCategory().size();
        if (categoryCount == 0 || categoryCount > this.maxCategorySize) {
            return null;
        }
        String joinedCategories = StringUtils.join(config.getBinCategory(), CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
        result.append(Base64Utils.base64Encode("[" + joinedCategories + "]"));
    }

    result.append(config.getBinCountNeg().toString());
    result.append(config.getBinCountPos().toString());
    result.append(config.getBinAvgScore().toString());
    result.append(config.getBinPosRate().toString());

    result.append(df.format(countMetrics.getKs()));
    result.append(df.format(countMetrics.getIv()));

    result.append(df.format(config.getColumnStats().getMax()));
    result.append(df.format(config.getColumnStats().getMin()));
    result.append(df.format(config.getColumnStats().getMean()));
    result.append(df.format(config.getColumnStats().getStdDev()));

    // column type flag: "C" for categorical, "N" otherwise
    result.append(config.isCategorical() ? "C" : "N");

    result.append(df.format(config.getColumnStats().getMedian()));
    result.append(config.getMissingCount());
    result.append(config.getTotalCount());
    result.append(df.format(config.getMissingPercentage()));

    result.append(config.getBinWeightedNeg().toString());
    result.append(config.getBinWeightedPos().toString());

    result.append(countMetrics.getWoe());
    result.append(weightMetrics.getWoe());
    result.append(df.format(weightMetrics.getKs()));
    result.append(df.format(weightMetrics.getIv()));
    result.append(countMetrics.getBinningWoe().toString());
    result.append(weightMetrics.getBinningWoe().toString());

    result.append(config.getColumnStats().getSkewness());
    result.append(config.getColumnStats().getKurtosis());
    return result;
}
Also used : DataBag(org.apache.pig.data.DataBag) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) AbstractVarStats(ml.shifu.shifu.udf.stats.AbstractVarStats) ColumnMetrics(ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics) Tuple(org.apache.pig.data.Tuple)

Example 48 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class NormalUtils method getAllFeatureList.

/**
 * Collects the column numbers of all usable features from a ColumnConfig list.
 * This is used in two situations:
 * 1) before training a model, to get the final selected features
 * 2) before variable selection, to get the candidate features
 *
 * A column that passes the selection test but has no usable bins (null or empty bin
 * boundaries for a numerical column, null or empty bin categories for a categorical one)
 * is treated as a configuration error.
 *
 * @param columnConfigList - ColumnConfig list to check
 * @param isAfterVarSelect - true for training, false for variable selection
 * @return - available feature list
 * @throws IllegalStateException if any selected column has no usable bins
 */
public static List<Integer> getAllFeatureList(List<ColumnConfig> columnConfigList, boolean isAfterVarSelect) {
    boolean hasCandidate = CommonUtils.hasCandidateColumns(columnConfigList);
    List<Integer> selectedIds = new ArrayList<Integer>();
    List<String> badColumnNames = new ArrayList<String>();

    for (ColumnConfig config : columnConfigList) {
        // Step 1: decide whether this column is in scope for the requested mode
        boolean inScope;
        if (isAfterVarSelect) {
            inScope = config.isFinalSelect() && !config.isTarget() && !config.isMeta();
        } else {
            inScope = !config.isMeta() && !config.isTarget() && CommonUtils.isGoodCandidate(config, hasCandidate);
        }
        if (!inScope) {
            continue;
        }

        // Step 2: accept a numerical feature with at least one bin boundary,
        // or a categorical feature with at least one bin category
        if ((config.isNumerical() && config.getBinBoundary() != null && config.getBinBoundary().size() > 0)
                || (config.isCategorical() && config.getBinCategory() != null && config.getBinCategory().size() > 0)) {
            selectedIds.add(config.getColumnNum());
            continue;
        }

        // Step 3: a numerical/categorical column in scope but without bins is reported
        boolean numericalWithoutBins = config.isNumerical()
                && (config.getBinBoundary() == null || config.getBinBoundary().size() <= 0);
        if (numericalWithoutBins
                || (config.isCategorical() && (config.getBinCategory() == null || config.getBinCategory().size() <= 0))) {
            badColumnNames.add(config.getColumnName());
        }
    }

    if (badColumnNames.isEmpty()) {
        return selectedIds;
    }
    throw new IllegalStateException("Some columns config should not be selected due to bin issue: " + badColumnNames.toString());
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig)

Example 49 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class EvalNormUDF method outputSchema.

/**
 * Builds the output schema of the normalized evaluation tuple.
 *
 * The first {@code 2 + validMetaSize} output names (target, weight and meta columns) are
 * declared as chararray. Every later name is declared with {@code getOutputType()}, preceded
 * by an extra raw-value field (name suffixed with {@code ORIG_POSTFIX}) when raw output is
 * enabled. A double score field is appended last when score appending is enabled.
 *
 * @return the tuple schema named "EvalNorm", or null if building the schema fails
 */
public Schema outputSchema(Schema input) {
    try {
        Schema fields = new Schema();
        int leadingStringFields = 2 + validMetaSize;
        for (int idx = 0; idx < this.outputNames.size(); idx++) {
            String fieldName = normColumnName(this.outputNames.get(idx));
            if (idx < leadingStringFields) {
                // set target, weight and meta columns to string
                fields.add(new FieldSchema(fieldName, DataType.CHARARRAY));
                continue;
            }
            if (this.isOutputRaw) {
                // raw value keeps the output type for numerical columns, string otherwise
                ColumnConfig rawConfig = this.columnConfigMap.get(fieldName);
                fields.add(new FieldSchema(fieldName + ORIG_POSTFIX,
                        rawConfig.isNumerical() ? getOutputType() : DataType.CHARARRAY));
            }
            fields.add(new FieldSchema(fieldName, getOutputType()));
        }
        if (this.isAppendScore) {
            String scoreField = StringUtils.isBlank(this.scoreName) ? "default_score" : this.scoreName;
            fields.add(new FieldSchema(scoreField, DataType.DOUBLE));
        }
        return new Schema(new FieldSchema("EvalNorm", fields, DataType.TUPLE));
    } catch (IOException e) {
        log.error("Error in outputSchema", e);
        return null;
    }
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) Schema(org.apache.pig.impl.logicalLayer.schema.Schema) FieldSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema) FieldSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema) IOException(java.io.IOException)

Example 50 with ColumnConfig

use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.

the class FilterBinningDataUDF method exec.

/**
 * Decides whether a binning record is kept by this filter.
 *
 * <p>Reads the column number from field 0 and the positive flag from field 2 of
 * {@code input}, then delegates the decision to {@code isValidRecord}. Returns {@code false}
 * when the tuple itself, the column number or the positive flag is null, so that an
 * incomplete record is filtered out instead of failing with a NullPointerException.
 *
 * @param input tuple holding the column number at index 0 and the positive flag at index 2
 * @return true if the record is valid for its column, false otherwise
 * @throws IOException if reading a field from the tuple fails
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Boolean exec(Tuple input) throws IOException {
    if (input == null) {
        return false;
    }
    Integer columnNum = (Integer) input.get(0);
    if (columnNum == null) {
        return false;
    }
    ColumnConfig columnConfig = columnConfigList.get(columnNum);
    // keep the flag boxed until it is checked: unboxing a null Boolean would throw
    Boolean isPositive = (Boolean) input.get(2);
    if (isPositive == null) {
        return false;
    }
    return isValidRecord(modelConfig.isRegression(), isPositive, columnConfig);
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig)

Aggregations

ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)131 ArrayList (java.util.ArrayList)36 Test (org.testng.annotations.Test)17 IOException (java.io.IOException)16 HashMap (java.util.HashMap)12 Tuple (org.apache.pig.data.Tuple)10 File (java.io.File)8 NSColumn (ml.shifu.shifu.column.NSColumn)8 ModelConfig (ml.shifu.shifu.container.obj.ModelConfig)8 ShifuException (ml.shifu.shifu.exception.ShifuException)8 Path (org.apache.hadoop.fs.Path)8 List (java.util.List)7 Scanner (java.util.Scanner)7 DataBag (org.apache.pig.data.DataBag)7 SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType)5 BasicFloatNetwork (ml.shifu.shifu.core.dtrain.dataset.BasicFloatNetwork)5 TrainingDataSet (ml.shifu.shifu.core.dvarsel.dataset.TrainingDataSet)5 BasicMLData (org.encog.ml.data.basic.BasicMLData)5 BufferedWriter (java.io.BufferedWriter)3 FileInputStream (java.io.FileInputStream)3