use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.
the class BinaryWDLSerializer method save.
/**
 * Writes a WideAndDeep model, together with the column statistics it depends on, to
 * {@code output} as a gzip-compressed binary stream.
 *
 * <p>Fields are written in this order: format version (int), four reserved fields (two floats,
 * one double, one UTF string), the normalization type name, the number of column stats entries
 * followed by each entry, and finally the model itself.
 *
 * <p>Only columns whose column number is a key of the map returned by
 * {@code getIndexNameMapping(columnConfigList)} get a stats entry.
 *
 * @param modelConfig      model configuration; supplies the norm type and the std-dev cutoff
 * @param columnConfigList all column configurations
 * @param wideAndDeep      the model to persist
 * @param fs               file system used to create the output file
 * @param output           destination path
 * @throws IOException if creating, writing or closing the output stream fails
 */
public static void save(ModelConfig modelConfig, List<ColumnConfig> columnConfigList, WideAndDeep wideAndDeep, FileSystem fs, Path output) throws IOException {
    // try-with-resources instead of a quiet close: the GZIP trailer is only written on close(),
    // so a failure while closing must reach the caller rather than leave a truncated file
    // behind unnoticed.
    try (DataOutputStream fos = new DataOutputStream(new GZIPOutputStream(fs.create(output)))) {
        // version
        fos.writeInt(CommonConstants.WDL_FORMAT_VERSION);
        // Reserved two float field, one double field and one string field
        fos.writeFloat(0.0f);
        fos.writeFloat(0.0f);
        fos.writeDouble(0.0d);
        fos.writeUTF("Reserved field");
        // write normStr
        String normStr = modelConfig.getNormalize().getNormType().toString();
        StringUtils.writeString(fos, normStr);
        // compute columns needed
        Map<Integer, String> columnIndexNameMapping = getIndexNameMapping(columnConfigList);
        // collect column stats for the needed columns
        List<NNColumnStats> csList = new ArrayList<>();
        for (ColumnConfig cc : columnConfigList) {
            if (!columnIndexNameMapping.containsKey(cc.getColumnNum())) {
                continue;
            }
            NNColumnStats cs = new NNColumnStats();
            cs.setCutoff(modelConfig.getNormalizeStdDevCutOff());
            cs.setColumnType(cc.getColumnType());
            cs.setMean(cc.getMean());
            cs.setStddev(cc.getStdDev());
            cs.setColumnNum(cc.getColumnNum());
            cs.setColumnName(cc.getColumnName());
            cs.setBinCategories(cc.getBinCategory());
            cs.setBinBoundaries(cc.getBinBoundary());
            cs.setBinPosRates(cc.getBinPosRate());
            cs.setBinCountWoes(cc.getBinCountWoe());
            cs.setBinWeightWoes(cc.getBinWeightedWoe());
            // TODO cache such computation
            double[] meanAndStdDev = Normalizer.calculateWoeMeanAndStdDev(cc, false);
            cs.setWoeMean(meanAndStdDev[0]);
            cs.setWoeStddev(meanAndStdDev[1]);
            double[] weightMeanAndStdDev = Normalizer.calculateWoeMeanAndStdDev(cc, true);
            cs.setWoeWgtMean(weightMeanAndStdDev[0]);
            cs.setWoeWgtStddev(weightMeanAndStdDev[1]);
            csList.add(cs);
        }
        // write column stats to output: entry count first, then each entry
        fos.writeInt(csList.size());
        for (NNColumnStats cs : csList) {
            cs.write(fos);
        }
        // persist WideAndDeep Model
        wideAndDeep.write(fos);
    }
}
use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.
the class CalculateNewStatsUDF method exec.
/*
 * Runs the variable statistics for one column and flattens the outcome into a tuple.
 *
 * Reads the column id from field 0, the data bag from field 1 and the binning info string from
 * field 3 of the input. Returns null when the input is null, when a categorical column has no
 * categories or more than maxCategorySize of them, or when a non-categorical column has exactly
 * one bin boundary.
 *
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null) {
        return null;
    }

    Integer colId = (Integer) input.get(0);
    DataBag records = (DataBag) input.get(1);
    String binInfo = (String) input.get(3);

    log.info("start to process column id - " + colId.toString());
    ColumnConfig cc = super.columnConfigList.get(colId);
    AbstractVarStats statsRunner = AbstractVarStats.getVarStatsInst(modelConfig, cc, valueThreshold);
    statsRunner.runVarStats(binInfo, records);
    log.info("after to process column id - " + colId.toString());

    ColumnMetrics countMetrics = ColumnStatsCalculator.calculateColumnMetrics(cc.getBinCountNeg(),
            cc.getBinCountPos());
    ColumnMetrics weightMetrics = ColumnStatsCalculator.calculateColumnMetrics(cc.getBinWeightedNeg(),
            cc.getBinWeightedPos());

    // Assemble the results; field order is significant for the consumer of this tuple.
    Tuple result = TupleFactory.getInstance().newTuple();
    result.append(colId);

    // Binning description: base64-encoded category list or the boundary list.
    if (cc.isCategorical()) {
        int categoryCount = cc.getBinCategory().size();
        boolean unusable = categoryCount == 0 || categoryCount > this.maxCategorySize;
        if (unusable) {
            return null;
        }
        String joined = StringUtils.join(cc.getBinCategory(), CalculateStatsUDF.CATEGORY_VAL_SEPARATOR);
        result.append(Base64Utils.base64Encode("[" + joined + "]"));
    } else {
        boolean singleBoundary = cc.getBinBoundary().size() == 1;
        if (singleBoundary) {
            return null;
        }
        result.append(cc.getBinBoundary().toString());
    }

    // Per-bin counts and rates.
    result.append(cc.getBinCountNeg().toString());
    result.append(cc.getBinCountPos().toString());
    result.append(cc.getBinAvgScore().toString());
    result.append(cc.getBinPosRate().toString());

    // Count-based KS / IV followed by the basic column statistics.
    result.append(df.format(countMetrics.getKs()));
    result.append(df.format(countMetrics.getIv()));
    result.append(df.format(cc.getColumnStats().getMax()));
    result.append(df.format(cc.getColumnStats().getMin()));
    result.append(df.format(cc.getColumnStats().getMean()));
    result.append(df.format(cc.getColumnStats().getStdDev()));

    // Column type flag: "C" for categorical, "N" otherwise.
    result.append(cc.isCategorical() ? "C" : "N");

    result.append(df.format(cc.getColumnStats().getMedian()));
    result.append(cc.getMissingCount());
    result.append(cc.getTotalCount());
    result.append(df.format(cc.getMissingPercentage()));

    // Weighted bins and the WOE / KS / IV values.
    result.append(cc.getBinWeightedNeg().toString());
    result.append(cc.getBinWeightedPos().toString());
    result.append(countMetrics.getWoe());
    result.append(weightMetrics.getWoe());
    result.append(df.format(weightMetrics.getKs()));
    result.append(df.format(weightMetrics.getIv()));
    result.append(countMetrics.getBinningWoe().toString());
    result.append(weightMetrics.getBinningWoe().toString());

    // Distribution shape.
    result.append(cc.getColumnStats().getSkewness());
    result.append(cc.getColumnStats().getKurtosis());

    return result;
}
use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.
the class NormalUtils method getAllFeatureList.
/**
 * Collects the column numbers of all usable features from a ColumnConfig list.
 * <p>
 * A column is considered when it is neither target nor meta and, depending on the mode, is
 * either final-selected ({@code isAfterVarSelect == true}) or a good candidate
 * ({@code isAfterVarSelect == false}). A considered column is accepted when it is numerical with
 * at least one bin boundary or categorical with at least one bin category. A considered
 * numerical or categorical column without such bins is reported as wrong.
 *
 * @param columnConfigList - ColumnConfig list to check
 * @param isAfterVarSelect - true for training, false for variable selection
 * @return - available feature list
 * @throws IllegalStateException if any considered column has missing or empty bin information
 */
public static List<Integer> getAllFeatureList(List<ColumnConfig> columnConfigList, boolean isAfterVarSelect) {
    boolean hasCandidate = CommonUtils.hasCandidateColumns(columnConfigList);
    List<Integer> selected = new ArrayList<Integer>();
    List<String> badColumns = new ArrayList<String>();

    for (ColumnConfig column : columnConfigList) {
        boolean considered;
        if (isAfterVarSelect) {
            considered = column.isFinalSelect() && !column.isTarget() && !column.isMeta();
        } else {
            considered = !column.isMeta() && !column.isTarget()
                    && CommonUtils.isGoodCandidate(column, hasCandidate);
        }
        if (!considered) {
            continue;
        }

        // numerical feature needs a non-empty bin boundary list,
        // categorical feature needs a non-empty bin category list
        boolean numericWithBins = column.isNumerical() && column.getBinBoundary() != null
                && !column.getBinBoundary().isEmpty();
        boolean categoricalWithBins = column.isCategorical() && column.getBinCategory() != null
                && !column.getBinCategory().isEmpty();

        if (numericWithBins || categoricalWithBins) {
            selected.add(column.getColumnNum());
        } else if (column.isNumerical() || column.isCategorical()) {
            // typed column without usable bins
            badColumns.add(column.getColumnName());
        }
    }

    if (!badColumns.isEmpty()) {
        throw new IllegalStateException("Some columns config should not be selected due to bin issue: " + badColumns.toString());
    }
    return selected;
}
use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.
the class EvalNormUDF method outputSchema.
/**
 * Builds the output schema for evaluation normalization.
 * <p>
 * The first {@code 2 + validMetaSize} output names become CHARARRAY fields. Every later name
 * becomes a field of the configured output type, preceded by a raw-value field (name plus
 * {@code ORIG_POSTFIX}) when raw output is enabled. A DOUBLE score field is appended when score
 * appending is enabled. Returns null if an IOException occurs while building the schema.
 */
public Schema outputSchema(Schema input) {
    try {
        Schema fields = new Schema();
        for (int idx = 0; idx < this.outputNames.size(); idx++) {
            String fieldName = normColumnName(this.outputNames.get(idx));

            // set target, weight and meta columns to string
            if (idx < 2 + validMetaSize) {
                fields.add(new FieldSchema(fieldName, DataType.CHARARRAY));
                continue;
            }

            if (this.isOutputRaw) {
                // raw value keeps the output type for numerical columns, string otherwise
                ColumnConfig columnConfig = this.columnConfigMap.get(fieldName);
                fields.add(new FieldSchema(fieldName + ORIG_POSTFIX,
                        columnConfig.isNumerical() ? getOutputType() : DataType.CHARARRAY));
            }
            fields.add(new FieldSchema(fieldName, getOutputType()));
        }

        if (this.isAppendScore) {
            String scoreField = StringUtils.isBlank(this.scoreName) ? "default_score" : this.scoreName;
            fields.add(new FieldSchema(scoreField, DataType.DOUBLE));
        }

        return new Schema(new FieldSchema("EvalNorm", fields, DataType.TUPLE));
    } catch (IOException e) {
        log.error("Error in outputSchema", e);
        return null;
    }
}
use of ml.shifu.shifu.container.obj.ColumnConfig in project shifu by ShifuML.
the class FilterBinningDataUDF method exec.
/*
 * Decides whether a binning record is kept.
 *
 * Field 0 of the input is the column number and field 2 is the positive flag. A null column
 * number yields false; otherwise the result is whatever isValidRecord reports for the column.
 *
 * @see org.apache.pig.EvalFunc#exec(org.apache.pig.data.Tuple)
 */
@Override
public Boolean exec(Tuple input) throws IOException {
    Integer columnIndex = (Integer) input.get(0);
    if (columnIndex == null) {
        return false;
    }

    ColumnConfig column = columnConfigList.get(columnIndex);
    boolean positive = (Boolean) input.get(2);
    boolean keep = isValidRecord(modelConfig.isRegression(), positive, column);
    return keep;
}
Aggregations