Examples with DataPurifier - ml.shifu.shifu.core.DataPurifier

Example 1 with DataPurifier

use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

the class AddColumnNumAndFilterUDF method exec.

/**
 * Expands a single input record into a bag of per-column tuples.
 *
 * <p>The record is dropped ({@code null} is returned) when the input is {@code null}, when its
 * field count is zero or differs from {@code columnConfigList}, when the tag field is missing
 * or not acceptable, or when the record is sampled out against the binning sample rate.
 * For every surviving record, one tuple is emitted per column accepted by
 * {@code isValidRecord}; when expression mode is on, an extra tuple is emitted for each data
 * purifier whose filter result is {@code true}, using the column index
 * {@code (purifierIndex + 1) * size + columnIndex}.
 *
 * @param input the raw record to expand; may be {@code null}
 * @return the bag of generated tuples, or {@code null} when the record is skipped
 * @throws IOException propagated from the tuple accessors and helpers invoked here
 */
@SuppressWarnings("deprecation")
@Override
public DataBag exec(Tuple input) throws IOException {
    DataBag outputBag = BagFactory.getInstance().newDefaultBag();
    TupleFactory tuples = TupleFactory.getInstance();
    if (input == null) {
        return null;
    }

    final int size = input.size();
    boolean columnCountMismatch = (size == 0) || (size != this.columnConfigList.size());
    if (columnCountMismatch) {
        log.error("the input size - " + size + ", while column size - " + columnConfigList.size());
        // Tolerate a bounded number of malformed records; fail once the limit is exceeded.
        if (++this.mismatchCnt > MAX_MISMATCH_CNT) {
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }
        return null;
    }

    // Read the tag field once and reuse it for the null check, the log line and the parsing.
    Object rawTag = input.get(tagColumnNum);
    if (rawTag == null) {
        log.error("tagColumnNum is " + tagColumnNum + "; input size is " + size + "; columnConfigList.size() is " + columnConfigList.size() + "; tuple is" + input.toDelimitedString("|") + "; tag is " + rawTag);
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    String tag = CommonUtils.trimTag(rawTag.toString());
    // Linear targets need a numeric tag; every other target needs a tag from the known set.
    boolean acceptableTag = this.isLinearTarget ? NumberUtils.isNumber(tag) : super.tagSet.contains(tag);
    if (!acceptableTag) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    Double rate = modelConfig.getBinningSampleRate();
    boolean sampleNegativesOnly = !this.isLinearTarget && !modelConfig.isClassification()
            && modelConfig.isBinningSampleNegOnly();
    // In negative-only mode just the negative-tagged records take part in the random draw;
    // otherwise every record does.
    boolean takesPartInSampling = !sampleNegativesOnly || super.negTagSet.contains(tag);
    if (takesPartInSampling && random.nextDouble() > rate) {
        return null;
    }

    // Evaluate every expression purifier once per record rather than once per column.
    List<Boolean> filterResultList = null;
    if (this.isForExpressions) {
        filterResultList = new ArrayList<Boolean>();
        for (DataPurifier purifier : this.dataPurifiers) {
            filterResultList.add(purifier.isFilter(input));
        }
    }

    boolean isPositiveInst = modelConfig.isRegression() && super.posTagSet.contains(tag);
    for (int col = 0; col < size; col++) {
        ColumnConfig columnConfig = columnConfigList.get(col);
        if (isValidRecord(modelConfig.isRegression(), isPositiveInst, columnConfig)) {
            outputBag.add(buildTuple(input, tuples, tag, col, col));
            if (this.isForExpressions) {
                for (int p = 0; p < this.dataPurifiers.size(); p++) {
                    if (Boolean.TRUE.equals(filterResultList.get(p))) {
                        outputBag.add(buildTuple(input, tuples, tag, col, (p + 1) * size + col));
                    }
                }
            }
        }
    }
    return outputBag;
}

Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) DataBag(org.apache.pig.data.DataBag) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) TupleFactory(org.apache.pig.data.TupleFactory) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 2 with DataPurifier

use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

the class UpdateBinningInfoMapper method map.

/**
 * Processes one input line: skips blank, filtered-out, mis-sized or invalid-tag records, resolves
 * the record weight, and then updates the statistics of every column via {@code populateStats}.
 *
 * <p>When expression mode is on, statistics are additionally populated for each expression
 * purifier that accepts the line, under the column index
 * {@code (purifierIndex + 1) * units.length + columnIndex}.
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String valueStr = value.toString();
    // A string that is empty after trimming also covers the plain zero-length case.
    if (valueStr == null || valueStr.trim().isEmpty()) {
        LOG.warn("Empty input.");
        return;
    }

    context.getCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").increment(1L);

    boolean accepted = this.dataPurifier.isFilter(valueStr);
    if (!accepted) {
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT").increment(1L);
        return;
    }

    String[] units = CommonUtils.split(valueStr, this.dataSetDelimiter);
    // The field count must equal the column config size before fields are indexed below.
    if (units.length != this.columnConfigList.size()) {
        LOG.error("Data column length doesn't match with ColumnConfig size. Just skip.");
        return;
    }

    String tag = CommonUtils.trimTag(units[this.tagColumnNum]);
    boolean tagAccepted;
    if (modelConfig.isRegression()) {
        tagAccepted = tag != null && (posTags.contains(tag) || negTags.contains(tag));
    } else {
        tagAccepted = tag != null && (isLinearTarget || tags.contains(tag));
    }
    if (!tagAccepted) {
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
        return;
    }

    // Weight defaults to 1.0; an unparsable value keeps the default, a negative value is kept as
    // parsed. Both cases are counted as weight exceptions.
    Double weight = 1.0;
    boolean weightProblem;
    try {
        if (this.weightedColumnNum != -1) {
            weight = Double.valueOf(units[this.weightedColumnNum]);
        }
        weightProblem = weight < 0;
    } catch (NumberFormatException e) {
        weightProblem = true;
    }
    if (weightProblem) {
        weightExceptions += 1;
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "WEIGHT_EXCEPTION").increment(1L);
        if (weightExceptions > 5000 && this.isThrowforWeightException) {
            throw new IllegalStateException("Please check weight column in eval, exceptional weight count is over 5000");
        }
    }

    // Evaluate each expression purifier once per line, not once per column.
    List<Boolean> filterResults = null;
    if (this.isForExpressions) {
        filterResults = new ArrayList<Boolean>();
        for (DataPurifier purifier : this.expressionDataPurifiers) {
            filterResults.add(purifier.isFilter(valueStr));
        }
    }

    // Update statistics for every column of the valid record.
    for (int col = 0; col < units.length; col++) {
        populateStats(units, tag, weight, col, col);
        if (!this.isForExpressions) {
            continue;
        }
        for (int p = 0; p < this.expressionDataPurifiers.size(); p++) {
            if (Boolean.TRUE.equals(filterResults.get(p))) {
                populateStats(units, tag, weight, col, (p + 1) * units.length + col);
            }
        }
    }
}

Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier)

Example 3 with DataPurifier

use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

the class ShifuTestProcessor method runFilterTest.

/**
 * Runs the filter test for one eval set, provided its dataset declares a filter expression.
 *
 * @param evalConfig the eval configuration whose dataset filter is to be tested
 * @return {@code 0} when no filter expression is configured, otherwise the result of
 *         {@code doFilterTest}
 * @throws IOException propagated from {@code doFilterTest}
 */
private int runFilterTest(EvalConfig evalConfig) throws IOException {
    RawSourceData evalData = evalConfig.getDataSet();
    boolean hasFilterExpression = !StringUtils.isBlank(evalData.getFilterExpressions());
    if (hasFilterExpression) {
        LOG.info("Start to test the filter against eval `{}` dataset.", evalConfig.getName());
        DataPurifier purifier = new DataPurifier(evalConfig);
        return doFilterTest(purifier, evalData.getDataPath(), evalData.getSource());
    }
    LOG.warn("No filter expression set in eval-{} dataset. Skip it!", evalConfig.getName());
    return 0;
}

Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) RawSourceData(ml.shifu.shifu.container.obj.RawSourceData)

Example 4 with DataPurifier

use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

the class PostTrainMapper method setup.

/**
 * Mapper setup hook: initializes the fields this mapper uses, in order — configuration, data
 * purifier, output key/value holders, tag set, loaded models, headers, model runner,
 * multiple-outputs writer — and finally the feature statistics.
 *
 * @param context the task context; passed to {@code loadConfigFiles} and wrapped by
 *                {@code MultipleOutputs}
 * @throws IOException declared by the overridden method; may be raised by the helpers called here
 * @throws InterruptedException declared by the overridden method
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // Presumably populates modelConfig / columnConfigList, which every step below reads —
    // confirm against the helper. Keep this call first.
    loadConfigFiles(context);
    loadTagWeightNum();
    // NOTE(review): the meaning of the `false` flag is not visible from here; check the
    // DataPurifier(ModelConfig, boolean) constructor.
    this.dataPurifier = new DataPurifier(this.modelConfig, false);
    // Output key/value holders are created once here.
    this.outputKey = new IntWritable();
    this.outputValue = new Text();
    this.tags = new HashSet<String>(modelConfig.getFlattenTags());
    // Models are loaded using the source type of the configured dataset.
    SourceType sourceType = this.modelConfig.getDataSet().getSource();
    List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, null, sourceType);
    this.headers = CommonUtils.getFinalHeaders(modelConfig);
    this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, modelConfig.getDataSetDelimiter(), models);
    // The raw cast to TaskInputOutputContext is what the rawtypes/unchecked suppression covers.
    this.mos = new MultipleOutputs<NullWritable, Text>((TaskInputOutputContext) context);
    this.initFeatureStats();
}

Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) TaskInputOutputContext(org.apache.hadoop.mapreduce.TaskInputOutputContext) Text(org.apache.hadoop.io.Text) BasicML(org.encog.ml.BasicML) NullWritable(org.apache.hadoop.io.NullWritable) IntWritable(org.apache.hadoop.io.IntWritable) ModelRunner(ml.shifu.shifu.core.ModelRunner)

Example 5 with DataPurifier

use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

the class FeatureImportanceMapper method setup.

/**
 * Mapper setup hook: initializes the fields this mapper uses, in order — configuration, data
 * purifier, output key/value holders, tag set, headers — and finally the feature statistics.
 *
 * @param context the task context; passed to {@code loadConfigFiles}
 * @throws IOException declared by the overridden method; may be raised by the helpers called here
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // Presumably populates modelConfig, which the steps below read — confirm against the
    // helper. Keep this call first.
    loadConfigFiles(context);
    loadTagWeightNum();
    // NOTE(review): the meaning of the `false` flag is not visible from here; check the
    // DataPurifier(ModelConfig, boolean) constructor.
    this.dataPurifier = new DataPurifier(this.modelConfig, false);
    // Output key/value holders are created once here.
    this.outputKey = new IntWritable();
    this.outputValue = new DoubleWritable();
    this.tags = new HashSet<String>(modelConfig.getFlattenTags());
    this.headers = CommonUtils.getFinalHeaders(modelConfig);
    this.initFeatureStats();
}

Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) DoubleWritable(org.apache.hadoop.io.DoubleWritable) IntWritable(org.apache.hadoop.io.IntWritable)

Aggregations

DataPurifier (ml.shifu.shifu.core.DataPurifier): 11, IntWritable (org.apache.hadoop.io.IntWritable): 5, ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 4, HashMap (java.util.HashMap): 3, RawSourceData (ml.shifu.shifu.container.obj.RawSourceData): 2, IOException (java.io.IOException): 1, InvocationTargetException (java.lang.reflect.InvocationTargetException): 1, ArrayList (java.util.ArrayList): 1, List (java.util.List): 1, Map (java.util.Map): 1, Properties (java.util.Properties): 1, ModelConfig (ml.shifu.shifu.container.obj.ModelConfig): 1, ModelSourceDataConf (ml.shifu.shifu.container.obj.ModelSourceDataConf): 1, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 1, ModelRunner (ml.shifu.shifu.core.ModelRunner): 1, CountAndFrequentItems (ml.shifu.shifu.core.autotype.AutoTypeDistinctCountMapper.CountAndFrequentItems): 1, TrainingDataSet (ml.shifu.shifu.core.dvarsel.dataset.TrainingDataSet): 1, ShifuException (ml.shifu.shifu.exception.ShifuException): 1, DoubleWritable (org.apache.hadoop.io.DoubleWritable): 1, NullWritable (org.apache.hadoop.io.NullWritable): 1