use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
the class AddColumnNumAndFilterUDF method exec.
/**
 * Expands one input record into a bag of per-column tuples.
 *
 * <p>The record is rejected ({@code null} is returned) when:
 * <ul>
 * <li>{@code input} is {@code null};</li>
 * <li>the tuple is empty or its size differs from {@code columnConfigList.size()};</li>
 * <li>the tag field is {@code null}, is not numeric for a linear target, or is not in
 * {@code tagSet} otherwise (the {@code INVALID_TAG} counter is incremented when
 * {@code isPigEnabled} allows it);</li>
 * <li>the record is dropped by random sampling against the binning sample rate.</li>
 * </ul>
 *
 * <p>For an accepted record, one tuple is added for every column index {@code i} that passes
 * {@code isValidRecord}. When {@code isForExpressions} is set, an additional tuple is added for
 * each purifier {@code j} whose {@code isFilter(input)} result is {@code true}, using
 * {@code (j + 1) * size + i} as the last argument to {@code buildTuple}.
 *
 * @param input
 *            the raw record, one field per configured column
 * @return the bag of generated tuples, or {@code null} when the record is rejected
 * @throws IOException
 *             declared for failures propagated from the calls made here (e.g. tuple field access)
 * @throws ShifuException
 *             when more than {@code MAX_MISMATCH_CNT} records had a mismatching size
 */
@SuppressWarnings("deprecation")
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null) {
        return null;
    }

    final int size = input.size();
    if (size == 0 || size != this.columnConfigList.size()) {
        log.error("the input size - " + size + ", while column size - " + columnConfigList.size());
        this.mismatchCnt++;
        // Tolerate a bounded number of malformed records before failing.
        if (this.mismatchCnt > MAX_MISMATCH_CNT) {
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }
        return null;
    }

    // Read the tag field once; every tag rejection below funnels into a single counter update.
    final Object rawTag = input.get(tagColumnNum);
    String tag = null;
    boolean tagAccepted;
    if (rawTag == null) {
        log.error("tagColumnNum is " + tagColumnNum + "; input size is " + size + "; columnConfigList.size() is " + columnConfigList.size() + "; tuple is" + input.toDelimitedString("|") + "; tag is " + rawTag);
        tagAccepted = false;
    } else {
        tag = CommonUtils.trimTag(rawTag.toString());
        tagAccepted = this.isLinearTarget ? NumberUtils.isNumber(tag) : super.tagSet.contains(tag);
    }
    if (!tagAccepted) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }

    // Sampling: in "negative only" mode a random number is drawn only for negative-tag records;
    // otherwise it is drawn for every record.
    Double rate = modelConfig.getBinningSampleRate();
    final boolean sampleNegOnly = !this.isLinearTarget && !modelConfig.isClassification()
            && modelConfig.isBinningSampleNegOnly();
    final boolean subjectToSampling = !sampleNegOnly || super.negTagSet.contains(tag);
    if (subjectToSampling && random.nextDouble() > rate) {
        return null;
    }

    // Evaluate each expression purifier once per record rather than once per column.
    List<Boolean> filterResultList = null;
    if (this.isForExpressions) {
        filterResultList = new ArrayList<Boolean>(this.dataPurifiers.size());
        for (DataPurifier dataPurifier : this.dataPurifiers) {
            filterResultList.add(dataPurifier.isFilter(input));
        }
    }

    // Output containers are only obtained once the record is known to be kept.
    final DataBag bag = BagFactory.getInstance().newDefaultBag();
    final TupleFactory tupleFactory = TupleFactory.getInstance();

    final boolean regression = modelConfig.isRegression();
    final boolean isPositiveInst = regression && super.posTagSet.contains(tag);
    for (int i = 0; i < size; i++) {
        ColumnConfig config = columnConfigList.get(i);
        if (!isValidRecord(regression, isPositiveInst, config)) {
            continue;
        }
        bag.add(buildTuple(input, tupleFactory, tag, i, i));
        if (filterResultList != null) {
            for (int j = 0; j < filterResultList.size(); j++) {
                Boolean isFilter = filterResultList.get(j);
                if (isFilter != null && isFilter) {
                    // Offset by (j + 1) * size so each expression gets its own index range.
                    bag.add(buildTuple(input, tupleFactory, tag, i, (j + 1) * size + i));
                }
            }
        }
    }
    return bag;
}
use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
the class UpdateBinningInfoMapper method map.
/**
 * Processes one input line in three stages: (1) reject blank, filtered-out, mis-sized or
 * invalid-tag lines, (2) parse the optional weight column and count weight problems,
 * (3) call {@code populateStats} for every column, plus once more per matching expression
 * purifier when {@code isForExpressions} is set.
 *
 * @param key
 *            the input key (not used by this method)
 * @param value
 *            the raw text line
 * @param context
 *            the task context, used here for counters
 * @throws IllegalStateException
 *             when more than 5000 weight problems were seen and
 *             {@code isThrowforWeightException} is set
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    final String line = value.toString();
    if (line == null || line.trim().isEmpty()) {
        LOG.warn("Empty input.");
        return;
    }

    // Counted for every non-blank line, before any filtering takes place.
    context.getCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT").increment(1L);

    final boolean keptByPurifier = this.dataPurifier.isFilter(line);
    if (!keptByPurifier) {
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT").increment(1L);
        return;
    }

    final String[] units = CommonUtils.split(line, this.dataSetDelimiter);
    final int columnCount = units.length;
    // The length check also guarantees the tag index used below is inside the array.
    if (columnCount != this.columnConfigList.size()) {
        LOG.error("Data column length doesn't match with ColumnConfig size. Just skip.");
        return;
    }

    final String tag = CommonUtils.trimTag(units[this.tagColumnNum]);
    final boolean regression = modelConfig.isRegression();
    // Regression: tag must be a known positive or negative tag.
    // Otherwise: any tag is fine for a linear target, else it must be in the tag set.
    final boolean tagAccepted = tag != null
            && (regression ? (posTags.contains(tag) || negTags.contains(tag))
                    : (isLinearTarget || tags.contains(tag)));
    if (!tagAccepted) {
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
        return;
    }

    // Weight defaults to 1.0 when no weight column is configured or the value cannot be parsed.
    Double weight = 1.0;
    boolean weightProblem;
    try {
        if (this.weightedColumnNum != -1) {
            weight = Double.valueOf(units[this.weightedColumnNum]);
        }
        weightProblem = weight < 0;
    } catch (NumberFormatException e) {
        weightProblem = true;
    }
    if (weightProblem) {
        // Negative and unparsable weights are tallied the same way.
        weightExceptions += 1;
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "WEIGHT_EXCEPTION").increment(1L);
        if (weightExceptions > 5000 && this.isThrowforWeightException) {
            throw new IllegalStateException("Please check weight column in eval, exceptional weight count is over 5000");
        }
    }

    // Evaluate every expression purifier once for the whole line.
    List<Boolean> filterResults = null;
    if (this.isForExpressions) {
        filterResults = new ArrayList<Boolean>();
        for (DataPurifier purifier : this.expressionDataPurifiers) {
            filterResults.add(purifier.isFilter(line));
        }
    }

    // Statistics for the plain columns, then for each expression the line satisfies.
    for (int col = 0; col < columnCount; col++) {
        populateStats(units, tag, weight, col, col);
        if (!this.isForExpressions) {
            continue;
        }
        for (int expr = 0; expr < this.expressionDataPurifiers.size(); expr++) {
            final Boolean matched = filterResults.get(expr);
            if (matched != null && matched) {
                populateStats(units, tag, weight, col, (expr + 1) * columnCount + col);
            }
        }
    }
}
use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
the class ShifuTestProcessor method runFilterTest.
/**
 * Runs the filter test for one eval set, provided its dataset declares filter expressions.
 *
 * @param evalConfig
 *            the eval configuration whose dataset filter is to be tested
 * @return {@code 0} when no filter expression is set; otherwise whatever {@code doFilterTest}
 *         returns
 * @throws IOException
 *             propagated from the calls made here
 */
private int runFilterTest(EvalConfig evalConfig) throws IOException {
    final RawSourceData evalData = evalConfig.getDataSet();
    final boolean hasFilter = !StringUtils.isBlank(evalData.getFilterExpressions());

    if (hasFilter) {
        LOG.info("Start to test the filter against eval `{}` dataset.", evalConfig.getName());
        return doFilterTest(new DataPurifier(evalConfig), evalData.getDataPath(), evalData.getSource());
    }

    // Nothing to test for this eval set.
    LOG.warn("No filter expression set in eval-{} dataset. Skip it!", evalConfig.getName());
    return 0;
}
use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
the class PostTrainMapper method setup.
/**
 * One-time mapper initialisation: loads configuration, creates the data purifier and the
 * reusable output key/value, collects the flattened tags, loads the models, builds the model
 * runner and the multiple-outputs writer, and finally initialises the feature statistics.
 *
 * @param context
 *            the task context handed to {@code loadConfigFiles} and {@code MultipleOutputs}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // NOTE(review): presumably these populate modelConfig / columnConfigList used below - confirm.
    loadConfigFiles(context);
    loadTagWeightNum();

    this.dataPurifier = new DataPurifier(this.modelConfig, false);

    // Output holders are created once here.
    this.outputKey = new IntWritable();
    this.outputValue = new Text();

    this.tags = new HashSet<String>(this.modelConfig.getFlattenTags());

    // Models are loaded from the same source type as the training data set.
    final List<BasicML> trainedModels = ModelSpecLoaderUtils.loadBasicModels(this.modelConfig, null,
            this.modelConfig.getDataSet().getSource());

    this.headers = CommonUtils.getFinalHeaders(this.modelConfig);
    this.modelRunner = new ModelRunner(this.modelConfig, this.columnConfigList, this.headers,
            this.modelConfig.getDataSetDelimiter(), trainedModels);

    // The raw cast is why the unchecked/rawtypes warnings are suppressed on this method.
    this.mos = new MultipleOutputs<NullWritable, Text>((TaskInputOutputContext) context);

    this.initFeatureStats();
}
use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
the class FeatureImportanceMapper method setup.
/**
 * One-time mapper initialisation: loads configuration, creates the data purifier and the
 * reusable output key/value, collects the flattened tags, resolves the final headers and
 * initialises the feature statistics.
 *
 * @param context
 *            the task context handed to {@code loadConfigFiles}
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // NOTE(review): presumably these populate modelConfig used below - confirm.
    loadConfigFiles(context);
    loadTagWeightNum();

    this.dataPurifier = new DataPurifier(this.modelConfig, false);

    // Output holders are created once here.
    this.outputKey = new IntWritable();
    this.outputValue = new DoubleWritable();

    this.tags = new HashSet<String>(this.modelConfig.getFlattenTags());
    this.headers = CommonUtils.getFinalHeaders(this.modelConfig);

    this.initFeatureStats();
}
Aggregations