Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
Class VarSelWorker, method init.
/**
 * Initializes this worker from the properties carried by the worker context.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>Load the model config and the column config list using the property values of
 * {@code SHIFU_MODEL_CONFIG} and {@code SHIFU_COLUMN_CONFIG}; the source type defaults to HDFS.</li>
 * <li>Reflectively create the worker conductor named by {@code VAR_SEL_WORKER_CONDUCTOR} through its
 * {@code (ModelConfig, List)} constructor.</li>
 * <li>Derive the input/output node counts, create the training data set and the data purifier.</li>
 * <li>Resolve the target column id and, if a weight column name is configured, the weight column id.</li>
 * </ol>
 *
 * @param workerContext
 *            context supplying the configuration properties
 * @throws RuntimeException
 *             if a config cannot be loaded, the conductor class is missing or cannot be instantiated, or the
 *             data purifier cannot be created; the original exception is kept as the cause
 */
@Override
public void init(WorkerContext<VarSelMasterResult, VarSelWorkerResult> workerContext) {
    Properties props = workerContext.getProps();
    try {
        RawSourceData.SourceType sourceType = RawSourceData.SourceType.valueOf(props.getProperty(
                CommonConstants.MODELSET_SOURCE_TYPE, RawSourceData.SourceType.HDFS.toString()));
        this.modelConfig = CommonUtils.loadModelConfig(props.getProperty(CommonConstants.SHIFU_MODEL_CONFIG),
                sourceType);
        this.columnConfigList = CommonUtils.loadColumnConfigList(
                props.getProperty(CommonConstants.SHIFU_COLUMN_CONFIG), sourceType);

        String conductorClsName = props.getProperty(Constants.VAR_SEL_WORKER_CONDUCTOR);
        if (StringUtils.isBlank(conductorClsName)) {
            // Fail with a readable message instead of the bare NullPointerException Class.forName(null) raises.
            throw new RuntimeException("Worker Conductor class is not configured, property: "
                    + Constants.VAR_SEL_WORKER_CONDUCTOR);
        }
        this.workerConductor = (AbstractWorkerConductor) Class.forName(conductorClsName)
                .getDeclaredConstructor(ModelConfig.class, List.class)
                .newInstance(this.modelConfig, this.columnConfigList);
    } catch (IOException e) {
        throw new RuntimeException("Fail to load ModelConfig or List<ColumnConfig>", e);
    } catch (ClassNotFoundException e) {
        // This is the worker side: the class being resolved is the worker conductor.
        throw new RuntimeException("Invalid Worker Conductor class", e);
    } catch (InstantiationException e) {
        throw new RuntimeException("Fail to create instance", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Illegal access when creating instance", e);
    } catch (NoSuchMethodException e) {
        throw new RuntimeException("Fail to call method when creating instance", e);
    } catch (InvocationTargetException e) {
        throw new RuntimeException("Fail to invoke when creating instance", e);
    }

    List<Integer> normalizedColumnIdList = this.getNormalizedColumnIdList();
    this.inputNodeCount = normalizedColumnIdList.size();
    this.outputNodeCount = this.getTargetColumnCount();
    trainingDataSet = new TrainingDataSet(normalizedColumnIdList);

    try {
        dataPurifier = new DataPurifier(modelConfig, false);
    } catch (IOException e) {
        throw new RuntimeException("Fail to create DataPurifier", e);
    }

    this.targetColumnId = CommonUtils.getTargetColumnNum(this.columnConfigList);

    if (StringUtils.isNotBlank(modelConfig.getWeightColumnName())) {
        // The trimmed name does not change per column, so compute it once.
        String weightColumnName = modelConfig.getWeightColumnName().trim();
        for (ColumnConfig columnConfig : columnConfigList) {
            if (columnConfig.getColumnName().equalsIgnoreCase(weightColumnName)) {
                this.weightColumnId = columnConfig.getColumnNum();
                break;
            }
        }
    }
}
Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
Class FastCorrelationMapper, method setup.
/**
 * Prepares the mapper state before any record is processed.
 *
 * <p>Loads the config files, creates the data purifier, reads the "compute all" switch from the job
 * configuration, builds for every categorical column a lookup from category value to the index of the bin
 * category group it belongs to, and copies the configured tag collections into sets.
 *
 * @param context
 *            the mapper context giving access to the job configuration
 * @throws IOException
 *             declared by the signature; may be raised by config loading or purifier creation
 * @throws InterruptedException
 *             declared by the signature
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    loadConfigFiles(context);

    this.dataSetDelimiter = modelConfig.getDataSetDelimiter();
    this.dataPurifier = new DataPurifier(modelConfig, false);

    String computeAllFlag = context.getConfiguration().get(Constants.SHIFU_CORRELATION_COMPUTE_ALL, "false");
    this.isComputeAll = Boolean.parseBoolean(computeAllFlag);

    this.outputKey = new IntWritable();
    this.correlationMap = new HashMap<Integer, CorrelationWritable>();

    for (ColumnConfig columnConfig : columnConfigList) {
        if (!columnConfig.isCategorical()) {
            continue;
        }
        // category value -> index of the bin category group that contains it
        Map<String, Integer> valueToBinIndex = new HashMap<String, Integer>();
        if (columnConfig.getBinCategory() != null) {
            int groupCount = columnConfig.getBinCategory().size();
            for (int binIndex = 0; binIndex < groupCount; binIndex++) {
                for (String value : CommonUtils.flattenCatValGrp(columnConfig.getBinCategory().get(binIndex))) {
                    valueToBinIndex.put(value, binIndex);
                }
            }
        }
        this.categoricalIndexMap.put(columnConfig.getColumnNum(), valueToBinIndex);
    }

    // modelConfig has already been dereferenced above, so only the tag collections can still be null here.
    if (modelConfig.getPosTags() != null) {
        this.posTagSet = new HashSet<String>(modelConfig.getPosTags());
    }
    if (modelConfig.getNegTags() != null) {
        this.negTagSet = new HashSet<String>(modelConfig.getNegTags());
    }
    if (modelConfig.getFlattenTags() != null) {
        this.tagSet = new HashSet<String>(modelConfig.getFlattenTags());
    }
    this.tags = modelConfig.getSetTags();
}
Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
Class UpdateBinningInfoMapper, method setup.
/**
 * Initialization for column statistics in mapper.
 *
 * <p>Loads the config files, creates the default data purifier and, when filter expressions are present in
 * the job configuration, one additional purifier per expression. It then loads weight and tag-weight
 * information, allocates the per-column binning containers, creates the output splitter, loads the column
 * binning info and copies the tag and missing-value collections into sets.
 *
 * <p>Positive, negative and flattened tag lists that are {@code null} are treated as empty, so a model
 * config without tags does not fail here before {@code isLinearTarget} is evaluated.
 *
 * @param context
 *            the mapper context giving access to the job configuration
 * @throws IOException
 *             declared by the signature; may be raised by config loading or purifier creation
 * @throws InterruptedException
 *             declared by the signature
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    loadConfigFiles(context);

    this.dataSetDelimiter = this.modelConfig.getDataSetDelimiter();

    this.dataPurifier = new DataPurifier(this.modelConfig, false);

    String filterExpressions = context.getConfiguration().get(Constants.SHIFU_STATS_FILTER_EXPRESSIONS);
    if (StringUtils.isNotBlank(filterExpressions)) {
        this.isForExpressions = true;
        String[] splits = CommonUtils.split(filterExpressions, Constants.SHIFU_STATS_FILTER_EXPRESSIONS_DELIMETER);
        // one purifier per configured filter expression
        this.expressionDataPurifiers = new ArrayList<DataPurifier>(splits.length);
        for (String split : splits) {
            this.expressionDataPurifiers.add(new DataPurifier(modelConfig, split, false));
        }
    }

    loadWeightColumnNum();

    loadTagWeightNum();

    this.columnBinningInfo = new HashMap<Integer, BinningInfoWritable>(this.columnConfigList.size(), 1f);
    this.categoricalBinMap = new HashMap<Integer, Map<String, Integer>>(this.columnConfigList.size(), 1f);

    // create Splitter
    String delimiter = context.getConfiguration().get(Constants.SHIFU_OUTPUT_DATA_DELIMITER);
    this.splitter = MapReduceUtils.generateShifuOutputSplitter(delimiter);

    loadColumnBinningInfo();

    this.outputKey = new IntWritable();

    this.variableCountMap = new HashMap<Integer, CountAndFrequentItems>();

    // Tag lists may be absent (e.g. when no tags are configured); fall back to empty sets instead of
    // letting the HashSet copy constructor throw a NullPointerException.
    this.posTags = modelConfig.getPosTags() == null ? new HashSet<String>()
            : new HashSet<String>(modelConfig.getPosTags());
    this.negTags = modelConfig.getNegTags() == null ? new HashSet<String>()
            : new HashSet<String>(modelConfig.getNegTags());
    this.tags = modelConfig.getFlattenTags() == null ? new HashSet<String>()
            : new HashSet<String>(modelConfig.getFlattenTags());

    this.missingOrInvalidValues = new HashSet<String>(this.modelConfig.getDataSet().getMissingOrInvalidValues());

    this.isThrowforWeightException = "true"
            .equalsIgnoreCase(context.getConfiguration().get("shifu.weight.exception", "false"));

    LOG.debug("Column binning info: {}", this.columnBinningInfo);

    // linear target: no tags configured and the target column is numerical
    this.isLinearTarget = (CollectionUtils.isEmpty(modelConfig.getTags())
            && CommonUtils.getTargetColumnConfig(columnConfigList).isNumerical());
}
Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
Class AutoTypeDistinctCountMapper, method setup.
/**
 * Prepares the mapper state before any record is processed: loads the config files, creates the data
 * purifier, loads tag-weight information and allocates the count map, the output key and the tag and
 * missing-value sets.
 *
 * @param context
 *            the mapper context
 * @throws IOException
 *             declared by the signature; may be raised by config loading or purifier creation
 * @throws InterruptedException
 *             declared by the signature
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    loadConfigFiles(context);

    this.dataPurifier = new DataPurifier(this.modelConfig, false);

    loadTagWeightNum();

    this.variableCountMap = new HashMap<Integer, CountAndFrequentItems>();
    this.outputKey = new IntWritable();

    // snapshot of all configured tags
    HashSet<String> flattenedTags = new HashSet<String>(modelConfig.getFlattenTags());
    this.tags = flattenedTags;

    // values the data set config declares as missing or invalid
    HashSet<String> missingValues = new HashSet<String>(
            this.modelConfig.getDataSet().getMissingOrInvalidValues());
    this.missingOrInvalidValues = missingValues;
}
Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.
Class CorrelationMapper, method setup.
/**
 * Prepares the mapper state before any record is processed.
 *
 * <p>Loads the config files, creates the data purifier, reads the "compute all" switch from the job
 * configuration, builds for every categorical column a lookup from category value to the index of the bin
 * category group it belongs to, and copies the configured tag collections into sets.
 *
 * @param context
 *            the mapper context giving access to the job configuration
 * @throws IOException
 *             declared by the signature; may be raised by config loading or purifier creation
 * @throws InterruptedException
 *             declared by the signature
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    loadConfigFiles(context);

    this.dataSetDelimiter = modelConfig.getDataSetDelimiter();
    this.dataPurifier = new DataPurifier(modelConfig, false);

    String computeAllFlag = context.getConfiguration().get(Constants.SHIFU_CORRELATION_COMPUTE_ALL, "false");
    this.isComputeAll = Boolean.parseBoolean(computeAllFlag);

    for (ColumnConfig columnConfig : columnConfigList) {
        if (!columnConfig.isCategorical()) {
            continue;
        }
        // category value -> index of the bin category group that contains it
        Map<String, Integer> valueToBinIndex = new HashMap<String, Integer>();
        if (columnConfig.getBinCategory() != null) {
            int groupCount = columnConfig.getBinCategory().size();
            for (int binIndex = 0; binIndex < groupCount; binIndex++) {
                for (String value : CommonUtils.flattenCatValGrp(columnConfig.getBinCategory().get(binIndex))) {
                    valueToBinIndex.put(value, binIndex);
                }
            }
        }
        this.categoricalIndexMap.put(columnConfig.getColumnNum(), valueToBinIndex);
    }

    // modelConfig has already been dereferenced above, so only the tag collections can still be null here.
    if (modelConfig.getPosTags() != null) {
        this.posTagSet = new HashSet<String>(modelConfig.getPosTags());
    }
    if (modelConfig.getNegTags() != null) {
        this.negTagSet = new HashSet<String>(modelConfig.getNegTags());
    }
    if (modelConfig.getFlattenTags() != null) {
        this.tagSet = new HashSet<String>(modelConfig.getFlattenTags());
    }
    this.tags = modelConfig.getSetTags();
}
Aggregations