Search in sources:

Example 6 with DataPurifier

Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

The class VarSelWorker, method init.

/**
 * Initializes this worker from the properties carried by the worker context.
 *
 * <p>Loads the model config and the column config list, reflectively instantiates the worker conductor
 * named by the {@code Constants.VAR_SEL_WORKER_CONDUCTOR} property (via its
 * {@code (ModelConfig, List)} constructor), derives the input/output node counts, creates the training
 * data set and the data purifier, and resolves the target column id and, when a weight column name is
 * configured, the weight column id.
 *
 * @param workerContext context whose properties supply the config locations, the source type and the
 *        conductor class name
 * @throws IllegalStateException if no worker conductor class name is configured
 * @throws RuntimeException if the configs cannot be loaded, the conductor cannot be found or
 *         instantiated, or the data purifier cannot be created
 */
@Override
public void init(WorkerContext<VarSelMasterResult, VarSelWorkerResult> workerContext) {
    Properties props = workerContext.getProps();
    try {
        // Source type falls back to HDFS when the property is absent.
        RawSourceData.SourceType sourceType = RawSourceData.SourceType.valueOf(props.getProperty(CommonConstants.MODELSET_SOURCE_TYPE, RawSourceData.SourceType.HDFS.toString()));
        this.modelConfig = CommonUtils.loadModelConfig(props.getProperty(CommonConstants.SHIFU_MODEL_CONFIG), sourceType);
        this.columnConfigList = CommonUtils.loadColumnConfigList(props.getProperty(CommonConstants.SHIFU_COLUMN_CONFIG), sourceType);
        String conductorClsName = props.getProperty(Constants.VAR_SEL_WORKER_CONDUCTOR);
        if (conductorClsName == null) {
            // Fail with an explicit message instead of the bare NullPointerException Class.forName(null) would raise.
            throw new IllegalStateException("Worker conductor class is not configured, missing property: " + Constants.VAR_SEL_WORKER_CONDUCTOR);
        }
        this.workerConductor = (AbstractWorkerConductor) Class.forName(conductorClsName).getDeclaredConstructor(ModelConfig.class, List.class).newInstance(this.modelConfig, this.columnConfigList);
    } catch (IOException e) {
        throw new RuntimeException("Fail to load ModelConfig or List<ColumnConfig>", e);
    } catch (ClassNotFoundException e) {
        // This is the worker side, so report the worker conductor (the message previously said "Master").
        throw new RuntimeException("Invalid Worker Conductor class", e);
    } catch (InstantiationException e) {
        throw new RuntimeException("Fail to create instance", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Illegal access when creating instance", e);
    } catch (NoSuchMethodException e) {
        throw new RuntimeException("Fail to call method when creating instance", e);
    } catch (InvocationTargetException e) {
        throw new RuntimeException("Fail to invoke when creating instance", e);
    }
    List<Integer> normalizedColumnIdList = this.getNormalizedColumnIdList();
    this.inputNodeCount = normalizedColumnIdList.size();
    this.outputNodeCount = this.getTargetColumnCount();
    trainingDataSet = new TrainingDataSet(normalizedColumnIdList);
    try {
        dataPurifier = new DataPurifier(modelConfig, false);
    } catch (IOException e) {
        throw new RuntimeException("Fail to create DataPurifier", e);
    }
    this.targetColumnId = CommonUtils.getTargetColumnNum(this.columnConfigList);
    // Resolve the weight column id only when a weight column name is configured; the name is
    // fetched and trimmed once rather than on every loop iteration.
    String weightColumnName = modelConfig.getWeightColumnName();
    if (StringUtils.isNotBlank(weightColumnName)) {
        String trimmedWeightColumnName = weightColumnName.trim();
        for (ColumnConfig columnConfig : columnConfigList) {
            if (columnConfig.getColumnName().equalsIgnoreCase(trimmedWeightColumnName)) {
                this.weightColumnId = columnConfig.getColumnNum();
                break;
            }
        }
    }
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) IOException(java.io.IOException) Properties(java.util.Properties) RawSourceData(ml.shifu.shifu.container.obj.RawSourceData) InvocationTargetException(java.lang.reflect.InvocationTargetException) ModelConfig(ml.shifu.shifu.container.obj.ModelConfig) DataPurifier(ml.shifu.shifu.core.DataPurifier) TrainingDataSet(ml.shifu.shifu.core.dvarsel.dataset.TrainingDataSet) ArrayList(java.util.ArrayList) List(java.util.List)

Example 7 with DataPurifier

Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

The class FastCorrelationMapper, method setup.

/**
 * Prepares mapper state before records are processed.
 *
 * <p>Calls {@code loadConfigFiles(context)} (presumably populating {@code modelConfig} and
 * {@code columnConfigList} - confirm against that helper), reads the data set delimiter, creates the
 * data purifier, reads the "compute all" flag from the job configuration (default {@code "false"}),
 * allocates the output key and correlation map, builds a value-to-bin-index map for every categorical
 * column, and copies the positive/negative/flattened tag collections into sets.
 *
 * @param context job context supplying the configuration
 * @throws IOException declared by the overridden method; may be raised by config loading or purifier creation
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    loadConfigFiles(context);
    this.dataSetDelimiter = modelConfig.getDataSetDelimiter();
    this.dataPurifier = new DataPurifier(modelConfig, false);
    // parseBoolean yields the primitive directly, avoiding a needless Boolean box.
    this.isComputeAll = Boolean.parseBoolean(context.getConfiguration().get(Constants.SHIFU_CORRELATION_COMPUTE_ALL, "false"));
    this.outputKey = new IntWritable();
    this.correlationMap = new HashMap<Integer, CorrelationWritable>();
    for (ColumnConfig config : columnConfigList) {
        if (config.isCategorical()) {
            // Every categorical column gets a map, even an empty one when it has no bin categories.
            Map<String, Integer> map = new HashMap<String, Integer>();
            List<String> binCategory = config.getBinCategory();
            if (binCategory != null) {
                for (int i = 0; i < binCategory.size(); i++) {
                    // A bin entry may group several category values; each maps to the same bin index.
                    List<String> cvals = CommonUtils.flattenCatValGrp(binCategory.get(i));
                    for (String cval : cvals) {
                        map.put(cval, i);
                    }
                }
            }
            this.categoricalIndexMap.put(config.getColumnNum(), map);
        }
    }
    // modelConfig was already dereferenced above, so it cannot be null here;
    // only the individual tag collections still need a null check.
    if (modelConfig.getPosTags() != null) {
        this.posTagSet = new HashSet<String>(modelConfig.getPosTags());
    }
    if (modelConfig.getNegTags() != null) {
        this.negTagSet = new HashSet<String>(modelConfig.getNegTags());
    }
    if (modelConfig.getFlattenTags() != null) {
        this.tagSet = new HashSet<String>(modelConfig.getFlattenTags());
    }
    this.tags = modelConfig.getSetTags();
}
Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) HashMap(java.util.HashMap) IntWritable(org.apache.hadoop.io.IntWritable)

Example 8 with DataPurifier

Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

The class UpdateBinningInfoMapper, method setup.

/**
 * Initialization for column statistics in mapper.
 *
 * <p>Calls {@code loadConfigFiles(context)} (presumably populating {@code modelConfig} and
 * {@code columnConfigList} - confirm against that helper), creates the default data purifier and, when
 * filter expressions are configured, one additional purifier per expression. It then loads weight and
 * tag-weight information, allocates the per-column binning maps, creates the output splitter, loads
 * the column binning info and copies the tag and missing-value collections into sets.
 *
 * <p>A {@code null} positive/negative/flattened tag collection results in an empty set rather than a
 * {@link NullPointerException}.
 *
 * @param context job context supplying the configuration
 * @throws IOException declared by the overridden method; may be raised by config loading or purifier creation
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    loadConfigFiles(context);
    this.dataSetDelimiter = this.modelConfig.getDataSetDelimiter();
    this.dataPurifier = new DataPurifier(this.modelConfig, false);
    String filterExpressions = context.getConfiguration().get(Constants.SHIFU_STATS_FILTER_EXPRESSIONS);
    if (StringUtils.isNotBlank(filterExpressions)) {
        // One extra purifier per configured filter expression.
        this.isForExpressions = true;
        String[] splits = CommonUtils.split(filterExpressions, Constants.SHIFU_STATS_FILTER_EXPRESSIONS_DELIMETER);
        this.expressionDataPurifiers = new ArrayList<DataPurifier>(splits.length);
        for (String split : splits) {
            this.expressionDataPurifiers.add(new DataPurifier(this.modelConfig, split, false));
        }
    }
    loadWeightColumnNum();
    loadTagWeightNum();
    // Sized to the column count with load factor 1 so the maps need not rehash while being filled.
    this.columnBinningInfo = new HashMap<Integer, BinningInfoWritable>(this.columnConfigList.size(), 1f);
    this.categoricalBinMap = new HashMap<Integer, Map<String, Integer>>(this.columnConfigList.size(), 1f);
    // create Splitter
    String delimiter = context.getConfiguration().get(Constants.SHIFU_OUTPUT_DATA_DELIMITER);
    this.splitter = MapReduceUtils.generateShifuOutputSplitter(delimiter);
    loadColumnBinningInfo();
    this.outputKey = new IntWritable();
    this.variableCountMap = new HashMap<Integer, CountAndFrequentItems>();
    // Tag collections may be null (the linear-target check below already treats tags as optional),
    // so fall back to empty sets instead of failing inside the HashSet copy constructor.
    this.posTags = new HashSet<String>();
    if (this.modelConfig.getPosTags() != null) {
        this.posTags.addAll(this.modelConfig.getPosTags());
    }
    this.negTags = new HashSet<String>();
    if (this.modelConfig.getNegTags() != null) {
        this.negTags.addAll(this.modelConfig.getNegTags());
    }
    this.tags = new HashSet<String>();
    if (this.modelConfig.getFlattenTags() != null) {
        this.tags.addAll(this.modelConfig.getFlattenTags());
    }
    this.missingOrInvalidValues = new HashSet<String>(this.modelConfig.getDataSet().getMissingOrInvalidValues());
    this.isThrowforWeightException = "true".equalsIgnoreCase(context.getConfiguration().get("shifu.weight.exception", "false"));
    LOG.debug("Column binning info: {}", this.columnBinningInfo);
    // Linear target: no tags configured and the target column is numerical.
    this.isLinearTarget = (CollectionUtils.isEmpty(this.modelConfig.getTags()) && CommonUtils.getTargetColumnConfig(this.columnConfigList).isNumerical());
}
Also used : CountAndFrequentItems(ml.shifu.shifu.core.autotype.AutoTypeDistinctCountMapper.CountAndFrequentItems) DataPurifier(ml.shifu.shifu.core.DataPurifier) HashMap(java.util.HashMap) Map(java.util.Map) IntWritable(org.apache.hadoop.io.IntWritable)

Example 9 with DataPurifier

Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

The class AutoTypeDistinctCountMapper, method setup.

/**
 * Prepares mapper state: loads config files, creates the data purifier, loads tag-weight information,
 * allocates the output key and the per-variable count map, and copies the flattened tags and the
 * missing-or-invalid values into sets.
 *
 * @param context job context handed to {@code loadConfigFiles}
 * @throws IOException declared by the overridden method; may be raised by config loading or purifier creation
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // Configuration must be loaded first; everything below reads modelConfig.
    loadConfigFiles(context);
    dataPurifier = new DataPurifier(modelConfig, false);
    loadTagWeightNum();

    // Fresh, empty containers for this mapper.
    outputKey = new IntWritable();
    variableCountMap = new HashMap<Integer, CountAndFrequentItems>();

    // Snapshot the configured tag and missing-value collections as sets.
    HashSet<String> flattenedTags = new HashSet<String>(modelConfig.getFlattenTags());
    tags = flattenedTags;
    HashSet<String> missingValues = new HashSet<String>(modelConfig.getDataSet().getMissingOrInvalidValues());
    missingOrInvalidValues = missingValues;
}
Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) IntWritable(org.apache.hadoop.io.IntWritable)

Example 10 with DataPurifier

Use of ml.shifu.shifu.core.DataPurifier in project shifu by ShifuML.

The class CorrelationMapper, method setup.

/**
 * Prepares mapper state before records are processed.
 *
 * <p>Calls {@code loadConfigFiles(context)} (presumably populating {@code modelConfig} and
 * {@code columnConfigList} - confirm against that helper), reads the data set delimiter, creates the
 * data purifier, reads the "compute all" flag from the job configuration (default {@code "false"}),
 * builds a value-to-bin-index map for every categorical column, and copies the
 * positive/negative/flattened tag collections into sets.
 *
 * @param context job context supplying the configuration
 * @throws IOException declared by the overridden method; may be raised by config loading or purifier creation
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    loadConfigFiles(context);
    this.dataSetDelimiter = modelConfig.getDataSetDelimiter();
    this.dataPurifier = new DataPurifier(modelConfig, false);
    // parseBoolean yields the primitive directly, avoiding a needless Boolean box.
    this.isComputeAll = Boolean.parseBoolean(context.getConfiguration().get(Constants.SHIFU_CORRELATION_COMPUTE_ALL, "false"));
    for (ColumnConfig config : columnConfigList) {
        if (config.isCategorical()) {
            // Every categorical column gets a map, even an empty one when it has no bin categories.
            Map<String, Integer> map = new HashMap<String, Integer>();
            List<String> binCategory = config.getBinCategory();
            if (binCategory != null) {
                for (int i = 0; i < binCategory.size(); i++) {
                    // A bin entry may group several category values; each maps to the same bin index.
                    List<String> cvals = CommonUtils.flattenCatValGrp(binCategory.get(i));
                    for (String cval : cvals) {
                        map.put(cval, i);
                    }
                }
            }
            this.categoricalIndexMap.put(config.getColumnNum(), map);
        }
    }
    // modelConfig was already dereferenced above, so it cannot be null here;
    // only the individual tag collections still need a null check.
    if (modelConfig.getPosTags() != null) {
        this.posTagSet = new HashSet<String>(modelConfig.getPosTags());
    }
    if (modelConfig.getNegTags() != null) {
        this.negTagSet = new HashSet<String>(modelConfig.getNegTags());
    }
    if (modelConfig.getFlattenTags() != null) {
        this.tagSet = new HashSet<String>(modelConfig.getFlattenTags());
    }
    this.tags = modelConfig.getSetTags();
}
Also used : DataPurifier(ml.shifu.shifu.core.DataPurifier) ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) HashMap(java.util.HashMap)

Aggregations

DataPurifier (ml.shifu.shifu.core.DataPurifier) 11, IntWritable (org.apache.hadoop.io.IntWritable) 5, ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig) 4, HashMap (java.util.HashMap) 3, RawSourceData (ml.shifu.shifu.container.obj.RawSourceData) 2, IOException (java.io.IOException) 1, InvocationTargetException (java.lang.reflect.InvocationTargetException) 1, ArrayList (java.util.ArrayList) 1, List (java.util.List) 1, Map (java.util.Map) 1, Properties (java.util.Properties) 1, ModelConfig (ml.shifu.shifu.container.obj.ModelConfig) 1, ModelSourceDataConf (ml.shifu.shifu.container.obj.ModelSourceDataConf) 1, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType) 1, ModelRunner (ml.shifu.shifu.core.ModelRunner) 1, CountAndFrequentItems (ml.shifu.shifu.core.autotype.AutoTypeDistinctCountMapper.CountAndFrequentItems) 1, TrainingDataSet (ml.shifu.shifu.core.dvarsel.dataset.TrainingDataSet) 1, ShifuException (ml.shifu.shifu.exception.ShifuException) 1, DoubleWritable (org.apache.hadoop.io.DoubleWritable) 1, NullWritable (org.apache.hadoop.io.NullWritable) 1