
Example 21 with ShifuException

Use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

Class NormalizeUDF, method exec.

@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    final String rawTag = CommonUtils.trimTag(tag.toString());
    // make sure all records with an invalid tag are filtered out
    if (!isLinearTarget && !super.tagSet.contains(rawTag)) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    // if(!isLinearTarget && !this.isForClean) {
    if (!isLinearTarget) {
        // do data sampling. Unselected data or data with invalid tag will be filtered out.
        boolean isNotSampled = DataSampler.isNotSampled(modelConfig.isRegression(), super.tagSet, super.posTagSet, super.negTagSet, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(), rawTag);
        if (isNotSampled) {
            return null;
        }
    }
    // append tuple with tag, normalized value.
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    Map<String, Object> compactVarMap = null;
    if (this.isCompactNorm) {
        compactVarMap = new HashMap<String, Object>();
    }
    if (!this.isForExpressions) {
        if (input.size() != this.columnConfigList.size()) {
            this.mismatchCnt++;
            log.error("the input size - " + input.size() + ", while column size - " + columnConfigList.size());
            // tolerate a bounded number of malformed records: skip them until the threshold is reached
            if (this.mismatchCnt > MAX_MISMATCH_CNT) {
                throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
            }
            return null;
        }
        for (int i = 0; i < input.size(); i++) {
            ColumnConfig config = columnConfigList.get(i);
            String val = (input.get(i) == null) ? "" : input.get(i).toString();
            // load variables for weight calculation.
            if (weightExpr != null) {
                weightContext.set(new NSColumn(config.getColumnName()).getSimpleName(), val);
            }
            // check tag type.
            if (tagColumnNum == i) {
                if (modelConfig.isRegression()) {
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else if (this.isLinearTarget) {
                    double tagValue = 0.0;
                    try {
                        tagValue = Double.parseDouble(rawTag);
                    } catch (Exception e) {
                        log.error("Tag - " + rawTag + " is invalid(not numerical). Skip record.");
                        // skip this line
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), tagValue);
                    } else {
                        tuple.append(tagValue);
                    }
                } else {
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isForClean) {
                // for RF/GBT models, only clean the data; no real normalization is done
                if (config.isCategorical()) {
                    Map<String, Integer> map = this.categoricalIndexMap.get(config.getColumnNum());
                    // map should never be null here; if val is not in binCategory, output an empty string
                    tuple.append(((map.get(val) == null || map.get(val) == -1)) ? "" : val);
                } else {
                    Double normVal = 0d;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    appendOutputValue(tuple, normVal, true);
                }
            } else {
                if (this.isCompactNorm) {
                    // in compact norm mode, only output features, target and weight
                    if (!config.isMeta() && config.isFinalSelect()) {
                        // for multi-class classification, binPosRate is the rate of this category over all counts;
                        // reuse binPosRate for normalization
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            String formatVal = getOutputValue(normVal, true);
                            compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                        }
                    } else if (config.isMeta()) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                    } else {
                        // in compact mode, columns that are neither meta nor final-selected are intentionally
                        // skipped: only the target and finalSelect features are appended. TODO: do we need to
                        // keep meta columns here?
                    }
                } else {
                    // only variables passing isToNormVariable are normalized; others are appended as null (meta columns keep their raw value)
                    if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                        // for multi-class classification, binPosRate is the rate of this category over all counts;
                        // reuse binPosRate for normalization
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            appendOutputValue(tuple, normVal, true);
                        }
                    } else {
                        tuple.append(config.isMeta() ? val : null);
                    }
                }
            }
        }
    } else {
        // for segment expansion variables
        int rawSize = input.size();
        for (int i = 0; i < this.columnConfigList.size(); i++) {
            ColumnConfig config = this.columnConfigList.get(i);
            int newIndex = i >= rawSize ? i % rawSize : i;
            String val = (input.get(newIndex) == null) ? "" : input.get(newIndex).toString();
            // for target column
            if (config.isTarget()) {
                if (modelConfig.isRegression()) {
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else {
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isCompactNorm) {
                // in compact norm mode, only output features, target and weight
                if (!config.isMeta() && config.isFinalSelect()) {
                    // for multi-class classification, binPosRate is the rate of this category over all counts;
                    // reuse binPosRate for normalization
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        String formatVal = getOutputValue(normVal, true);
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                    }
                } else if (config.isMeta()) {
                    compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                } else {
                    // in compact mode, columns that are neither meta nor final-selected are intentionally
                    // skipped: only the target and finalSelect features are appended. TODO: do we need to
                    // keep meta columns here?
                }
            } else {
                // for others
                if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        appendOutputValue(tuple, normVal, true);
                    }
                } else {
                    tuple.append(config.isMeta() ? val : null);
                }
            }
        }
    }
    // for compact norm mode, output to the tuple here
    if (this.isCompactNorm) {
        for (int i = 0; i < outputCompactColumns.size(); i++) {
            tuple.append(compactVarMap.get(outputCompactColumns.get(i)));
        }
    }
    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) NormType(ml.shifu.shifu.container.obj.ModelNormalizeConf.NormType) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException) Tuple(org.apache.pig.data.Tuple) NSColumn(ml.shifu.shifu.column.NSColumn)
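
The guard at the top of exec above is worth calling out: malformed records are skipped silently until a threshold is crossed, and only then does the UDF fail the whole job with a ShifuException. Below is a minimal standalone sketch of the same pattern, assuming a hypothetical threshold constant and using IOException as a stand-in for ShifuException so the snippet compiles without Shifu on the classpath.

import java.io.IOException;

public class BoundedMismatchGuard {

    // hypothetical threshold, mirroring MAX_MISMATCH_CNT in NormalizeUDF above
    private static final int MAX_MISMATCH_CNT = 500;

    private int mismatchCnt = 0;

    /**
     * Returns true when the record should be skipped; throws once too many
     * malformed records have been seen.
     */
    public boolean skipIfMalformed(int actualSize, int expectedSize) throws IOException {
        if (actualSize == expectedSize) {
            return false; // well-formed record, process it
        }
        mismatchCnt++;
        if (mismatchCnt > MAX_MISMATCH_CNT) {
            // in Shifu this is: throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG)
            throw new IOException("Too many malformed records: " + mismatchCnt);
        }
        return true; // silently skip this record
    }
}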

Example 22 with ShifuException

Use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

Class CommonUtils, method getHeaders.

/**
 * Return header column array from header file.
 *
 * @param pathHeader
 *            header path
 * @param delimiter
 *            the delimiter of headers
 * @param sourceType
 *            source type: hdfs or local
 * @param isFull
 *            whether to return the full header name, including its namespace
 * @return headers array
 * @throws IOException
 *             if any IO exception occurs while reading the file.
 *
 * @throws IllegalArgumentException
 *             if sourceType is null, or if pathHeader or delimiter is null or empty.
 *
 * @throws RuntimeException
 *             if the first line of pathHeader is null or empty.
 */
public static String[] getHeaders(String pathHeader, String delimiter, SourceType sourceType, boolean isFull) throws IOException {
    if (StringUtils.isEmpty(pathHeader) || StringUtils.isEmpty(delimiter) || sourceType == null) {
        throw new IllegalArgumentException(String.format("Null or empty parameters pathHeader:%s, delimiter:%s, sourceType:%s", pathHeader, delimiter, sourceType));
    }
    BufferedReader reader = null;
    String pigHeaderStr = null;
    try {
        reader = ShifuFileUtils.getReader(pathHeader, sourceType);
        pigHeaderStr = reader.readLine();
        if (StringUtils.isEmpty(pigHeaderStr)) {
            throw new RuntimeException(String.format("Cannot read header info from the first line of file: %s", pathHeader));
        }
    } catch (Exception e) {
        log.error("Error in getReader, this must be caught in this method to make sure the next reader can be returned.", e);
        throw new ShifuException(ShifuErrorCode.ERROR_HEADER_NOT_FOUND);
    } finally {
        IOUtils.closeQuietly(reader);
    }
    List<String> headerList = new ArrayList<String>();
    Set<String> headerSet = new HashSet<String>();
    int index = 0;
    for (String str : Splitter.on(delimiter).split(pigHeaderStr)) {
        String columnName = StringUtils.trimToEmpty(str);
        if (!Environment.getBoolean(Constants.SHIFU_NAMESPACE_STRICT_MODE, false)) {
            columnName = getRelativePigHeaderColumnName(str);
        }
        /*
         * if(isFull) {
         * columnName = getFullPigHeaderColumnName(str);
         * } else {
         * columnName = getRelativePigHeaderColumnName(str);
         * }
         */
        if (headerSet.contains(columnName)) {
            columnName = columnName + "_" + index;
        }
        columnName = normColumnName(columnName);
        headerSet.add(columnName);
        index++;
        headerList.add(columnName);
    }
    return headerList.toArray(new String[0]);
}
Also used : ShifuException(ml.shifu.shifu.exception.ShifuException) ExecException(org.apache.pig.backend.executionengine.ExecException)
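
A brief usage sketch of getHeaders follows. The header path and delimiter are placeholders, and while SourceType comes from the package listed in the Aggregations section, the CommonUtils import path is an assumption; treat the whole snippet as illustration rather than documented API usage.

import ml.shifu.shifu.container.obj.RawSourceData.SourceType;
import ml.shifu.shifu.util.CommonUtils;

public class HeaderReadExample {
    public static void main(String[] args) throws Exception {
        // hypothetical header file path and pipe delimiter, for illustration only
        String[] headers = CommonUtils.getHeaders("hdfs:/user/shifu/model/.pig_header", "|", SourceType.HDFS, true);
        for (String header : headers) {
            System.out.println(header);
        }
        // a missing or empty header file surfaces as ShifuException(ERROR_HEADER_NOT_FOUND)
    }
}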

Example 23 with ShifuException

Use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

Class HDFSUtils, method getFS.

/*
 * Get HDFS FileSystem
 */
public static FileSystem getFS() {
    if (hdfs == null) {
        synchronized (HDFSUtils.class) {
            if (hdfs == null) {
                try {
                    // Assign to the hdfs field only after the tmpHdfs instance is fully initialized,
                    // to avoid the hdfs instance being observed in a partially initialized state.
                    FileSystem tmpHdfs = FileSystem.get(conf);
                    tmpHdfs.setVerifyChecksum(false);
                    hdfs = tmpHdfs;
                } catch (IOException e) {
                    LOG.error("Error on creating hdfs FileSystem object.", e);
                    throw new ShifuException(ShifuErrorCode.ERROR_GET_HDFS_SYSTEM);
                }
            }
        }
    }
    return hdfs;
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)
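
A design note on the lazy initialization above: double-checked locking is only guaranteed safe under the Java memory model when the cached field is declared volatile (or hidden behind a holder class); otherwise a thread outside the synchronized block may observe a partially constructed FileSystem. Here is a minimal generic sketch of the volatile variant, not taken from the Shifu codebase.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public final class LazyFileSystem {

    private static final Configuration CONF = new Configuration();

    // volatile is what makes double-checked locking safe under the Java memory model
    private static volatile FileSystem hdfs;

    public static FileSystem getFS() throws IOException {
        FileSystem local = hdfs; // single volatile read on the fast path
        if (local == null) {
            synchronized (LazyFileSystem.class) {
                local = hdfs;
                if (local == null) {
                    local = FileSystem.get(CONF);
                    local.setVerifyChecksum(false);
                    hdfs = local; // publish only after full initialization
                }
            }
        }
        return local;
    }
}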

Example 24 with ShifuException

Use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

Class StatsStep, method process.

/*
 * (non-Javadoc)
 *
 * @see ml.shifu.common.Step#process()
 */
@Override
public List<ColumnConfig> process() throws IOException {
    LOG.info("Step Start: stats");
    long start = System.currentTimeMillis();
    try {
        // User may change variable type after `shifu init`
        ColumnConfigUpdater.updateColumnConfigFlags(this.modelConfig, this.columnConfigList, ModelStep.STATS);
        LOG.info("Saving ModelConfig, ColumnConfig and then upload to HDFS ...");
        JSONUtils.writeValue(new File(pathFinder.getModelConfigPath(SourceType.LOCAL)), modelConfig);
        JSONUtils.writeValue(new File(pathFinder.getColumnConfigPath(SourceType.LOCAL)), columnConfigList);
        if (SourceType.HDFS.equals(modelConfig.getDataSet().getSource())) {
            CommonUtils.copyConfFromLocalToHDFS(modelConfig, this.pathFinder);
        }
        AbstractStatsExecutor statsExecutor = null;
        if (modelConfig.isMapReduceRunMode()) {
            if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.DynamicBinning)) {
                statsExecutor = new DIBStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPat)) {
                statsExecutor = new MunroPatStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.MunroPatI)) {
                statsExecutor = new MunroPatIStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.SPDT)) {
                statsExecutor = new SPDTStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else if (modelConfig.getBinningAlgorithm().equals(ModelStatsConf.BinningAlgorithm.SPDTI)) {
                statsExecutor = new SPDTIStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else {
                statsExecutor = new SPDTIStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            }
        } else if (modelConfig.isLocalRunMode()) {
            statsExecutor = new AkkaStatsWorker(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
        } else {
            throw new ShifuException(ShifuErrorCode.ERROR_UNSUPPORT_MODE);
        }
        statsExecutor.doStats();
        if (SourceType.HDFS.equals(modelConfig.getDataSet().getSource())) {
            CommonUtils.copyConfFromLocalToHDFS(modelConfig, this.pathFinder);
        }
    } catch (Exception e) {
        LOG.error("Error:", e);
    }
    LOG.info("Step Finished: stats with {} ms", (System.currentTimeMillis() - start));
    return columnConfigList;
}
Also used : DIBStatsExecutor(ml.shifu.shifu.core.processor.stats.DIBStatsExecutor) MunroPatIStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor) SPDTIStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTIStatsExecutor) BasicModelProcessor(ml.shifu.shifu.core.processor.BasicModelProcessor) SPDTStatsExecutor(ml.shifu.shifu.core.processor.stats.SPDTStatsExecutor) AbstractStatsExecutor(ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor) MunroPatStatsExecutor(ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor) AkkaStatsWorker(ml.shifu.shifu.core.processor.stats.AkkaStatsWorker) File(java.io.File) ShifuException(ml.shifu.shifu.exception.ShifuException) IOException(java.io.IOException)
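
The executor selection above is a chain of equals checks against the BinningAlgorithm enum; the same dispatch reads more compactly as a switch. The following is a sketch of a hypothetical factory method, assuming the constructors take (BasicModelProcessor, ModelConfig, List<ColumnConfig>) as shown above; the import paths for ModelConfig and ModelStatsConf are assumptions, and the factory itself is not part of Shifu.

import java.util.List;

import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.container.obj.ModelConfig;
import ml.shifu.shifu.container.obj.ModelStatsConf.BinningAlgorithm;
import ml.shifu.shifu.core.processor.BasicModelProcessor;
import ml.shifu.shifu.core.processor.stats.*;

final class StatsExecutorFactory {

    // hypothetical helper mirroring the if/else chain in StatsStep.process()
    static AbstractStatsExecutor forMapReduce(BinningAlgorithm algo, BasicModelProcessor processor,
            ModelConfig modelConfig, List<ColumnConfig> columnConfigList) {
        switch (algo) {
            case DynamicBinning:
                return new DIBStatsExecutor(processor, modelConfig, columnConfigList);
            case MunroPat:
                return new MunroPatStatsExecutor(processor, modelConfig, columnConfigList);
            case MunroPatI:
                return new MunroPatIStatsExecutor(processor, modelConfig, columnConfigList);
            case SPDT:
                return new SPDTStatsExecutor(processor, modelConfig, columnConfigList);
            case SPDTI:
            default:
                // SPDTI is also the fallback, matching the original else branch
                return new SPDTIStatsExecutor(processor, modelConfig, columnConfigList);
        }
    }
}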

Example 25 with ShifuException

Use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

Class DataPrepareWorker, method convertRawDataIntoValueObject.

/*
 * Convert raw data into @ValueObject for calculating stats
 *
 * @param rawDataList
 *            - raw data for training
 * @param columnVoListMap
 *            <column-id --> @ValueObject list>
 * @throws ShifuException
 *             if the data field length does not equal the header length
 */
private DataPrepareStatsResult convertRawDataIntoValueObject(List<String> rawDataList, Map<Integer, List<ValueObject>> columnVoListMap) throws ShifuException {
    double sampleRate = modelConfig.getBinningSampleRate();
    long total = 0L;
    Map<Integer, Long> missingMap = new HashMap<Integer, Long>();
    for (String line : rawDataList) {
        total++;
        String[] raw = CommonUtils.split(line, modelConfig.getDataSetDelimiter());
        if (raw.length != columnConfigList.size()) {
            log.error("Expected Columns: " + columnConfigList.size() + ", but got: " + raw.length);
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }
        String tag = CommonUtils.trimTag(raw[targetColumnNum]);
        if (modelConfig.isBinningSampleNegOnly()) {
            if (modelConfig.getNegTags().contains(tag) && random.nextDouble() > sampleRate) {
                continue;
            }
        } else {
            if (random.nextDouble() > sampleRate) {
                continue;
            }
        }
        for (int i = 0; i < raw.length; i++) {
            if (!columnNumToActorMap.containsKey(i)) {
                // ignore unused columns
                continue;
            }
            ValueObject vo = new ValueObject();
            if (i >= columnConfigList.size()) {
                log.error("The input row has more fields than expected, please check your data");
                continue;
            }
            ColumnConfig config = columnConfigList.get(i);
            if (config.isNumerical()) {
                // NUMERICAL
                try {
                    vo.setValue(Double.valueOf(raw[i].trim()));
                    vo.setRaw(null);
                } catch (Exception e) {
                    log.debug("Column " + config.getColumnNum() + ": " + config.getColumnName() + " is expected to be NUMERICAL, however received: " + raw[i]);
                    incMap(i, missingMap);
                    continue;
                }
            } else if (config.isCategorical()) {
                // CATEGORICAL
                if (raw[i] == null || StringUtils.isEmpty(raw[i]) || modelConfig.getDataSet().getMissingOrInvalidValues().contains(raw[i].toLowerCase().trim())) {
                    incMap(i, missingMap);
                }
                vo.setRaw(raw[i].trim());
                vo.setValue(null);
            } else {
                // AUTO TYPE
                try {
                    vo.setValue(Double.valueOf(raw[i]));
                    vo.setRaw(null);
                } catch (Exception e) {
                    incMap(i, missingMap);
                    vo.setRaw(raw[i]);
                    vo.setValue(null);
                }
            }
            if (this.weightedColumnNum != -1) {
                try {
                    vo.setWeight(Double.valueOf(raw[weightedColumnNum]));
                } catch (NumberFormatException e) {
                    // fall back to a unit weight when the weight column is not numeric
                    vo.setWeight(1.0);
                }
            }
            vo.setTag(tag);
            List<ValueObject> voList = columnVoListMap.get(i);
            if (voList == null) {
                voList = new ArrayList<ValueObject>();
                columnVoListMap.put(i, voList);
            }
            voList.add(vo);
        }
    }
    DataPrepareStatsResult rt = new DataPrepareStatsResult(total, missingMap);
    return rt;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) HashMap(java.util.HashMap) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException) ValueObject(ml.shifu.shifu.container.ValueObject)
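
The sampling branch near the top of the loop above keeps positive records unconditionally when binningSampleNegOnly is set and down-samples everything at the same rate otherwise. Below is a standalone sketch of that decision with hypothetical names; it mirrors the logic shown above but is not a Shifu API.

import java.util.Random;
import java.util.Set;

final class BinningSampler {

    private final Random random = new Random();

    /**
     * Returns true when the record should be dropped before binning,
     * mirroring the sampling logic in convertRawDataIntoValueObject above.
     */
    boolean shouldSkip(String tag, Set<String> negTags, double sampleRate, boolean sampleNegOnly) {
        if (sampleNegOnly) {
            // only negative records are candidates for down-sampling
            return negTags.contains(tag) && random.nextDouble() > sampleRate;
        }
        // otherwise every record is sampled at the same rate
        return random.nextDouble() > sampleRate;
    }
}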

Aggregations

ShifuException (ml.shifu.shifu.exception.ShifuException): 39
IOException (java.io.IOException): 22
SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 12
HashMap (java.util.HashMap): 8
ArrayList (java.util.ArrayList): 5
ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 5
File (java.io.File): 4
Scanner (java.util.Scanner): 4
Path (org.apache.hadoop.fs.Path): 4
SourceFile (ml.shifu.shifu.fs.SourceFile): 3
JobStats (org.apache.pig.tools.pigstats.JobStats): 3
BufferedReader (java.io.BufferedReader): 2
ConfusionMatrixObject (ml.shifu.shifu.container.ConfusionMatrixObject): 2
EvalConfig (ml.shifu.shifu.container.obj.EvalConfig): 2
RawSourceData (ml.shifu.shifu.container.obj.RawSourceData): 2
AbstractStatsExecutor (ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor): 2
AkkaStatsWorker (ml.shifu.shifu.core.processor.stats.AkkaStatsWorker): 2
DIBStatsExecutor (ml.shifu.shifu.core.processor.stats.DIBStatsExecutor): 2
MunroPatIStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor): 2
MunroPatStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor): 2