Search in sources :

Example 31 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class EvalModelProcessor method runDistScore.

/**
 * Run eval scoring as a pig job and build the score status from the job counters.
 *
 * <p>
 * The eval normalized, score and performance output paths are deleted first. The script submitted is
 * <code>scripts/Eval.pig</code>, or <code>scripts/EvalScore.pig</code> for classification models and for a
 * no-sort SCORE step. After the job is done, the job graph is scanned for the first job whose record counter
 * is non-zero; its counters are used to populate the returned status.
 *
 * @param evalConfig
 *            eval config instance of the eval set to score
 * @return score status built from the first job reporting a non-zero record counter, or <code>null</code> if no
 *         job reports one
 * @throws IOException
 *             any io exception raised outside of the pig job submission
 * @throws ShifuException
 *             if the pig job submission fails with an io exception
 */
@SuppressWarnings("deprecation")
private ScoreStatus runDistScore(EvalConfig evalConfig) throws IOException {
    // clean up output directories
    SourceType sourceType = evalConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalScorePath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalPerformancePath(evalConfig), sourceType);
    // prepare special parameters and execute pig
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
    paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
    paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
    paramsMap.put("pathEvalScore", pathFinder.getEvalScorePath(evalConfig));
    paramsMap.put("pathEvalPerformance", pathFinder.getEvalPerformancePath(evalConfig));
    paramsMap.put("eval_set_name", evalConfig.getName());
    paramsMap.put("delimiter", CommonUtils.escapePigString(evalConfig.getDataSet().getDataDelimiter()));
    paramsMap.put("columnIndex", evalConfig.getPerformanceScoreSelector().trim());
    paramsMap.put("scale", Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));
    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);
    String pigScript = "scripts/Eval.pig";
    Map<String, String> confMap = new HashMap<String, String>();
    // max min score folder, made unique per run by timestamp plus a random suffix
    String maxMinScoreFolder = ShifuFileUtils.getFileSystemBySourceType(sourceType).makeQualified(new Path("tmp" + File.separator + "maxmin_score_" + System.currentTimeMillis() + "_" + RANDOM.nextLong())).toString();
    confMap.put(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT, maxMinScoreFolder);
    if (modelConfig.isClassification() || (isNoSort() && EvalStep.SCORE.equals(this.evalStep))) {
        pigScript = "scripts/EvalScore.pig";
    }
    try {
        PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap, evalConfig.getDataSet().getSource(), confMap, super.pathFinder);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
    Iterator<JobStats> iter = PigStats.get().getJobGraph().iterator();
    while (iter.hasNext()) {
        JobStats jobStats = iter.next();
        // Defensive: a job may expose no counters or no shifu counter group; skip it and check the next one
        // instead of failing with a NullPointerException.
        if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
            continue;
        }
        long evalRecords = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_RECORDS);
        LOG.info("Total valid eval records is : {}", evalRecords);
        // If no basic record counter, check next one
        if (evalRecords == 0L) {
            continue;
        }
        this.evalRecords = evalRecords;
        long pigPosTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_POSTAGS);
        long pigNegTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_NEGTAGS);
        // weighted counters are stored as scaled longs; divide by the scale to get the double value back
        double pigPosWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WPOSTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        double pigNegWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WNEGTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        LOG.info("Total positive record count is : {}", pigPosTags);
        LOG.info("Total negative record count is : {}", pigNegTags);
        LOG.info("Total weighted positive record count is : {}", pigPosWeightTags);
        LOG.info("Total weighted negative record count is : {}", pigNegWeightTags);
        long totalRunTime = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.TOTAL_MODEL_RUNTIME);
        // evalRecords is non-zero here because of the check above, so the division is safe
        LOG.info("Avg SLA for eval model scoring is {} micro seconds", totalRunTime / evalRecords);
        // sentinel values, only replaced by real scores for regression models
        double maxScore = Integer.MIN_VALUE;
        double minScore = Integer.MAX_VALUE;
        if (modelConfig.isRegression()) {
            double[] maxMinScores = locateMaxMinScoreFromFile(sourceType, maxMinScoreFolder);
            maxScore = maxMinScores[0];
            minScore = maxMinScores[1];
            LOG.info("Raw max score is {}, raw min score is {}", maxScore, minScore);
            ShifuFileUtils.deleteFile(maxMinScoreFolder, sourceType);
        }
        // only one pig job with such counters, return
        return new ScoreStatus(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore, minScore, evalRecords);
    }
    // no job reported a non-zero record counter
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) IOException(java.io.IOException) JobStats(org.apache.pig.tools.pigstats.JobStats) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 32 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class EvalModelProcessor method runPigNormalize.

/**
 * Normalize evaluation dataset based on pig distributed solution.
 *
 * <p>
 * The eval normalized output path is deleted first, then <code>scripts/EvalNorm.pig</code> is submitted with the
 * parameters of the given eval set.
 *
 * @param evalConfig
 *            eval config instance
 * @throws IOException
 *             any io exception raised outside of the pig job submission
 * @throws ShifuException
 *             if the pig job submission fails with an io exception
 */
private void runPigNormalize(EvalConfig evalConfig) throws IOException {
    SourceType sourceType = evalConfig.getDataSet().getSource();
    // clean up output directories
    ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);
    // prepare special parameters and execute pig
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
    paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
    paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
    paramsMap.put("eval_set_name", evalConfig.getName());
    // escape the delimiter the same way the eval scoring job does before handing it to pig
    paramsMap.put("delimiter", CommonUtils.escapePigString(evalConfig.getDataSet().getDataDelimiter()));
    paramsMap.put("scale", Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));
    paramsMap.put(Constants.STRICT_MODE, Boolean.toString(isStrict()));
    String pigScript = "scripts/EvalNorm.pig";
    try {
        PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap, evalConfig.getDataSet().getSource());
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
Also used : HashMap(java.util.HashMap) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 33 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class EvalModelProcessor method createNewEval.

/**
 * Add a new evaluation set called <code>name</code> to the model config and persist it.
 *
 * <p>
 * The new eval set gets a clone of the model data set. Two local files are created if missing: the score meta
 * column file and the meta column names file, the latter being registered on the eval data set.
 *
 * @param name
 *            - the evaluation set name
 * @throws IOException
 *             any io exception
 * @throws ShifuException
 *             if an eval set with the same name already exists, or if writing the model config fails
 */
private void createNewEval(String name) throws IOException {
    if (modelConfig.getEvalConfigByName(name) != null) {
        throw new ShifuException(ShifuErrorCode.ERROR_MODEL_EVALSET_ALREADY_EXIST, "EvalSet - " + name + " already exists in ModelConfig. Please use another evalset name");
    }

    EvalConfig newEval = new EvalConfig();
    newEval.setName(name);
    newEval.setDataSet(modelConfig.getDataSet().cloneRawSourceData());

    // create empty <EvalSetName>Score.meta.column.names
    Path scoreMetaPath = new Path(newEval.getName() + Constants.DEFAULT_CHAMPIONSCORE_META_COLUMN_FILE);
    ShifuFileUtils.createFileIfNotExists(scoreMetaPath.toString(), SourceType.LOCAL);

    // create empty <EvalSetName>.meta.column.names
    String metaColumnFile = Constants.COLUMN_META_FOLDER_NAME + File.separator + newEval.getName() + "." + Constants.DEFAULT_META_COLUMN_FILE;
    Path metaColumnPath = new Path(metaColumnFile);
    ShifuFileUtils.createFileIfNotExists(metaColumnPath.toString(), SourceType.LOCAL);
    newEval.getDataSet().setMetaColumnNameFile(metaColumnFile);

    modelConfig.getEvals().add(newEval);
    try {
        saveModelConfig();
    } catch (IOException ioe) {
        throw new ShifuException(ShifuErrorCode.ERROR_WRITE_MODELCONFIG, ioe);
    }
    LOG.info("Create Eval - " + name);
}
Also used : EvalConfig(ml.shifu.shifu.container.obj.EvalConfig) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 34 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class NormalizeModelProcessor method runPigNormalize.

/**
 * Running pig normalize process
 *
 * <p>
 * The normalized data path and the selected raw data path are deleted first, then the normalize pig script is
 * submitted (a parquet variant is chosen when parquet output is configured). After the job, counters are
 * checked to warn about a high ratio of invalid tags. If a validation data set path is configured, the same
 * script is submitted a second time for the validation data.
 *
 * @throws IOException
 *             any IO exception raised outside of the pig job submission.
 * @throws ShifuException
 *             if running the pig jobs fails with an io exception
 */
@SuppressWarnings("deprecation")
private void runPigNormalize() throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getNormalizedDataPath(), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getSelectedRawDataPath(), sourceType);
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put("sampleRate", modelConfig.getNormalizeSampleRate().toString());
    paramsMap.put("sampleNegOnly", ((Boolean) modelConfig.isNormalizeSampleNegOnly()).toString());
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("is_csv", String.valueOf(Boolean.TRUE.toString().equalsIgnoreCase(Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_CSV, Boolean.FALSE.toString()))));
    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);
    try {
        String normPigPath;
        if (modelConfig.getNormalize().getIsParquet()) {
            if (modelConfig.getBasic().getPostTrainOn()) {
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquetAndPostTrain.pig");
            } else {
                log.info("Post train is disabled by 'postTrainOn=false'.");
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquet.pig");
            }
        } else {
            // no matter post train enabled or not, only norm results will be stored since new post train
            // solution no need to prepare data
            normPigPath = pathFinder.getScriptPath("scripts/Normalize.pig");
        }
        paramsMap.put(Constants.IS_COMPRESS, "true");
        paramsMap.put(Constants.IS_NORM_FOR_CLEAN, "false");
        PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);
        Iterator<JobStats> iter = PigStats.get().getJobGraph().iterator();
        while (iter.hasNext()) {
            JobStats jobStats = iter.next();
            // A job without counters cannot be the one we look for; check next one instead of stopping the scan
            if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
                continue;
            }
            long totalValidCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("TOTAL_VALID_COUNT");
            // If no basic record counter, check next one
            if (totalValidCount == 0L) {
                continue;
            }
            long invalidTagCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("INVALID_TAG");
            log.info("Total valid records {} after filtering, invalid tag records {}.", totalValidCount, invalidTagCount);
            // warn when at least 80% of the valid records carry an invalid tag
            if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
                log.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
            }
            // only one pig job with such counters, break
            break;
        }
        if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
            // second run over the validation data set, reusing the parameters with input/output paths overridden
            ShifuFileUtils.deleteFile(pathFinder.getNormalizedValidationDataPath(), sourceType);
            paramsMap.put(Constants.IS_COMPRESS, "false");
            paramsMap.put(Constants.IS_VALIDATION_DATASET, "true");
            paramsMap.put(Constants.PATH_RAW_DATA, modelConfig.getValidationDataSetRawPath());
            paramsMap.put(Constants.PATH_NORMALIZED_DATA, pathFinder.getNormalizedValidationDataPath());
            PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
Also used : HashMap(java.util.HashMap) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException) JobStats(org.apache.pig.tools.pigstats.JobStats)

Example 35 with ShifuException

use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.

the class NormalizeModelProcessor method run.

/**
 * Entry point of the normalize step.
 *
 * <p>
 * Depending on the run mode, normalization is done by pig (DIST and MAPRED, optionally followed by shuffling and,
 * for tree models, data cleaning) or by akka (LOCAL). Any exception is logged and reported through the return
 * code instead of being propagated.
 *
 * @return 0 if the step finished, -1 if an exception was caught
 */
@Override
public int run() throws Exception {
    log.info("Step Start: normalize");
    final long beginMillis = System.currentTimeMillis();
    try {
        setUp(ModelStep.NORMALIZE);
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        switch(modelConfig.getBasic().getRunMode()) {
            case LOCAL:
                runAkkaNormalize();
                break;
            case DIST:
            case MAPRED:
                runPigNormalize();
                try {
                    autoCheckShuffleAndShuffleSize();
                } catch (Exception checkFailure) {
                    // best effort only: a failing auto check must not fail the whole step
                    log.warn("warn: exception in auto check shuffle size, can be ignored as no big impact", checkFailure);
                }
                if (this.isToShuffleData) {
                    // shuffling normalized data, to make data random
                    new MapReduceShuffle(this.modelConfig).run(this.pathFinder.getNormalizedDataPath());
                }
                if (CommonUtils.isTreeModel(modelConfig.getAlgorithm())) {
                    runDataClean(this.isToShuffleData);
                }
                break;
        }
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.NORMALIZE);
    } catch (ShifuException shifuFailure) {
        log.error("Error:" + shifuFailure.getError().toString() + "; msg:" + shifuFailure.getMessage(), shifuFailure);
        return -1;
    } catch (Exception failure) {
        log.error("Error:" + failure.getMessage(), failure);
        return -1;
    }
    long elapsedMillis = System.currentTimeMillis() - beginMillis;
    log.info("Step Finished: normalize with {} ms", elapsedMillis);
    return 0;
}
Also used : MapReduceShuffle(ml.shifu.shifu.core.shuffle.MapReduceShuffle) ShifuException(ml.shifu.shifu.exception.ShifuException) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException)

Aggregations

ShifuException (ml.shifu.shifu.exception.ShifuException): 39, IOException (java.io.IOException): 22, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 12, HashMap (java.util.HashMap): 8, ArrayList (java.util.ArrayList): 5, ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 5, File (java.io.File): 4, Scanner (java.util.Scanner): 4, Path (org.apache.hadoop.fs.Path): 4, SourceFile (ml.shifu.shifu.fs.SourceFile): 3, JobStats (org.apache.pig.tools.pigstats.JobStats): 3, BufferedReader (java.io.BufferedReader): 2, ConfusionMatrixObject (ml.shifu.shifu.container.ConfusionMatrixObject): 2, EvalConfig (ml.shifu.shifu.container.obj.EvalConfig): 2, RawSourceData (ml.shifu.shifu.container.obj.RawSourceData): 2, AbstractStatsExecutor (ml.shifu.shifu.core.processor.stats.AbstractStatsExecutor): 2, AkkaStatsWorker (ml.shifu.shifu.core.processor.stats.AkkaStatsWorker): 2, DIBStatsExecutor (ml.shifu.shifu.core.processor.stats.DIBStatsExecutor): 2, MunroPatIStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatIStatsExecutor): 2, MunroPatStatsExecutor (ml.shifu.shifu.core.processor.stats.MunroPatStatsExecutor): 2