Search in sources :

Example 1 with MapReduceShuffle

use of ml.shifu.shifu.core.shuffle.MapReduceShuffle in project shifu by ShifuML.

In class BasicModelProcessor, method runDataClean:

/**
 * Generates cleaned data for tree models by submitting the Normalize.pig script in
 * "norm for clean" mode, and optionally shuffles the cleaned output.
 *
 * <p>When a validation raw data path is configured, the same script is submitted a second
 * time against the validation data and its output goes to the cleaned validation data path.
 *
 * @param isToShuffle when {@code true}, a {@link MapReduceShuffle} is run on the cleaned data
 *                    path after the Pig job(s) complete
 * @throws IOException declared for the file existence check and deletion performed before the
 *                     Pig job is submitted
 * @throws ShifuException if the Pig submission fails with an {@link IOException}
 * @throws RuntimeException if the Pig submission fails with any other throwable, or if the
 *                          shuffle fails or is interrupted
 */
protected void runDataClean(boolean isToShuffle) throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    String cleanedDataPath = this.pathFinder.getCleanedDataPath();
    LOG.info("Start to generate clean data for tree model ... ");
    // Remove any existing cleaned data at the target path before submitting the job.
    if (ShifuFileUtils.isFileExists(cleanedDataPath, sourceType)) {
        ShifuFileUtils.deleteFile(cleanedDataPath, sourceType);
    }
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put("sampleRate", modelConfig.getNormalizeSampleRate().toString());
    paramsMap.put("sampleNegOnly", ((Boolean) modelConfig.isNormalizeSampleNegOnly()).toString());
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("is_csv", String.valueOf(Boolean.TRUE.toString().equalsIgnoreCase(Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_CSV, Boolean.FALSE.toString()))));
    try {
        String normPigPath = pathFinder.getScriptPath("scripts/Normalize.pig");
        paramsMap.put(Constants.IS_COMPRESS, "true");
        paramsMap.put(Constants.IS_NORM_FOR_CLEAN, "true");
        paramsMap.put(Constants.PATH_NORMALIZED_DATA, cleanedDataPath);
        PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap, sourceType, this.pathFinder);
        // cleaned validation data
        if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
            String cleanedValidationDataPath = pathFinder.getCleanedValidationDataPath();
            if (ShifuFileUtils.isFileExists(cleanedValidationDataPath, sourceType)) {
                ShifuFileUtils.deleteFile(cleanedValidationDataPath, sourceType);
            }
            // The same parameter map is reused; only compression, the raw input path and the
            // output path are overridden for the validation run.
            paramsMap.put(Constants.IS_COMPRESS, "false");
            paramsMap.put(Constants.PATH_RAW_DATA, modelConfig.getValidationDataSetRawPath());
            paramsMap.put(Constants.PATH_NORMALIZED_DATA, cleanedValidationDataPath);
            PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap, sourceType, this.pathFinder);
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        // NOTE(review): this also wraps unchecked exceptions and Errors; kept as-is so callers
        // continue to see the same exception types.
        throw new RuntimeException(e);
    }
    if (isToShuffle) {
        MapReduceShuffle shuffler = new MapReduceShuffle(this.modelConfig);
        try {
            shuffler.run(cleanedDataPath);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Fail to shuffle the cleaned data.", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe
            // that this thread was interrupted.
            Thread.currentThread().interrupt();
            throw new RuntimeException("Fail to shuffle the cleaned data.", e);
        }
    }
    LOG.info("Generate clean data for tree model successful.");
}
Also used : SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) MapReduceShuffle(ml.shifu.shifu.core.shuffle.MapReduceShuffle) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 2 with MapReduceShuffle

use of ml.shifu.shifu.core.shuffle.MapReduceShuffle in project shifu by ShifuML.

In class NormalizeModelProcessor, method run:

/**
 * Runs the normalize step.
 *
 * <p>For the {@code DIST} and {@code MAPRED} run modes this runs the Pig normalization,
 * attempts the automatic shuffle check (a failure there is only logged as a warning),
 * shuffles the normalized data when {@code isToShuffleData} is set, and for tree models
 * additionally generates cleaned data. For the {@code LOCAL} run mode the Akka-based
 * normalization is used instead.
 *
 * @return 0 when the step finishes, -1 when an exception was caught and logged
 * @throws Exception declared by the signature; exceptions thrown by the step body are caught
 *                   and logged here rather than propagated
 */
@Override
public int run() throws Exception {
    log.info("Step Start: normalize");
    long start = System.currentTimeMillis();
    try {
        setUp(ModelStep.NORMALIZE);
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                runPigNormalize();
                try {
                    autoCheckShuffleAndShuffleSize();
                } catch (Exception e) {
                    // Best effort: a failure here must not fail the whole normalize step.
                    log.warn("warn: exception in auto check shuffle size, can be ignored as no big impact", e);
                }
                if (this.isToShuffleData) {
                    // shuffling normalized data, to make data random
                    MapReduceShuffle shuffler = new MapReduceShuffle(this.modelConfig);
                    shuffler.run(this.pathFinder.getNormalizedDataPath());
                }
                if (CommonUtils.isTreeModel(modelConfig.getAlgorithm())) {
                    runDataClean(this.isToShuffleData);
                }
                break;
            case LOCAL:
                runAkkaNormalize();
                break;
        }
        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.NORMALIZE);
    } catch (ShifuException e) {
        log.error("Error:" + e.getError().toString() + "; msg:" + e.getMessage(), e);
        return -1;
    } catch (InterruptedException e) {
        // Restore the interrupt flag before reporting failure; otherwise the generic handler
        // below would swallow the interruption and callers could not observe it.
        Thread.currentThread().interrupt();
        log.error("Error:" + e.getMessage(), e);
        return -1;
    } catch (Exception e) {
        log.error("Error:" + e.getMessage(), e);
        return -1;
    }
    log.info("Step Finished: normalize with {} ms", (System.currentTimeMillis() - start));
    return 0;
}
Also used : MapReduceShuffle(ml.shifu.shifu.core.shuffle.MapReduceShuffle) ShifuException(ml.shifu.shifu.exception.ShifuException) IOException(java.io.IOException)

Aggregations

MapReduceShuffle (ml.shifu.shifu.core.shuffle.MapReduceShuffle): 2, ShifuException (ml.shifu.shifu.exception.ShifuException): 2, IOException (java.io.IOException): 1, SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 1