Use of ml.shifu.shifu.core.shuffle.MapReduceShuffle in project shifu by ShifuML.
From the class BasicModelProcessor, method runDataClean.
/**
 * Generates the cleaned data set used by tree models by submitting the
 * {@code scripts/Normalize.pig} script with {@code Constants.IS_NORM_FOR_CLEAN} set to
 * {@code "true"}.
 *
 * <p>Any existing output at the cleaned-data path is deleted first. When a validation raw
 * data path is configured, the same script is submitted a second time (with compression
 * switched off) to produce the cleaned validation data. Optionally the cleaned training
 * data is shuffled afterwards with {@link MapReduceShuffle}.
 *
 * @param isToShuffle
 *            whether to shuffle the cleaned training data after it has been generated
 * @throws IOException
 *             declared for the file-system checks and the shuffle call; an
 *             {@code IOException} raised while running the Pig jobs is rethrown as a
 *             {@code ShifuException} instead
 * @throws ShifuException
 *             with {@code ShifuErrorCode.ERROR_RUNNING_PIG_JOB} if a Pig job fails with an
 *             {@code IOException}
 * @throws RuntimeException
 *             if a Pig job fails with any other throwable, or if the shuffle job fails or
 *             is interrupted
 */
protected void runDataClean(boolean isToShuffle) throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    String cleanedDataPath = this.pathFinder.getCleanedDataPath();

    LOG.info("Start to generate clean data for tree model ... ");

    // Remove stale output so the Pig job can write to a fresh location.
    if (ShifuFileUtils.isFileExists(cleanedDataPath, sourceType)) {
        ShifuFileUtils.deleteFile(cleanedDataPath, sourceType);
    }

    String outputCsvFlag = Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_CSV, Boolean.FALSE.toString());

    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put("sampleRate", modelConfig.getNormalizeSampleRate().toString());
    paramsMap.put("sampleNegOnly", ((Boolean) modelConfig.isNormalizeSampleNegOnly()).toString());
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("is_csv", String.valueOf(Boolean.TRUE.toString().equalsIgnoreCase(outputCsvFlag)));

    try {
        String normPigPath = pathFinder.getScriptPath("scripts/Normalize.pig");

        // cleaned training data
        paramsMap.put(Constants.IS_COMPRESS, "true");
        paramsMap.put(Constants.IS_NORM_FOR_CLEAN, "true");
        paramsMap.put(Constants.PATH_NORMALIZED_DATA, cleanedDataPath);
        PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap, sourceType, this.pathFinder);

        // cleaned validation data
        if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
            String cleanedValidationDataPath = pathFinder.getCleanedValidationDataPath();
            if (ShifuFileUtils.isFileExists(cleanedValidationDataPath, sourceType)) {
                ShifuFileUtils.deleteFile(cleanedValidationDataPath, sourceType);
            }

            // Same parameter map is reused; only the entries that differ are overwritten.
            paramsMap.put(Constants.IS_COMPRESS, "false");
            paramsMap.put(Constants.PATH_RAW_DATA, modelConfig.getValidationDataSetRawPath());
            paramsMap.put(Constants.PATH_NORMALIZED_DATA, cleanedValidationDataPath);
            PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap, sourceType, this.pathFinder);
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }

    if (isToShuffle) {
        MapReduceShuffle shuffler = new MapReduceShuffle(this.modelConfig);
        try {
            shuffler.run(cleanedDataPath);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Fail to shuffle the cleaned data.", e);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe
            // that this thread was asked to stop.
            Thread.currentThread().interrupt();
            throw new RuntimeException("Fail to shuffle the cleaned data.", e);
        }
    }

    LOG.info("Generate clean data for tree model successful.");
}
Use of ml.shifu.shifu.core.shuffle.MapReduceShuffle in project shifu by ShifuML.
From the class NormalizeModelProcessor, method run.
/**
 * Runner for the data normalization step.
 *
 * <p>In {@code DIST}/{@code MAPRED} run mode the data is normalized with Pig, optionally
 * shuffled with {@link MapReduceShuffle}, and, for tree-model algorithms, the cleaned data
 * is generated via {@code runDataClean}. In {@code LOCAL} run mode the Akka-based
 * normalization is used instead.
 *
 * @return {@code 0} when the step finishes, {@code -1} when an exception was caught and
 *         logged
 * @throws Exception
 *             declared by the overridden signature; exceptions raised inside the step are
 *             caught, logged and reported through the return value
 */
@Override
public int run() throws Exception {
    log.info("Step Start: normalize");
    long start = System.currentTimeMillis();
    try {
        setUp(ModelStep.NORMALIZE);
        syncDataToHdfs(modelConfig.getDataSet().getSource());

        switch(modelConfig.getBasic().getRunMode()) {
            case DIST:
            case MAPRED:
                runPigNormalize();

                // The auto check is best-effort: a failure here must not fail normalization.
                try {
                    autoCheckShuffleAndShuffleSize();
                } catch (Exception e) {
                    log.warn("warn: exception in auto check shuffle size, can be ignored as no big impact", e);
                }

                if (this.isToShuffleData) {
                    // shuffling normalized data, to make data random
                    MapReduceShuffle shuffler = new MapReduceShuffle(this.modelConfig);
                    shuffler.run(this.pathFinder.getNormalizedDataPath());
                }

                if (CommonUtils.isTreeModel(modelConfig.getAlgorithm())) {
                    runDataClean(this.isToShuffleData);
                }
                break;
            case LOCAL:
                runAkkaNormalize();
                break;
        }

        syncDataToHdfs(modelConfig.getDataSet().getSource());
        clearUp(ModelStep.NORMALIZE);
    } catch (ShifuException e) {
        log.error("Error:" + e.getError().toString() + "; msg:" + e.getMessage(), e);
        return -1;
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            // The shuffle job can be interrupted; keep the interrupt flag set so the
            // caller can still detect it after we translate the failure to a return code.
            Thread.currentThread().interrupt();
        }
        log.error("Error:" + e.getMessage(), e);
        return -1;
    }
    log.info("Step Finished: normalize with {} ms", (System.currentTimeMillis() - start));
    return 0;
}
Aggregations