Use of ml.shifu.shifu.core.dtrain.dataset.BufferedFloatMLDataSet in project shifu by ShifuML.
The class AbstractNNWorker, method initDiskDataSet:
/**
 * For the disk data set, initialize it with parameters and do the related work of creating files.
 *
 * @throws IOException
 *             if any exception occurs on local fs operations.
 * @throws RuntimeException
 *             if there is an error deleting the testing or training file.
 */
private void initDiskDataSet() throws IOException {
    Path trainingFile = DTrainUtils.getTrainingFile();
    Path testingFile = DTrainUtils.getTestingFile();
    LOG.debug("Use disk to store training data and testing data. Training data file:{}; Testing data file:{} ",
            trainingFile.toString(), testingFile.toString());
    this.trainingData = new BufferedFloatMLDataSet(new File(trainingFile.toString()));
    ((BufferedFloatMLDataSet) this.trainingData).beginLoad(this.featureInputsCnt, getOutputNodeCount());
    this.validationData = new BufferedFloatMLDataSet(new File(testingFile.toString()));
    ((BufferedFloatMLDataSet) this.validationData).beginLoad(this.featureInputsCnt, getOutputNodeCount());
}
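Note that initDiskDataSet() only opens the two files and calls beginLoad(...); rows are appended elsewhere and the load is finalized with endLoad(). Below is a minimal sketch of that full load cycle, assuming the Encog-style BasicFloatMLData/BasicFloatMLDataPair companions in the same dataset package (the file name and rows variable are illustrative only):

// Minimal sketch of the beginLoad/add/endLoad cycle; not taken from the worker itself.
BufferedFloatMLDataSet dataSet = new BufferedFloatMLDataSet(new File("train.egb"));
dataSet.beginLoad(featureInputsCnt, outputNodeCount);
for (float[][] row : rows) {
    // row[0] holds the normalized inputs, row[1] the ideal (target) values
    dataSet.add(new BasicFloatMLDataPair(new BasicFloatMLData(row[0]), new BasicFloatMLData(row[1])));
}
dataSet.endLoad(); // flush the buffer and finalize the on-disk file; close() is deferred, as this worker does via a shutdown hook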
Use of ml.shifu.shifu.core.dtrain.dataset.BufferedFloatMLDataSet in project shifu by ShifuML.
The class AbstractNNWorker, method init:
@Override
public void init(WorkerContext<NNParams, NNParams> context) {
    // load props firstly
    this.props = context.getProps();
    loadConfigFiles(context.getProps());
    this.trainerId = Integer.valueOf(context.getProps().getProperty(CommonConstants.SHIFU_TRAINER_ID, "0"));
    GridSearch gs = new GridSearch(modelConfig.getTrain().getParams(),
            modelConfig.getTrain().getGridConfigFileContent());
    this.validParams = this.modelConfig.getTrain().getParams();
    if (gs.hasHyperParam()) {
        this.validParams = gs.getParams(trainerId);
        LOG.info("Start grid search master with params: {}", validParams);
    }
    Integer kCrossValidation = this.modelConfig.getTrain().getNumKFold();
    if (kCrossValidation != null && kCrossValidation > 0) {
        isKFoldCV = true;
        LOG.info("Cross validation is enabled by kCrossValidation: {}.", kCrossValidation);
    }
    this.poissonSampler = Boolean.TRUE.toString()
            .equalsIgnoreCase(context.getProps().getProperty(NNConstants.NN_POISON_SAMPLER));
    this.rng = new PoissonDistribution(1.0d);
    Double upSampleWeight = modelConfig.getTrain().getUpSampleWeight();
    if (Double.compare(upSampleWeight, 1d) != 0 && (modelConfig.isRegression()
            || (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()))) {
        // set mean to upSampleWeight - 1 and add 1 to each sample to make sure there is no zero sample value
        LOG.info("Enable up sampling with weight {}.", upSampleWeight);
        this.upSampleRng = new PoissonDistribution(upSampleWeight - 1);
    }
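    // Illustrative example (not in the original source): with upSampleWeight = 3.0, upSampleRng is
    // Poisson(mean 2.0), so sampling it and adding 1 duplicates each positive record 3 times on
    // average, and never zero times.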
    Integer epochsPerIterationInteger = this.modelConfig.getTrain().getEpochsPerIteration();
    this.epochsPerIteration = epochsPerIterationInteger == null ? 1 : epochsPerIterationInteger.intValue();
    LOG.info("epochsPerIteration in worker is :{}", epochsPerIteration);

    // Object elmObject = validParams.get(DTrainUtils.IS_ELM);
    // isELM = elmObject == null ? false : "true".equalsIgnoreCase(elmObject.toString());
    // LOG.info("Check isELM: {}", isELM);

    Object dropoutRateObj = validParams.get(CommonConstants.DROPOUT_RATE);
    if (dropoutRateObj != null) {
        this.dropoutRate = Double.valueOf(dropoutRateObj.toString());
    }
    LOG.info("'dropoutRate' in worker is :{}", this.dropoutRate);
    Object miniBatchO = validParams.get(CommonConstants.MINI_BATCH);
    if (miniBatchO != null) {
        int miniBatchs;
        try {
            miniBatchs = Integer.parseInt(miniBatchO.toString());
        } catch (Exception e) {
            miniBatchs = 1;
        }
        if (miniBatchs < 0) {
            this.batchs = 1;
        } else if (miniBatchs > 1000) {
            this.batchs = 1000;
        } else {
            this.batchs = miniBatchs;
        }
        LOG.info("'miniBatchs' in worker is : {}, batchs is {} ", miniBatchs, batchs);
    }
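    // Illustrative values (not in the original source): miniBatchs = -5 -> batchs = 1;
    // miniBatchs = 64 -> batchs = 64; miniBatchs = 5000 -> batchs = 1000 (clamped).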
    int[] inputOutputIndex = DTrainUtils.getInputOutputCandidateCounts(modelConfig.getNormalizeType(),
            this.columnConfigList);
    this.inputNodeCount = inputOutputIndex[0] == 0 ? inputOutputIndex[2] : inputOutputIndex[0];
    // if it is one-vs-all classification, outputNodeCount is set to 1; if classes == 2, outputNodeCount is also 1
    int classes = modelConfig.getTags().size();
    this.outputNodeCount = (isLinearTarget || modelConfig.isRegression()) ? inputOutputIndex[1]
            : (modelConfig.getTrain().isOneVsAll() ? inputOutputIndex[1] : (classes == 2 ? 1 : classes));
    this.candidateCount = inputOutputIndex[2];
    boolean isAfterVarSelect = inputOutputIndex[0] != 0;
    LOG.info("isAfterVarSelect {}: Input count {}, output count {}, candidate count {}", isAfterVarSelect,
            inputNodeCount, outputNodeCount, candidateCount);
    // cache all feature list for sampling features
    this.allFeatures = NormalUtils.getAllFeatureList(columnConfigList, isAfterVarSelect);
    String subsetStr = context.getProps().getProperty(CommonConstants.SHIFU_NN_FEATURE_SUBSET);
    if (StringUtils.isBlank(subsetStr)) {
        this.subFeatures = this.allFeatures;
    } else {
        String[] splits = subsetStr.split(",");
        this.subFeatures = new ArrayList<Integer>(splits.length);
        for (String split : splits) {
            int featureIndex = Integer.parseInt(split);
            this.subFeatures.add(featureIndex);
        }
    }
    this.subFeatureSet = new HashSet<Integer>(this.subFeatures);
    LOG.info("subFeatures size is {}", subFeatures.size());
    this.featureInputsCnt = DTrainUtils.getFeatureInputsCnt(this.modelConfig, this.columnConfigList,
            this.subFeatureSet);
    this.wgtInit = "default";
    Object wgtInitObj = validParams.get(CommonConstants.WEIGHT_INITIALIZER);
    if (wgtInitObj != null) {
        this.wgtInit = wgtInitObj.toString();
    }

    Object lossObj = validParams.get("Loss");
    this.lossStr = lossObj != null ? lossObj.toString() : "squared";
    LOG.info("Loss str is {}", this.lossStr);

    this.isDry = Boolean.TRUE.toString()
            .equalsIgnoreCase(context.getProps().getProperty(CommonConstants.SHIFU_DRY_DTRAIN));
    this.isSpecificValidation = (modelConfig.getValidationDataSetRawPath() != null
            && !"".equals(modelConfig.getValidationDataSetRawPath()));
    this.isStratifiedSampling = this.modelConfig.getTrain().getStratifiedSample();
    if (isOnDisk()) {
        LOG.info("NNWorker is loading data onto disk.");
        try {
            initDiskDataSet();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // cannot find a good place to close these two data sets, so use a shutdown hook
        Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
            @Override
            public void run() {
                ((BufferedFloatMLDataSet) (AbstractNNWorker.this.trainingData)).close();
                ((BufferedFloatMLDataSet) (AbstractNNWorker.this.validationData)).close();
            }
        }));
    } else {
        LOG.info("NNWorker is loading data into memory.");
        double memoryFraction = Double.valueOf(context.getProps().getProperty("guagua.data.memoryFraction", "0.6"));
        long memoryStoreSize = (long) (Runtime.getRuntime().maxMemory() * memoryFraction);
        LOG.info("Max heap memory: {}, fraction: {}", Runtime.getRuntime().maxMemory(), memoryFraction);
        double crossValidationRate = this.modelConfig.getValidSetRate();
        try {
            if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
                // fixed 0.6 and 0.4 of max memory for trainingData and validationData
                this.trainingData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * 0.6),
                        DTrainUtils.getTrainingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
                this.validationData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * 0.4),
                        DTrainUtils.getTestingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
            } else {
                this.trainingData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * (1 - crossValidationRate)),
                        DTrainUtils.getTrainingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
                this.validationData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * crossValidationRate),
                        DTrainUtils.getTestingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
            }
            // cannot find a good place to close these two data sets, so use a shutdown hook
            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
                @Override
                public void run() {
                    ((MemoryDiskFloatMLDataSet) (AbstractNNWorker.this.trainingData)).close();
                    ((MemoryDiskFloatMLDataSet) (AbstractNNWorker.this.validationData)).close();
                }
            }));
        } catch (IOException e) {
            throw new GuaguaRuntimeException(e);
        }
    }
    // create Splitter
    String delimiter = context.getProps().getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER);
    this.splitter = MapReduceUtils.generateShifuOutputSplitter(delimiter);
}
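The Poisson samplers and float data sets initialized above are consumed later, when the worker loads each record. The following is a minimal, hypothetical sketch of that per-record step; the helper name addRecord and its signature are assumptions for illustration, not the actual AbstractNNWorker API. Bagging draws a copy count from rng (mean 1), up-sampling adds upSampleRng.sample() + 1 copies of positive records, and pairs are appended to trainingData or validationData:

// Hypothetical helper, shown only to illustrate how the fields set up in init() interact.
private void addRecord(float[] inputs, float[] ideal, boolean isPositive, boolean isValidation) {
    FloatMLDataPair pair = new BasicFloatMLDataPair(new BasicFloatMLData(inputs),
            new BasicFloatMLData(ideal));
    if (isValidation) {
        this.validationData.add(pair);
        return;
    }
    // Poisson bagging: each training record is taken 0..n times, with mean 1
    long copies = this.poissonSampler ? this.rng.sample() : 1L;
    if (this.upSampleRng != null && isPositive) {
        // Poisson(upSampleWeight - 1) + 1 guarantees at least one copy of each positive record
        copies = this.upSampleRng.sample() + 1L;
    }
    for (long i = 0; i < copies; i++) {
        this.trainingData.add(pair);
    }
}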