
Example 1 with BufferedFloatMLDataSet

This example shows how ml.shifu.shifu.core.dtrain.dataset.BufferedFloatMLDataSet is used in project shifu by ShifuML.

In class AbstractNNWorker, method initDiskDataSet:

/**
 * For a disk data set, initialize it with parameters and do the related work of creating files.
 *
 * @throws IOException
 *             if any exception occurs on local fs operations.
 * @throws RuntimeException
 *             if an error occurs when deleting the testing or training file.
 */
private void initDiskDataSet() throws IOException {
    Path trainingFile = DTrainUtils.getTrainingFile();
    Path testingFile = DTrainUtils.getTestingFile();
    LOG.debug("Use disk to store training data and testing data. Training data file:{}; Testing data file:{} ", trainingFile.toString(), testingFile.toString());
    this.trainingData = new BufferedFloatMLDataSet(new File(trainingFile.toString()));
    ((BufferedFloatMLDataSet) this.trainingData).beginLoad(this.featureInputsCnt, getOutputNodeCount());
    this.validationData = new BufferedFloatMLDataSet(new File(testingFile.toString()));
    ((BufferedFloatMLDataSet) this.validationData).beginLoad(this.featureInputsCnt, getOutputNodeCount());
}
Also used: Path (org.apache.hadoop.fs.Path), File (java.io.File), BufferedFloatMLDataSet (ml.shifu.shifu.core.dtrain.dataset.BufferedFloatMLDataSet)

Example 2 with BufferedFloatMLDataSet

This example shows how ml.shifu.shifu.core.dtrain.dataset.BufferedFloatMLDataSet is used in project shifu by ShifuML.

In class AbstractNNWorker, method init:

@Override
public void init(WorkerContext<NNParams, NNParams> context) {
    // load props first
    this.props = context.getProps();
    loadConfigFiles(context.getProps());
    this.trainerId = Integer.valueOf(context.getProps().getProperty(CommonConstants.SHIFU_TRAINER_ID, "0"));
    GridSearch gs = new GridSearch(modelConfig.getTrain().getParams(), modelConfig.getTrain().getGridConfigFileContent());
    this.validParams = this.modelConfig.getTrain().getParams();
    if (gs.hasHyperParam()) {
        this.validParams = gs.getParams(trainerId);
        LOG.info("Start grid search master with params: {}", validParams);
    }
    Integer kCrossValidation = this.modelConfig.getTrain().getNumKFold();
    if (kCrossValidation != null && kCrossValidation > 0) {
        isKFoldCV = true;
        LOG.info("Cross validation is enabled by kCrossValidation: {}.", kCrossValidation);
    }
    this.poissonSampler = Boolean.TRUE.toString().equalsIgnoreCase(context.getProps().getProperty(NNConstants.NN_POISON_SAMPLER));
    this.rng = new PoissonDistribution(1.0d);
    Double upSampleWeight = modelConfig.getTrain().getUpSampleWeight();
    if (Double.compare(upSampleWeight, 1d) != 0 && (modelConfig.isRegression() || (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()))) {
        // set mean to upSampleWeight - 1 and add 1 to each sample so that the sample value is never zero
        LOG.info("Enable up sampling with weight {}.", upSampleWeight);
        this.upSampleRng = new PoissonDistribution(upSampleWeight - 1);
    }
    Integer epochsPerIterationInteger = this.modelConfig.getTrain().getEpochsPerIteration();
    this.epochsPerIteration = epochsPerIterationInteger == null ? 1 : epochsPerIterationInteger.intValue();
    LOG.info("epochsPerIteration in worker is :{}", epochsPerIteration);
    // Object elmObject = validParams.get(DTrainUtils.IS_ELM);
    // isELM = elmObject == null ? false : "true".equalsIgnoreCase(elmObject.toString());
    // LOG.info("Check isELM: {}", isELM);
    Object dropoutRateObj = validParams.get(CommonConstants.DROPOUT_RATE);
    if (dropoutRateObj != null) {
        this.dropoutRate = Double.valueOf(dropoutRateObj.toString());
    }
    LOG.info("'dropoutRate' in worker is :{}", this.dropoutRate);
    Object miniBatchO = validParams.get(CommonConstants.MINI_BATCH);
    if (miniBatchO != null) {
        int miniBatchs;
        try {
            miniBatchs = Integer.parseInt(miniBatchO.toString());
        } catch (Exception e) {
            miniBatchs = 1;
        }
        if (miniBatchs < 0) {
            this.batchs = 1;
        } else if (miniBatchs > 1000) {
            this.batchs = 1000;
        } else {
            this.batchs = miniBatchs;
        }
        LOG.info("'miniBatchs' in worker is : {}, batchs is {} ", miniBatchs, batchs);
    }
    int[] inputOutputIndex = DTrainUtils.getInputOutputCandidateCounts(modelConfig.getNormalizeType(), this.columnConfigList);
    this.inputNodeCount = inputOutputIndex[0] == 0 ? inputOutputIndex[2] : inputOutputIndex[0];
    // for one-vs-all classification, outputNodeCount is set to 1; when classes == 2, outputNodeCount is also 1
    int classes = modelConfig.getTags().size();
    this.outputNodeCount = (isLinearTarget || modelConfig.isRegression()) ? inputOutputIndex[1] : (modelConfig.getTrain().isOneVsAll() ? inputOutputIndex[1] : (classes == 2 ? 1 : classes));
    this.candidateCount = inputOutputIndex[2];
    boolean isAfterVarSelect = inputOutputIndex[0] != 0;
    LOG.info("isAfterVarSelect {}: Input count {}, output count {}, candidate count {}", isAfterVarSelect, inputNodeCount, outputNodeCount, candidateCount);
    // cache all feature list for sampling features
    this.allFeatures = NormalUtils.getAllFeatureList(columnConfigList, isAfterVarSelect);
    String subsetStr = context.getProps().getProperty(CommonConstants.SHIFU_NN_FEATURE_SUBSET);
    if (StringUtils.isBlank(subsetStr)) {
        this.subFeatures = this.allFeatures;
    } else {
        String[] splits = subsetStr.split(",");
        this.subFeatures = new ArrayList<Integer>(splits.length);
        for (String split : splits) {
            int featureIndex = Integer.parseInt(split);
            this.subFeatures.add(featureIndex);
        }
    }
    this.subFeatureSet = new HashSet<Integer>(this.subFeatures);
    LOG.info("subFeatures size is {}", subFeatures.size());
    this.featureInputsCnt = DTrainUtils.getFeatureInputsCnt(this.modelConfig, this.columnConfigList, this.subFeatureSet);
    this.wgtInit = "default";
    Object wgtInitObj = validParams.get(CommonConstants.WEIGHT_INITIALIZER);
    if (wgtInitObj != null) {
        this.wgtInit = wgtInitObj.toString();
    }
    Object lossObj = validParams.get("Loss");
    this.lossStr = lossObj != null ? lossObj.toString() : "squared";
    LOG.info("Loss str is {}", this.lossStr);
    this.isDry = Boolean.TRUE.toString().equalsIgnoreCase(context.getProps().getProperty(CommonConstants.SHIFU_DRY_DTRAIN));
    this.isSpecificValidation = (modelConfig.getValidationDataSetRawPath() != null && !"".equals(modelConfig.getValidationDataSetRawPath()));
    this.isStratifiedSampling = this.modelConfig.getTrain().getStratifiedSample();
    if (isOnDisk()) {
        LOG.info("NNWorker is loading data into disk.");
        try {
            initDiskDataSet();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // cannot find a good place to close these two data sets, so use a shutdown hook
        Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {

            @Override
            public void run() {
                ((BufferedFloatMLDataSet) (AbstractNNWorker.this.trainingData)).close();
                ((BufferedFloatMLDataSet) (AbstractNNWorker.this.validationData)).close();
            }
        }));
    } else {
        LOG.info("NNWorker is loading data into memory.");
        double memoryFraction = Double.valueOf(context.getProps().getProperty("guagua.data.memoryFraction", "0.6"));
        long memoryStoreSize = (long) (Runtime.getRuntime().maxMemory() * memoryFraction);
        LOG.info("Max heap memory: {}, fraction: {}", Runtime.getRuntime().maxMemory(), memoryFraction);
        double crossValidationRate = this.modelConfig.getValidSetRate();
        try {
            if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
                // fixed 0.6 / 0.4 split of the memory store between trainingData and validationData
                this.trainingData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * 0.6), DTrainUtils.getTrainingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
                this.validationData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * 0.4), DTrainUtils.getTestingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
            } else {
                this.trainingData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * (1 - crossValidationRate)), DTrainUtils.getTrainingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
                this.validationData = new MemoryDiskFloatMLDataSet((long) (memoryStoreSize * crossValidationRate), DTrainUtils.getTestingFile().toString(), this.featureInputsCnt, this.outputNodeCount);
            }
            // cannot find a good place to close these two data sets, so use a shutdown hook
            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {

                @Override
                public void run() {
                    ((MemoryDiskFloatMLDataSet) (AbstractNNWorker.this.trainingData)).close();
                    ((MemoryDiskFloatMLDataSet) (AbstractNNWorker.this.validationData)).close();
                }
            }));
        } catch (IOException e) {
            throw new GuaguaRuntimeException(e);
        }
    }
    // create Splitter
    String delimiter = context.getProps().getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER);
    this.splitter = MapReduceUtils.generateShifuOutputSplitter(delimiter);
}
Also used: PoissonDistribution (org.apache.commons.math3.distribution.PoissonDistribution), MemoryDiskFloatMLDataSet (ml.shifu.shifu.core.dtrain.dataset.MemoryDiskFloatMLDataSet), IOException (java.io.IOException), GuaguaRuntimeException (ml.shifu.guagua.GuaguaRuntimeException), GridSearch (ml.shifu.shifu.core.dtrain.gs.GridSearch), BufferedFloatMLDataSet (ml.shifu.shifu.core.dtrain.dataset.BufferedFloatMLDataSet)

Aggregations

BufferedFloatMLDataSet (ml.shifu.shifu.core.dtrain.dataset.BufferedFloatMLDataSet) 2
File (java.io.File) 1
IOException (java.io.IOException) 1
GuaguaRuntimeException (ml.shifu.guagua.GuaguaRuntimeException) 1
MemoryDiskFloatMLDataSet (ml.shifu.shifu.core.dtrain.dataset.MemoryDiskFloatMLDataSet) 1
GridSearch (ml.shifu.shifu.core.dtrain.gs.GridSearch) 1
PoissonDistribution (org.apache.commons.math3.distribution.PoissonDistribution) 1
Path (org.apache.hadoop.fs.Path) 1