
Example 1 with FrameSplitter

use of hex.FrameSplitter in project h2o-3 by h2oai.

the class GBMMissingTest method run.

@Test
public void run() {
    long seed = 1234;
    GBMModel mymodel = null;
    Frame train = null;
    Frame test = null;
    Frame data = null;
    GBMModel.GBMParameters p;
    Log.info("");
    Log.info("STARTING.");
    Log.info("Using seed " + seed);
    StringBuilder sb = new StringBuilder();
    double sumerr = 0;
    Map<Double, Double> map = new TreeMap<>();
    for (double missing_fraction : new double[] { 0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99 }) {
        double err = 0;
        try {
            Scope.enter();
            NFSFileVec nfs = TestUtil.makeNfsFileVec("smalldata/junit/weather.csv");
            data = ParseDataset.parse(Key.make("data.hex"), nfs._key);
            Log.info("FrameSplitting");
            // Create holdout test data on clean data (before adding missing values)
            FrameSplitter fs = new FrameSplitter(data, new double[] { 0.75f }, generateNumKeys(data._key, 2), null);
            H2O.submitTask(fs); //.join();
            Frame[] train_test = fs.getResult();
            train = train_test[0];
            test = train_test[1];
            Log.info("Done...");
            // add missing values to the training data (excluding the response)
            if (missing_fraction > 0) {
                Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
                //exclude the response
                frtmp.remove(frtmp.numCols() - 1);
                //need to put the frame (to be modified) into DKV for MissingInserter to pick up
                DKV.put(frtmp._key, frtmp);
                FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
                //MissingInserter is non-blocking, must block here explicitly
                j.execImpl().get();
                //Delete the frame header (not the data)
                DKV.remove(frtmp._key);
            }
            // Build a regularized GBM model with polluted training data, score on clean validation set
            p = new GBMModel.GBMParameters();
            p._train = train._key;
            p._valid = test._key;
            p._response_column = train._names[train.numCols() - 1];
            //only for weather data
            p._ignored_columns = new String[] { train._names[1], train._names[22] };
            p._seed = seed;
            // Convert response to categorical
            int ri = train.numCols() - 1;
            int ci = test.find(p._response_column);
            Scope.track(train.replace(ri, train.vecs()[ri].toCategoricalVec()));
            Scope.track(test.replace(ci, test.vecs()[ci].toCategoricalVec()));
            DKV.put(train);
            DKV.put(test);
            GBM gbm = new GBM(p);
            Log.info("Starting with " + missing_fraction * 100 + "% missing values added.");
            mymodel = gbm.trainModel().get();
            // Extract the scoring on validation set from the model
            err = ((ModelMetricsBinomial) mymodel._output._validation_metrics).logloss();
            Frame train_preds = mymodel.score(train);
            Assert.assertTrue(mymodel.testJavaScoring(train, train_preds, 1e-15));
            train_preds.remove();
            Log.info("Missing " + missing_fraction * 100 + "% -> logloss: " + err);
        } catch (Throwable t) {
            t.printStackTrace();
            err = 100;
        } finally {
            Scope.exit();
            // cleanup
            if (mymodel != null) {
                mymodel.delete();
            }
            if (train != null)
                train.delete();
            if (test != null)
                test.delete();
            if (data != null)
                data.delete();
        }
        map.put(missing_fraction, err);
        sumerr += err;
    }
    sb.append("missing fraction --> Error\n");
    for (String s : Arrays.toString(map.entrySet().toArray()).split(",")) sb.append(s.replace("=", " --> ")).append("\n");
    sb.append('\n');
    sb.append("Sum Err: ").append(sumerr).append("\n");
    Log.info(sb.toString());
}
Also used: FrameUtils (water.util.FrameUtils), Frame (water.fvec.Frame), NFSFileVec (water.fvec.NFSFileVec), TreeMap (java.util.TreeMap), FrameSplitter (hex.FrameSplitter), Test (org.junit.Test)
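
Stripped of the test scaffolding, the h2o-3 splitting idiom above is quite small. A minimal sketch (the 75/25 ratio and the generateNumKeys helper are taken from the example; the comment on getResult() reflects how the test uses it, not a documented guarantee):

// Split a parsed frame into a 75% training and a 25% holdout slice.
FrameSplitter fs = new FrameSplitter(data,              // parsed source frame
        new double[] { 0.75 },                          // ratios for all but the last split
        generateNumKeys(data._key, 2),                  // destination keys for the two splits
        null);                                          // optional job key
H2O.submitTask(fs);                                     // run the split asynchronously
Frame[] splits = fs.getResult();                        // called right after submitTask in the tests
Frame train = splits[0];                                // ~75% of the rows
Frame test = splits[1];                                 // remaining ~25%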

Example 2 with FrameSplitter

use of hex.FrameSplitter in project h2o-2 by h2oai.

the class Expr2Test method rbindTest.

@Test
public void rbindTest() {
    Key dest1 = Key.make("f1");
    float[] ratios = arf(0.5f);
    Frame[] splits = null;
    File file1 = TestUtil.find_test_file("smalldata/tnc3_10.csv");
    //File file = TestUtil.find_test_file("smalldata/iris/iris_wheader.csv");
    //File file = TestUtil.find_test_file("smalldata/cars.csv");
    Key fkey1 = NFSFileVec.make(file1);
    Frame f = ParseDataset2.parse(dest1, new Key[] { fkey1 });
    FrameSplitter fs = new FrameSplitter(f, ratios);
    H2O.submitTask(fs).join();
    splits = fs.getResult();
    Frame rbinded_frame;
    Env ev = Exec2.exec("rbind(" + splits[0]._key + "," + splits[1]._key + ")");
    try {
        rbinded_frame = ev.popAry();
    } finally {
        if (ev != null)
            ev.remove_and_unlock();
    }
    assertEquals(rbinded_frame.numRows(), f.numRows());
    rbinded_frame.delete();
    Lockable.delete(dest1);
    for (Frame s : splits) if (s != null)
        s.delete();
}
Also used: Frame (water.fvec.Frame), FrameSplitter (hex.FrameSplitter), File (java.io.File), Key (water.Key), Test (org.junit.Test)

Example 3 with FrameSplitter

use of hex.FrameSplitter in project h2o-3 by h2oai.

the class DeepLearningMissingTest method run.

@Test
public void run() {
    long seed = 1234;
    DeepLearningModel mymodel = null;
    Frame train = null;
    Frame test = null;
    Frame data = null;
    DeepLearningParameters p;
    Log.info("");
    Log.info("STARTING.");
    Log.info("Using seed " + seed);
    Map<DeepLearningParameters.MissingValuesHandling, Double> sumErr = new TreeMap<>();
    StringBuilder sb = new StringBuilder();
    for (DeepLearningParameters.MissingValuesHandling mvh : new DeepLearningParameters.MissingValuesHandling[] { DeepLearningParameters.MissingValuesHandling.MeanImputation, DeepLearningParameters.MissingValuesHandling.Skip }) {
        double sumloss = 0;
        Map<Double, Double> map = new TreeMap<>();
        for (double missing_fraction : new double[] { 0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99 }) {
            double loss = 0;
            try {
                Scope.enter();
                NFSFileVec nfs = NFSFileVec.make("smalldata/junit/weather.csv");
                data = ParseDataset.parse(Key.make("data.hex"), nfs._key);
                Log.info("FrameSplitting");
                // Create holdout test data on clean data (before adding missing values)
                FrameSplitter fs = new FrameSplitter(data, new double[] { 0.75f }, generateNumKeys(data._key, 2), null);
                H2O.submitTask(fs); //.join();
                Frame[] train_test = fs.getResult();
                train = train_test[0];
                test = train_test[1];
                Log.info("Done...");
                // add missing values to the training data (excluding the response)
                if (missing_fraction > 0) {
                    Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
                    //exclude the response
                    frtmp.remove(frtmp.numCols() - 1);
                    //need to put the frame (to be modified) into DKV for MissingInserter to pick up
                    DKV.put(frtmp._key, frtmp);
                    FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
                    //MissingInserter is non-blocking, must block here explicitly
                    j.execImpl().get();
                    //Delete the frame header (not the data)
                    DKV.remove(frtmp._key);
                }
                // Build a regularized DL model with polluted training data, score on clean validation set
                p = new DeepLearningParameters();
                p._train = train._key;
                p._valid = test._key;
                p._response_column = train._names[train.numCols() - 1];
                //only for weather data
                p._ignored_columns = new String[] { train._names[1], train._names[22] };
                p._missing_values_handling = mvh;
                // DeepLearningParameters.Loss.ModifiedHuber;
                p._loss = DeepLearningParameters.Loss.CrossEntropy;
                p._activation = DeepLearningParameters.Activation.Rectifier;
                p._hidden = new int[] { 50, 50 };
                p._l1 = 1e-5;
                p._input_dropout_ratio = 0.2;
                p._epochs = 3;
                p._reproducible = true;
                p._seed = seed;
                p._elastic_averaging = false;
                // Convert response to categorical
                int ri = train.numCols() - 1;
                int ci = test.find(p._response_column);
                Scope.track(train.replace(ri, train.vecs()[ri].toCategoricalVec()));
                Scope.track(test.replace(ci, test.vecs()[ci].toCategoricalVec()));
                DKV.put(train);
                DKV.put(test);
                DeepLearning dl = new DeepLearning(p);
                Log.info("Starting with " + missing_fraction * 100 + "% missing values added.");
                mymodel = dl.trainModel().get();
                // Extract the scoring on validation set from the model
                loss = mymodel.loss();
                Log.info("Missing " + missing_fraction * 100 + "% -> logloss: " + loss);
            } catch (Throwable t) {
                t.printStackTrace();
                loss = 100;
            } finally {
                Scope.exit();
                // cleanup
                if (mymodel != null) {
                    mymodel.delete();
                }
                if (train != null)
                    train.delete();
                if (test != null)
                    test.delete();
                if (data != null)
                    data.delete();
            }
            map.put(missing_fraction, loss);
            sumloss += loss;
        }
        sb.append("\nMethod: ").append(mvh.toString()).append("\n");
        sb.append("missing fraction --> loss\n");
        for (String s : Arrays.toString(map.entrySet().toArray()).split(",")) sb.append(s.replace("=", " --> ")).append("\n");
        sb.append('\n');
        sb.append("sum loss: ").append(sumloss).append("\n");
        sumErr.put(mvh, sumloss);
    }
    Log.info(sb.toString());
    Assert.assertEquals(405.5017, sumErr.get(DeepLearningParameters.MissingValuesHandling.Skip), 1e-2);
    Assert.assertEquals(3.914915, sumErr.get(DeepLearningParameters.MissingValuesHandling.MeanImputation), 1e-3);
}
Also used: FrameUtils (water.util.FrameUtils), Frame (water.fvec.Frame), NFSFileVec (water.fvec.NFSFileVec), DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters), FrameSplitter (hex.FrameSplitter), Test (org.junit.Test)
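
Examples 1 and 3 share the same idiom for adding NAs to the predictors only: wrap the training vecs in a throwaway Frame header, drop the response column, publish the header to the DKV so the MapReduce task can find it, run the inserter, then delete only the header. A minimal sketch of that idiom (names and calls copied from the examples; the 0.25 fraction is illustrative):

Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs()); // shares train's vecs
frtmp.remove(frtmp.numCols() - 1);            // exclude the response column from pollution
DKV.put(frtmp._key, frtmp);                   // MissingInserter picks the frame up by key
FrameUtils.MissingInserter mi = new FrameUtils.MissingInserter(frtmp._key, seed, 0.25);
mi.execImpl().get();                          // execImpl() is non-blocking, so block on get()
DKV.remove(frtmp._key);                       // delete the header only; train's data stays intact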

Example 4 with FrameSplitter

use of hex.FrameSplitter in project h2o-3 by h2oai.

the class DeepWaterAbstractIntegrationTest method testCheckpointOverwriteWithBestModel2.

// Check that the restarted model honors the previous model as a best model so far
@Test
public void testCheckpointOverwriteWithBestModel2() {
    Frame tfr = null;
    DeepWaterModel dl = null;
    DeepWaterModel dl2 = null;
    Frame train = null, valid = null;
    try {
        tfr = parse_test_file("./smalldata/iris/iris.csv");
        FrameSplitter fs = new FrameSplitter(tfr, new double[] { 0.8 }, new Key[] { Key.make("train"), Key.make("valid") }, null);
        fs.compute2();
        train = fs.getResult()[0];
        valid = fs.getResult()[1];
        DeepWaterParameters parms = new DeepWaterParameters();
        parms._backend = getBackend();
        parms._train = train._key;
        parms._valid = valid._key;
        parms._epochs = 10;
        parms._response_column = "C5";
        parms._hidden = new int[] { 50, 50 };
        parms._seed = 0xdecaf;
        parms._train_samples_per_iteration = 0;
        parms._score_duty_cycle = 1;
        parms._score_interval = 0;
        parms._stopping_rounds = 0;
        parms._overwrite_with_best_model = true;
        dl = new DeepWater(parms).trainModel().get();
        double ll1 = ((ModelMetricsMultinomial) dl._output._validation_metrics).logloss();
        DeepWaterParameters parms2 = (DeepWaterParameters) parms.clone();
        parms2._epochs = 20;
        parms2._checkpoint = dl._key;
        dl2 = new DeepWater(parms2).trainModel().get();
        double ll2 = ((ModelMetricsMultinomial) dl2._output._validation_metrics).logloss();
        Assert.assertTrue(ll2 <= ll1);
    } finally {
        if (tfr != null)
            tfr.delete();
        if (dl != null)
            dl.delete();
        if (dl2 != null)
            dl2.delete();
        if (train != null)
            train.delete();
        if (valid != null)
            valid.delete();
    }
}
Also used: Frame (water.fvec.Frame), ShuffleSplitFrame (hex.splitframe.ShuffleSplitFrame), FrameSplitter (hex.FrameSplitter), ModelMetricsMultinomial (hex.ModelMetricsMultinomial)
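
Taken together, the examples so far drive FrameSplitter in three different ways; the comments below are my reading of the snippets, not documented guarantees:

// Examples 1 and 3 (h2o-3 tests): submit asynchronously, then collect via getResult().
H2O.submitTask(fs);
Frame[] splits = fs.getResult();

// Example 2 (h2o-2 Expr2Test): block explicitly on the submitted task before collecting.
H2O.submitTask(fs).join();
splits = fs.getResult();

// Example 4 (DeepWaterAbstractIntegrationTest): invoke the task body directly on the calling thread.
fs.compute2();
splits = fs.getResult();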

Example 5 with FrameSplitter

use of hex.FrameSplitter in project h2o-2 by h2oai.

the class FrameSplitPage method execImpl.

// Run the function
@Override
protected void execImpl() {
    Frame frame = source;
    if (shuffle) {
        // FIXME: switch to global shuffle
        frame = MRUtils.shuffleFramePerChunk(Utils.generateShuffledKey(frame._key), frame, seed);
        // save frame to DKV
        frame.delete_and_lock(null).unlock(null);
        // delete frame on the end
        gtrash(frame);
    }
    FrameSplitter fs = new FrameSplitter(frame, ratios);
    H2O.submitTask(fs);
    Frame[] splits = fs.getResult();
    split_keys = new Key[splits.length];
    split_rows = new long[splits.length];
    float rsum = Utils.sum(ratios);
    split_ratios = Arrays.copyOf(ratios, splits.length);
    split_ratios[splits.length - 1] = 1f - rsum;
    long sum = 0;
    for (int i = 0; i < splits.length; i++) {
        sum += splits[i].numRows();
        split_keys[i] = splits[i]._key;
        split_rows[i] = splits[i].numRows();
    }
    assert sum == source.numRows() : "Frame split produced wrong number of rows: nrows(source) != sum(nrows(splits))";
}
Also used: Frame (water.fvec.Frame), FrameSplitter (hex.FrameSplitter)
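
The ratio bookkeeping above works because the splitter is handed only the first n-1 ratios but produces n splits, so the page reconstructs the implicit last ratio as 1 - sum(ratios). A hypothetical call, just to make the arithmetic concrete:

float[] ratios = { 0.25f, 0.50f };  // caller passes two ratios...
FrameSplitter fs = new FrameSplitter(frame, ratios);
H2O.submitTask(fs);
Frame[] splits = fs.getResult();    // ...and gets three splits back
// split_ratios becomes { 0.25, 0.50, 0.25 }: the last entry is 1f - (0.25f + 0.50f)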

Aggregations

FrameSplitter (hex.FrameSplitter): 6
Frame (water.fvec.Frame): 6
Test (org.junit.Test): 3
ModelMetricsMultinomial (hex.ModelMetricsMultinomial): 2
ShuffleSplitFrame (hex.splitframe.ShuffleSplitFrame): 2
NFSFileVec (water.fvec.NFSFileVec): 2
FrameUtils (water.util.FrameUtils): 2
DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters): 1
File (java.io.File): 1
TreeMap (java.util.TreeMap): 1
Key (water.Key): 1