Search in sources :

Example 1 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class GBMTest method testReprodubilityAirlineSingleNode.

/**
 * Trains {@code N} GBM models with identical parameters and seed on the airlines
 * data (rebalanced to 256 chunks, with a fixed list of columns dropped) and checks
 * that the last scored training MSE of every run matches the pinned reference value.
 */
@Test
public void testReprodubilityAirlineSingleNode() {
    Frame tfr = null;
    final int N = 10;
    double[] mses = new double[N];
    Scope.enter();
    try {
        // Load data, hack frames
        tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
        // rebalance to fixed number of chunks
        Key dest = Key.make("df.rebalanced.hex");
        RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
        H2O.submitTask(rb);
        rb.join();
        // The original frame is no longer needed; continue with the rebalanced copy.
        tfr.delete();
        tfr = DKV.get(dest).get();
        for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
            tfr.remove(s).remove();
        }
        // Publish the frame again after dropping the columns above.
        DKV.put(tfr);
        for (int i = 0; i < N; ++i) {
            GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
            parms._train = tfr._key;
            parms._response_column = "IsDepDelayed";
            parms._nbins = 10;
            parms._nbins_cats = 500;
            parms._ntrees = 7;
            parms._max_depth = 5;
            parms._min_rows = 10;
            parms._distribution = DistributionFamily.bernoulli;
            parms._balance_classes = true;
            parms._seed = 0;
            parms._build_tree_one_node = true;
            // Every iteration builds the same model; all results should be equal
            GBMModel gbm = new GBM(parms).trainModel().get();
            try {
                // JUnit order is (expected, actual)
                assertEquals(parms._ntrees, gbm._output._ntrees);
                mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
            } finally {
                // Delete the model even when the assertion above fails.
                gbm.delete();
            }
        }
    } finally {
        if (tfr != null) {
            tfr.remove();
        }
        // Always leave the scope, including when training or an assertion throws.
        Scope.exit();
    }
    System.out.println("MSE");
    for (double d : mses) {
        System.out.println(d);
    }
    // check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
    for (double mse : mses) {
        assertEquals(0.21694215729861027, mse, 1e-8);
    }
}
Also used : Frame(water.fvec.Frame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Example 2 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class AggregatorTest method testChunks.

/**
 * Trains a baseline Aggregator model on the covtype sample, then for several chunk
 * counts rebalances the frame, trains again with the same parameters and asserts
 * that the number of exemplars equals the baseline.
 */
@Test
public void testChunks() {
    Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");
    try {
        AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
        parms._train = frame._key;
        parms._target_num_exemplars = 137;
        parms._rel_tol_num_exemplars = 0.05;
        long start = System.currentTimeMillis();
        // 0.418
        AggregatorModel agg = new Aggregator(parms).trainModel().get();
        // Captured before the model is removed so the loop below does not read a removed model.
        final int baselineExemplars;
        try {
            System.out.println("AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds");
            agg.checkConsistency();
            checkNumExemplars(agg);
            baselineExemplars = agg._exemplars.length;
        } finally {
            Frame output = agg._output._output_frame.get();
            if (output != null) {
                output.remove();
            }
            agg.remove();
        }
        for (int i : new int[] { 1, 2, 5, 10, 50, 100 }) {
            Key key = Key.make();
            RebalanceDataSet rb = new RebalanceDataSet(frame, key, i);
            H2O.submitTask(rb);
            rb.join();
            Frame rebalanced = DKV.get(key).get();
            AggregatorModel agg2 = null;
            try {
                parms = new AggregatorModel.AggregatorParameters();
                // NOTE(review): this trains on the original frame, not on 'rebalanced', so the
                // chunk count i never reaches the model. Using rebalanced._key looks like the
                // intent, but it may change the exemplar count - confirm before switching.
                parms._train = frame._key;
                parms._target_num_exemplars = 137;
                parms._rel_tol_num_exemplars = 0.05;
                start = System.currentTimeMillis();
                // 0.373 0.504 0.357 0.454 0.368 0.355
                agg2 = new Aggregator(parms).trainModel().get();
                System.out.println("AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds");
                agg2.checkConsistency();
                Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length);
                // assertEquals reports both values on failure, unlike assertTrue(diff == 0).
                Assert.assertEquals(baselineExemplars, agg2._exemplars.length);
                // Check the model trained in this iteration, not the baseline again.
                checkNumExemplars(agg2);
            } finally {
                rebalanced.delete();
                if (agg2 != null) {
                    Frame output2 = agg2._output._output_frame.get();
                    if (output2 != null) {
                        output2.remove();
                    }
                    agg2.remove();
                }
            }
        }
    } finally {
        // Release the parsed frame even when a check above fails.
        frame.delete();
    }
}
Also used : CreateFrame(hex.CreateFrame) Frame(water.fvec.Frame) RebalanceDataSet(water.fvec.RebalanceDataSet) Aggregator(hex.aggregator.Aggregator) AggregatorModel(hex.aggregator.AggregatorModel) Test(org.junit.Test)

Example 3 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class DRFTest method testReproducibilityAirline.

// PUBDEV-557 Test dependency on # nodes (for small number of bins, but fixed number of chunks)
/**
 * Trains {@code N} DRF models with fixed parameters and seed on the airlines data
 * (rebalanced to 256 chunks, with a fixed list of columns dropped) and checks the
 * training MSE of every run against the pinned reference value.
 */
@Test
public void testReproducibilityAirline() {
    Frame tfr = null;
    final int N = 1;
    double[] mses = new double[N];
    Scope.enter();
    try {
        // Load data, hack frames
        tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
        // rebalance to fixed number of chunks
        Key dest = Key.make("df.rebalanced.hex");
        RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
        H2O.submitTask(rb);
        rb.join();
        // The original frame is no longer needed; continue with the rebalanced copy.
        tfr.delete();
        tfr = DKV.get(dest).get();
        for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
            tfr.remove(s).remove();
        }
        // Publish the frame again after dropping the columns above.
        DKV.put(tfr);
        for (int i = 0; i < N; ++i) {
            DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
            parms._train = tfr._key;
            parms._response_column = "IsDepDelayed";
            parms._nbins = 10;
            parms._nbins_cats = 1024;
            parms._ntrees = 7;
            parms._max_depth = 10;
            parms._binomial_double_trees = false;
            parms._mtries = -1;
            parms._min_rows = 1;
            // Simulated sampling with replacement
            parms._sample_rate = 0.632f;
            parms._balance_classes = true;
            parms._seed = (1L << 32) | 2;
            // Every iteration builds the same model; all results should be equal
            DRFModel drf = new DRF(parms).trainModel().get();
            try {
                // JUnit order is (expected, actual)
                assertEquals(parms._ntrees, drf._output._ntrees);
                mses[i] = drf._output._training_metrics.mse();
            } finally {
                // Delete the model even when the assertion above fails.
                drf.delete();
            }
        }
    } finally {
        if (tfr != null) {
            tfr.remove();
        }
        // Always leave the scope, including when training or an assertion throws.
        Scope.exit();
    }
    for (int i = 0; i < mses.length; ++i) {
        Log.info("trial: " + i + " -> MSE: " + mses[i]);
    }
    for (int i = 0; i < mses.length; ++i) {
        //check for the same result on 1 nodes and 5 nodes
        assertEquals(0.20377446328850304, mses[i], 1e-4);
    }
}
Also used : Frame(water.fvec.Frame) SplitFrame(hex.SplitFrame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Example 4 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class DRFTest method testReproducibility.

/**
 * Trains {@code N} single-tree DRF models with identical parameters and seed on the
 * covtype sample (rebalanced to 256 chunks) and checks that the last scored training
 * MSE of every run equals that of the first run.
 */
@Test
public void testReproducibility() {
    Frame tfr = null;
    final int N = 5;
    double[] mses = new double[N];
    Scope.enter();
    try {
        // Load data, hack frames
        tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
        // rebalance to 256 chunks
        Key dest = Key.make("df.rebalanced.hex");
        RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
        H2O.submitTask(rb);
        rb.join();
        // The original frame is no longer needed; continue with the rebalanced copy.
        tfr.delete();
        tfr = DKV.get(dest).get();
        for (int i = 0; i < N; ++i) {
            DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
            parms._train = tfr._key;
            parms._response_column = "C55";
            parms._nbins = 1000;
            parms._ntrees = 1;
            parms._max_depth = 8;
            parms._mtries = -1;
            parms._min_rows = 10;
            parms._seed = 1234;
            // Every iteration builds the same model; all results should be equal
            DRFModel drf = new DRF(parms).trainModel().get();
            try {
                // JUnit order is (expected, actual)
                assertEquals(parms._ntrees, drf._output._ntrees);
                mses[i] = drf._output._scored_train[drf._output._scored_train.length - 1]._mse;
            } finally {
                // Delete the model even when the assertion above fails.
                drf.delete();
            }
        }
    } finally {
        if (tfr != null) {
            tfr.remove();
        }
        // Always leave the scope, including when training or an assertion throws.
        Scope.exit();
    }
    for (int i = 0; i < mses.length; ++i) {
        Log.info("trial: " + i + " -> MSE: " + mses[i]);
    }
    // The first run is the reference; every run must reproduce it.
    for (double mse : mses) {
        assertEquals(mses[0], mse, 1e-15);
    }
}
Also used : Frame(water.fvec.Frame) SplitFrame(hex.SplitFrame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Example 5 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-2 by h2oai.

the class DeepLearning method reBalance.

/**
 * Redistributes a frame over a new number of chunks for load balancing.
 * <p>
 * The target is 4 chunks per CPU (times the cloud size unless {@code local}), capped
 * by the row count. When {@code reproducible} is set the target is forced to 1 chunk.
 * Otherwise a frame that already has more chunks than the target is returned as-is.
 *
 * @param fr    input frame
 * @param local whether to size the chunk count for the cores of one node only
 * @return the input frame if it already has more chunks than the target (and
 *         reproducibility is off), otherwise the frame stored under the new key
 */
private Frame reBalance(final Frame fr, boolean local) {
    final int nodeCount = local ? 1 : H2O.CLOUD.size();
    int targetChunks = (int) Math.min(4 * H2O.NUMCPUS * nodeCount, fr.numRows());
    final boolean hasEnoughChunks = fr.anyVec().nChunks() > targetChunks;
    if (hasEnoughChunks && !reproducible) {
        Log.info("Dataset already contains " + fr.anyVec().nChunks() + " chunks. No need to rebalance.");
        return fr;
    }
    if (reproducible) {
        Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
        targetChunks = 1;
    }
    if (!quiet_mode) {
        Log.info("ReBalancing dataset into (at least) " + targetChunks + " chunks.");
    }
    // Derive the destination key name from the source frame's key when it has one,
    // otherwise fall back to a random name.
    final String balancedName;
    if (fr._key != null) {
        balancedName = fr._key.toString() + ".balanced";
    } else {
        balancedName = Key.rand();
    }
    final Key balancedKey = Key.makeSystem(balancedName);
    final RebalanceDataSet task = new RebalanceDataSet(fr, balancedKey, targetChunks);
    H2O.submitTask(task);
    // Block until the rebalance task has finished before fetching the result.
    task.join();
    return UKV.get(balancedKey);
}
Also used : RebalanceDataSet(water.fvec.RebalanceDataSet)

Aggregations

RebalanceDataSet (water.fvec.RebalanceDataSet): 16 · Frame (water.fvec.Frame): 14 · Test (org.junit.Test): 11 · SplitFrame (hex.SplitFrame): 3 · NFSFileVec (water.fvec.NFSFileVec): 3 · CreateFrame (hex.CreateFrame): 2 · DataInfo (hex.FrameTask.DataInfo): 1 · Aggregator (hex.aggregator.Aggregator): 1 · AggregatorModel (hex.aggregator.AggregatorModel): 1 · DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters): 1 · DRFModel (hex.drf.DRF.DRFModel): 1 · GLMModel (hex.glm.GLMModel): 1 · GramTask (hex.gram.Gram.GramTask): 1 · File (java.io.File): 1 · Key (water.Key): 1 · MRUtils.sampleFrame (water.util.MRUtils.sampleFrame): 1 · PrettyPrint (water.util.PrettyPrint): 1