Search in sources :

Example 11 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class DRFTest method testChunks.

// PUBDEV-2476 Check reproducibility across different # of chunks with the same parameters
/**
 * Trains a DRF model on the same data set rebalanced to several chunk counts and
 * asserts that the final training MSE is identical (within 1e-10) for all of them.
 *
 * Each iteration cleans up its model, frame and Scope in a finally block so that a
 * failing assertion or training error does not leave keys behind for later tests.
 */
@Test
public void testChunks() {
    // NOTE(review): only the first N entries of chunks are exercised, so the trailing
    // 500 is never used — confirm whether N was meant to be chunks.length.
    final int N = 4;
    double[] mses = new double[N];
    int[] chunks = new int[] { 1, 13, 19, 39, 500 };
    for (int i = 0; i < N; ++i) {
        Frame tfr = null;
        DRFModel drf = null;
        Scope.enter();
        try {
            // Load data, hack frames
            tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
            // rebalance to the chunk count under test
            Key dest = Key.make("df.rebalanced.hex");
            RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, chunks[i]);
            H2O.submitTask(rb);
            rb.join();
            // drop the original frame and continue with the rebalanced copy
            tfr.delete();
            tfr = DKV.get(dest).get();
            // replace column index 54 (presumably the C55 response — verify) with its categorical version
            Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec()));
            DKV.put(tfr);
            DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
            parms._train = tfr._key;
            parms._response_column = "C55";
            parms._ntrees = 10;
            parms._seed = 1234;
            // keep the chunk layout chosen above
            parms._auto_rebalance = false;
            // Build a model; the models of all iterations should be equal
            DRF job = new DRF(parms);
            drf = job.trainModel().get();
            assertEquals(parms._ntrees, drf._output._ntrees);
            // MSE of the last training scoring event
            mses[i] = drf._output._scored_train[drf._output._scored_train.length - 1]._mse;
        } finally {
            if (drf != null) {
                drf.delete();
            }
            if (tfr != null) {
                tfr.remove();
            }
            Scope.exit();
        }
    }
    for (int i = 0; i < mses.length; ++i) {
        Log.info("trial: " + i + " -> MSE: " + mses[i]);
    }
    for (double mse : mses) {
        assertEquals(mses[0], mse, 1e-10);
    }
}
Also used : Frame(water.fvec.Frame) SplitFrame(hex.SplitFrame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Example 12 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class GBMTest method testReprodubilityAirline.

// PUBDEV-557: Test dependency on # nodes (for small number of bins, but fixed number of chunks)
/**
 * Trains the same GBM model N times on the airlines data rebalanced to 256 chunks and
 * asserts that every run ends with the same, hard-coded training MSE.
 *
 * Model, frame and Scope are released in finally blocks so a failing run does not
 * leave keys behind.
 */
@Test
public void testReprodubilityAirline() {
    Frame tfr = null;
    final int N = 5;
    double[] mses = new double[N];
    Scope.enter();
    try {
        // Load data, hack frames
        tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
        // rebalance to fixed number of chunks
        Key dest = Key.make("df.rebalanced.hex");
        RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
        H2O.submitTask(rb);
        rb.join();
        // drop the original frame and continue with the rebalanced copy
        tfr.delete();
        tfr = DKV.get(dest).get();
        // remove the listed columns from the frame and delete their data
        for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
            tfr.remove(s).remove();
        }
        // publish the modified frame
        DKV.put(tfr);
        for (int i = 0; i < N; ++i) {
            GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
            parms._train = tfr._key;
            parms._response_column = "IsDepDelayed";
            parms._nbins = 10;
            parms._nbins_cats = 500;
            parms._ntrees = 7;
            parms._max_depth = 5;
            parms._min_rows = 10;
            parms._distribution = DistributionFamily.bernoulli;
            parms._balance_classes = true;
            parms._seed = 0;
            // Build a model; all models should be equal
            GBMModel gbm = null;
            try {
                gbm = new GBM(parms).trainModel().get();
                assertEquals(parms._ntrees, gbm._output._ntrees);
                // MSE of the last training scoring event
                mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
            } finally {
                if (gbm != null) {
                    gbm.delete();
                }
            }
        }
    } finally {
        if (tfr != null) {
            tfr.remove();
        }
        Scope.exit();
    }
    System.out.println("MSEs start");
    for (double d : mses) {
        System.out.println(d);
    }
    System.out.println("MSEs End");
    System.out.flush();
    // check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
    for (double mse : mses) {
        assertEquals(0.21694215729861027, mse, 1e-8);
    }
}
Also used : Frame(water.fvec.Frame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Example 13 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class GBMTest method testReprodubility.

// HEXDEV-194: Check reproducibility for the same # of chunks (i.e., same # of nodes) and same parameters
/**
 * Trains the same GBM model N times on the covtype data rebalanced to 256 chunks and
 * asserts that the final training MSE of every run equals that of the first run
 * (within 1e-15).
 *
 * Model, frame and Scope are released in finally blocks so a failing run does not
 * leave keys behind.
 */
@Test
public void testReprodubility() {
    Frame tfr = null;
    final int N = 5;
    double[] mses = new double[N];
    Scope.enter();
    try {
        // Load data, hack frames
        tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
        // rebalance to 256 chunks
        Key dest = Key.make("df.rebalanced.hex");
        RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
        H2O.submitTask(rb);
        rb.join();
        // drop the original frame and continue with the rebalanced copy
        tfr.delete();
        tfr = DKV.get(dest).get();
        for (int i = 0; i < N; ++i) {
            GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
            parms._train = tfr._key;
            parms._response_column = "C55";
            parms._nbins = 1000;
            parms._ntrees = 5;
            parms._max_depth = 8;
            parms._learn_rate = 0.1f;
            parms._min_rows = 10;
            // gaussian is used here; a multinomial setting was disabled in the original test
            parms._distribution = gaussian;
            // Build a model; all models should be equal
            GBMModel gbm = null;
            try {
                gbm = new GBM(parms).trainModel().get();
                assertEquals(parms._ntrees, gbm._output._ntrees);
                // MSE of the last training scoring event
                mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
            } finally {
                if (gbm != null) {
                    gbm.delete();
                }
            }
        }
    } finally {
        if (tfr != null) {
            tfr.remove();
        }
        Scope.exit();
    }
    for (double mse : mses) {
        System.out.println(mse);
    }
    for (double mse : mses) {
        assertEquals(mses[0], mse, 1e-15);
    }
}
Also used : Frame(water.fvec.Frame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Example 14 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.

the class GBMTest method testChunks.

// PUBDEV-2476 Check reproducibility across different # of chunks with the same parameters
/**
 * Trains a GBM model on the same data set rebalanced to each chunk count in
 * {@code chunks} and asserts that the final training MSE is identical (within 1e-10)
 * for all of them.
 *
 * Each iteration cleans up its model, frame and Scope in a finally block so that a
 * failing assertion or training error does not leave keys behind for later tests.
 */
@Test
public void testChunks() {
    int[] chunks = new int[] { 1, 2, 2, 39, 39, 500 };
    final int N = chunks.length;
    double[] mses = new double[N];
    for (int i = 0; i < N; ++i) {
        Frame tfr = null;
        GBMModel gbm = null;
        Scope.enter();
        try {
            // Load data, hack frames
            tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
            // rebalance to a given number of chunks
            Key dest = Key.make("df.rebalanced.hex");
            RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, chunks[i]);
            H2O.submitTask(rb);
            rb.join();
            // drop the original frame and continue with the rebalanced copy
            tfr.delete();
            tfr = DKV.get(dest).get();
            // the rebalanced frame must really have the requested chunk count
            assertEquals(chunks[i], tfr.vec(0).nChunks());
            // Categorical conversion of column 54 is intentionally disabled in this test:
            //      Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
            DKV.put(tfr);
            GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
            parms._train = tfr._key;
            parms._response_column = "C55";
            parms._seed = 1234;
            // keep the chunk layout chosen above
            parms._auto_rebalance = false;
            parms._col_sample_rate_per_tree = 0.5f;
            parms._col_sample_rate = 0.3f;
            parms._ntrees = 5;
            parms._max_depth = 5;
            // Build a model; the models of all iterations should be equal
            GBM job = new GBM(parms);
            gbm = job.trainModel().get();
            assertEquals(parms._ntrees, gbm._output._ntrees);
            // MSE of the last training scoring event
            mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
        } finally {
            if (gbm != null) {
                gbm.delete();
            }
            if (tfr != null) {
                tfr.remove();
            }
            Scope.exit();
        }
    }
    for (int i = 0; i < mses.length; ++i) {
        Log.info("trial: " + i + " -> MSE: " + mses[i]);
    }
    for (double mse : mses) {
        assertEquals(mses[0], mse, 1e-10);
    }
}
Also used : Frame(water.fvec.Frame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Example 15 with RebalanceDataSet

use of water.fvec.RebalanceDataSet in project h2o-2 by h2oai.

the class MatrixTest method testMultiplication.

// Small, dense case: t(X) %*% X computed via DMatrix is compared entry by entry
// against the Gram matrix produced by the GLM GramTask.
@Test
public void testMultiplication() {
    Key parsedKey = Key.make("prostate_parsed");
    Futures pending = new Futures();

    // Load the data and drop the RACE column.
    Frame data = getFrameForFile(parsedKey, "smalldata/glm_test/prostate_cat_replaced.csv");
    data.remove("RACE").remove(pending);

    // Rebalance into 64 chunks and switch over to the rebalanced copy.
    Key rebalancedKey = Key.make("rebalanced");
    RebalanceDataSet rebalance = new RebalanceDataSet(data, rebalancedKey, 64);
    H2O.submitTask(rebalance).join();
    data.delete();
    data = DKV.get(rebalancedKey).get();

    // Product of the transposed frame with the frame itself.
    Frame transposed = DMatrix.transpose(data);
    transposed.reloadVecs();
    Frame product = DMatrix.mmul(transposed, data);

    // Reference Gram matrix from the GLM task, multiplied by the task's _nobs.
    DataInfo info = new DataInfo(data, 0, false, false, DataInfo.TransformType.NONE);
    GramTask gramTask = new GramTask(null, info, false, false).doAll(info._adaptedFrame);
    gramTask._gram.mul(gramTask._nobs);
    double[][] expected = gramTask._gram.getDenseXX();

    for (int i = 0; i < expected.length; ++i) {
        for (int j = 0; j < expected[i].length; ++j) {
            assertEquals("position " + i + ", " + j, expected[i][j], product.vec(j).at(i), 1e-4);
        }
    }

    // Release everything created above, then verify that no keys leaked.
    data.delete();
    Vec[] transposedVecs = transposed.vecs();
    for (int t = 0; t < transposedVecs.length; ++t) {
        transposedVecs[t].remove(pending);
    }
    Vec[] productVecs = product.vecs();
    for (int p = 0; p < productVecs.length; ++p) {
        productVecs[p].remove(pending);
    }
    pending.blockForPending();
    checkLeakedKeys();
}
Also used : DataInfo(hex.FrameTask.DataInfo) Frame(water.fvec.Frame) RebalanceDataSet(water.fvec.RebalanceDataSet) NFSFileVec(water.fvec.NFSFileVec) GramTask(hex.gram.Gram.GramTask) Test(org.junit.Test)

Aggregations

RebalanceDataSet (water.fvec.RebalanceDataSet): 16; Frame (water.fvec.Frame): 14; Test (org.junit.Test): 11; SplitFrame (hex.SplitFrame): 3; NFSFileVec (water.fvec.NFSFileVec): 3; CreateFrame (hex.CreateFrame): 2; DataInfo (hex.FrameTask.DataInfo): 1; Aggregator (hex.aggregator.Aggregator): 1; AggregatorModel (hex.aggregator.AggregatorModel): 1; DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters): 1; DRFModel (hex.drf.DRF.DRFModel): 1; GLMModel (hex.glm.GLMModel): 1; GramTask (hex.gram.Gram.GramTask): 1; File (java.io.File): 1; Key (water.Key): 1; MRUtils.sampleFrame (water.util.MRUtils.sampleFrame): 1; PrettyPrint (water.util.PrettyPrint): 1