use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class DRFTest method testChunks.
// PUBDEV-2476 Check reproducibility across different # of chunks with the same parameters and seed.
// Trains one DRF model per chunk count on the rebalanced covtype frame and asserts that the
// final training MSE of every run equals that of the first run.
@Test
public void testChunks() {
  final int N = 4;
  double[] mses = new double[N];
  // NOTE(review): only the first N entries are exercised, so the 500-chunk case never runs -- confirm this is intended.
  int[] chunks = new int[] { 1, 13, 19, 39, 500 };
  for (int i = 0; i < N; ++i) {
    Frame tfr = null;
    Scope.enter();
    try {
      // Load data, hack frames
      tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
      // rebalance to chunks[i] chunks
      Key dest = Key.make("df.rebalanced.hex");
      RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, chunks[i]);
      H2O.submitTask(rb);
      rb.join();
      // drop the parsed frame and continue with the rebalanced copy
      tfr.delete();
      tfr = DKV.get(dest).get();
      // turn the response column (index 54) into a categorical; the replaced vec is tracked by the Scope
      Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec()));
      DKV.put(tfr);
      DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
      parms._train = tfr._key;
      parms._response_column = "C55";
      parms._ntrees = 10;
      parms._seed = 1234;
      // keep the chunk layout chosen above
      parms._auto_rebalance = false;
      // Build a model; all models should be equal
      DRFModel drf = new DRF(parms).trainModel().get();
      try {
        assertEquals(parms._ntrees, drf._output._ntrees);
        mses[i] = drf._output._scored_train[drf._output._scored_train.length - 1]._mse;
      } finally {
        // delete the model even when the assertion above fails
        drf.delete();
      }
    } finally {
      // always release the frame and close the scope, otherwise a failure leaks keys into later tests
      if (tfr != null) {
        tfr.remove();
      }
      Scope.exit();
    }
  }
  for (int i = 0; i < mses.length; ++i) {
    Log.info("trial: " + i + " -> MSE: " + mses[i]);
  }
  for (double mse : mses) {
    assertEquals(mses[0], mse, 1e-10);
  }
}
use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class GBMTest method testReprodubilityAirline.
// PUBDEV-557: Test dependency on # nodes (for small number of bins, but fixed number of chunks)
// Trains N identical GBM models on the airlines data rebalanced to 256 chunks and asserts that
// every run ends with the same hard-coded training MSE.
@Test
public void testReprodubilityAirline() {
  Frame tfr = null;
  final int N = 5;
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // rebalance to fixed number of chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    // drop the parsed frame and continue with the rebalanced copy
    tfr.delete();
    tfr = DKV.get(dest).get();
    // remove the listed columns from the frame before training
    for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
      tfr.remove(s).remove();
    }
    DKV.put(tfr);
    for (int i = 0; i < N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "IsDepDelayed";
      parms._nbins = 10;
      parms._nbins_cats = 500;
      parms._ntrees = 7;
      parms._max_depth = 5;
      parms._min_rows = 10;
      parms._distribution = DistributionFamily.bernoulli;
      parms._balance_classes = true;
      parms._seed = 0;
      // Build a model; all models should be equal
      GBMModel gbm = new GBM(parms).trainModel().get();
      try {
        assertEquals(parms._ntrees, gbm._output._ntrees);
        mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
      } finally {
        // delete the model even when the assertion above fails
        gbm.delete();
      }
    }
  } finally {
    // always release the frame and close the scope, even when training or an assertion fails
    if (tfr != null) {
      tfr.remove();
    }
    Scope.exit();
  }
  System.out.println("MSEs start");
  for (double d : mses) {
    System.out.println(d);
  }
  System.out.println("MSEs End");
  System.out.flush();
  // check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
  for (double mse : mses) {
    assertEquals(0.21694215729861027, mse, 1e-8);
  }
}
use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class GBMTest method testReprodubility.
// HEXDEV-194: Check reproducibility for the same # of chunks (i.e., same # of nodes) and same parameters
// Trains N identical GBM models on the covtype data rebalanced to 256 chunks and asserts that
// the final training MSE of every run equals that of the first run.
@Test
public void testReprodubility() {
  Frame tfr = null;
  final int N = 5;
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
    // rebalance to 256 chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    // drop the parsed frame and continue with the rebalanced copy
    tfr.delete();
    tfr = DKV.get(dest).get();
    for (int i = 0; i < N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "C55";
      parms._nbins = 1000;
      parms._ntrees = 5;
      parms._max_depth = 8;
      parms._learn_rate = 0.1f;
      parms._min_rows = 10;
      // the response is modelled as a gaussian (regression) target here
      parms._distribution = gaussian;
      // Build a model; all models should be equal
      GBMModel gbm = new GBM(parms).trainModel().get();
      try {
        assertEquals(parms._ntrees, gbm._output._ntrees);
        mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
      } finally {
        // delete the model even when the assertion above fails
        gbm.delete();
      }
    }
  } finally {
    // always release the frame and close the scope, even when training or an assertion fails
    if (tfr != null) {
      tfr.remove();
    }
    Scope.exit();
  }
  for (double mse : mses) {
    System.out.println(mse);
  }
  for (double mse : mses) {
    assertEquals(mses[0], mse, 1e-15);
  }
}
use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class GBMTest method testChunks.
// PUBDEV-2476 Check reproducibility across different # of chunks with the same parameters and seed.
// Trains one GBM model per entry of chunks on the rebalanced covtype frame and asserts that the
// final training MSE of every run equals that of the first run.
@Test
public void testChunks() {
  int[] chunks = new int[] { 1, 2, 2, 39, 39, 500 };
  final int N = chunks.length;
  double[] mses = new double[N];
  for (int i = 0; i < N; ++i) {
    Frame tfr = null;
    Scope.enter();
    try {
      // Load data, hack frames
      tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
      // rebalance to a given number of chunks
      Key dest = Key.make("df.rebalanced.hex");
      RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, chunks[i]);
      H2O.submitTask(rb);
      rb.join();
      // drop the parsed frame and continue with the rebalanced copy
      tfr.delete();
      tfr = DKV.get(dest).get();
      // make sure the rebalance actually produced the requested layout
      assertEquals(chunks[i], tfr.vec(0).nChunks());
      DKV.put(tfr);
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "C55";
      parms._seed = 1234;
      // keep the chunk layout chosen above
      parms._auto_rebalance = false;
      parms._col_sample_rate_per_tree = 0.5f;
      parms._col_sample_rate = 0.3f;
      parms._ntrees = 5;
      parms._max_depth = 5;
      // Build a model; all models should be equal
      GBMModel gbm = new GBM(parms).trainModel().get();
      try {
        assertEquals(parms._ntrees, gbm._output._ntrees);
        mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
      } finally {
        // delete the model even when the assertion above fails
        gbm.delete();
      }
    } finally {
      // always release the frame and close the scope, otherwise a failure leaks keys into later tests
      if (tfr != null) {
        tfr.remove();
      }
      Scope.exit();
    }
  }
  for (int i = 0; i < mses.length; ++i) {
    Log.info("trial: " + i + " -> MSE: " + mses[i]);
  }
  for (double mse : mses) {
    assertEquals(mses[0], mse, 1e-10);
  }
}
use of water.fvec.RebalanceDataSet in project h2o-2 by h2oai.
the class MatrixTest method testMultiplication.
// Small dense sanity check: t(X) %*% X computed via DMatrix must match the gram
// matrix produced by the GLM GramTask on the same (rebalanced) frame.
@Test
public void testMultiplication() {
  final Key parsedKey = Key.make("prostate_parsed");
  final Futures pending = new Futures();
  Frame data = getFrameForFile(parsedKey, "smalldata/glm_test/prostate_cat_replaced.csv");
  // drop the RACE column before any matrix work
  data.remove("RACE").remove(pending);

  // rebalance into 64 chunks, then swap the parsed frame for the rebalanced copy
  final Key rebalancedKey = Key.make("rebalanced");
  RebalanceDataSet rebalance = new RebalanceDataSet(data, rebalancedKey, 64);
  H2O.submitTask(rebalance).join();
  data.delete();
  data = DKV.get(rebalancedKey).get();

  // t(X) and t(X) %*% X through the distributed matrix ops
  Frame transposed = DMatrix.transpose(data);
  transposed.reloadVecs();
  Frame product = DMatrix.mmul(transposed, data);

  // reference gram from the GLM task
  DataInfo info = new DataInfo(data, 0, false, false, DataInfo.TransformType.NONE);
  GramTask gramTask = new GramTask(null, info, false, false).doAll(info._adaptedFrame);
  // multiply the gram by the observation count before comparing
  // (presumably GramTask returns it divided by nobs -- TODO confirm)
  gramTask._gram.mul(gramTask._nobs);
  double[][] expected = gramTask._gram.getDenseXX();

  for (int row = 0; row < expected.length; ++row) {
    double[] expectedRow = expected[row];
    for (int col = 0; col < expectedRow.length; ++col) {
      assertEquals("position " + row + ", " + col, expectedRow[col], product.vec(col).at(row), 1e-4);
    }
  }

  // clean up everything created above, then verify nothing leaked
  data.delete();
  for (Vec vec : transposed.vecs()) {
    vec.remove(pending);
  }
  for (Vec vec : product.vecs()) {
    vec.remove(pending);
  }
  pending.blockForPending();
  checkLeakedKeys();
}
Aggregations