use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class GBMTest method testReprodubilityAirlineSingleNode.
/**
 * Trains the same GBM model {@code N} times on the airlines sample (rebalanced to a
 * fixed number of chunks, built with {@code _build_tree_one_node = true}) and checks
 * that every run ends with the same, known training MSE.
 */
@Test
public void testReprodubilityAirlineSingleNode() {
  Frame tfr = null;
  final int N = 10;
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // rebalance to fixed number of chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    // The original frame is no longer needed; continue with the rebalanced copy.
    tfr.delete();
    tfr = DKV.get(dest).get();
    // Drop the columns that must not be used as predictors in this test.
    for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
      tfr.remove(s).remove();
    }
    // Publish the modified frame so that training sees the reduced column set.
    DKV.put(tfr);
    for (int i = 0; i < N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "IsDepDelayed";
      parms._nbins = 10;
      parms._nbins_cats = 500;
      parms._ntrees = 7;
      parms._max_depth = 5;
      parms._min_rows = 10;
      parms._distribution = DistributionFamily.bernoulli;
      parms._balance_classes = true;
      parms._seed = 0;
      parms._build_tree_one_node = true;
      // Build a first model; all remaining models should be equal
      GBMModel gbm = null;
      try {
        gbm = new GBM(parms).trainModel().get();
        // JUnit convention: expected value first, actual value second.
        assertEquals(parms._ntrees, gbm._output._ntrees);
        // Training MSE taken from the last entry of the model's scoring history.
        mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
      } finally {
        // Delete the model even when an assertion above fails.
        if (gbm != null) {
          gbm.delete();
        }
      }
    }
  } finally {
    if (tfr != null) {
      tfr.remove();
    }
    // Always pair Scope.enter() with Scope.exit(), including on failure.
    Scope.exit();
  }
  System.out.println("MSE");
  for (double d : mses) {
    System.out.println(d);
  }
  // check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
  for (double mse : mses) {
    assertEquals(0.21694215729861027, mse, 1e-8);
  }
}
use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class AggregatorTest method testChunks.
/**
 * Trains a baseline Aggregator model on the covtype sample, then repeats the training
 * after rebalancing the frame into several chunk counts, and checks that every run
 * yields the same number of exemplars and stays within the configured tolerance.
 */
@Test
public void testChunks() {
  Frame frame = parse_test_file("smalldata/covtype/covtype.20k.data");
  try {
    AggregatorModel.AggregatorParameters parms = new AggregatorModel.AggregatorParameters();
    parms._train = frame._key;
    parms._target_num_exemplars = 137;
    parms._rel_tol_num_exemplars = 0.05;
    long start = System.currentTimeMillis();
    // 0.418
    AggregatorModel agg = new Aggregator(parms).trainModel().get();
    final int baselineExemplars;
    try {
      System.out.println("AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds");
      agg.checkConsistency();
      checkNumExemplars(agg);
      // Remember the count now; the baseline model is removed before the loop starts.
      baselineExemplars = agg._exemplars.length;
    } finally {
      // Remove the baseline model and its output frame even if a check above failed.
      Frame output = agg._output._output_frame.get();
      if (output != null) {
        output.remove();
      }
      agg.remove();
    }
    for (int i : new int[] { 1, 2, 5, 10, 50, 100 }) {
      Key key = Key.make();
      RebalanceDataSet rb = new RebalanceDataSet(frame, key, i);
      H2O.submitTask(rb);
      rb.join();
      Frame rebalanced = DKV.get(key).get();
      AggregatorModel agg2 = null;
      try {
        parms = new AggregatorModel.AggregatorParameters();
        // NOTE(review): this trains on the original frame, not on `rebalanced`, so the
        // chunk count `i` does not actually influence the model. Left unchanged because
        // the strict equality check below may rely on it - confirm the intent before
        // switching to rebalanced._key.
        parms._train = frame._key;
        parms._target_num_exemplars = 137;
        parms._rel_tol_num_exemplars = 0.05;
        start = System.currentTimeMillis();
        // 0.373 0.504 0.357 0.454 0.368 0.355
        agg2 = new Aggregator(parms).trainModel().get();
        System.out.println("AggregatorModel finished in: " + (System.currentTimeMillis() - start) / 1000. + " seconds");
        agg2.checkConsistency();
        Log.info("Number of exemplars for " + i + " chunks: " + agg2._exemplars.length);
        // assertEquals reports both values on failure, unlike assertTrue(abs(a - b) == 0).
        Assert.assertEquals(baselineExemplars, agg2._exemplars.length);
        // Validate the model trained in this iteration (was: the removed baseline model).
        checkNumExemplars(agg2);
      } finally {
        // Release per-iteration resources even when an assertion fails.
        rebalanced.delete();
        if (agg2 != null) {
          Frame output2 = agg2._output._output_frame.get();
          if (output2 != null) {
            output2.remove();
          }
          agg2.remove();
        }
      }
    }
  } finally {
    frame.delete();
  }
}
use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class DRFTest method testReproducibilityAirline.
/**
 * PUBDEV-557: test dependency on the number of nodes (for a small number of bins, but
 * a fixed number of chunks).
 *
 * <p>Trains a DRF model {@code N} times on the airlines sample, rebalanced to a fixed
 * number of chunks, and checks each run's training MSE against a known reference value.
 */
@Test
public void testReproducibilityAirline() {
  Frame tfr = null;
  final int N = 1;
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // rebalance to fixed number of chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    // The original frame is no longer needed; continue with the rebalanced copy.
    tfr.delete();
    tfr = DKV.get(dest).get();
    // Drop the columns that must not be used as predictors in this test.
    for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
      tfr.remove(s).remove();
    }
    // Publish the modified frame so that training sees the reduced column set.
    DKV.put(tfr);
    for (int i = 0; i < N; ++i) {
      DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
      parms._train = tfr._key;
      parms._response_column = "IsDepDelayed";
      parms._nbins = 10;
      parms._nbins_cats = 1024;
      parms._ntrees = 7;
      parms._max_depth = 10;
      parms._binomial_double_trees = false;
      parms._mtries = -1;
      parms._min_rows = 1;
      // Simulated sampling with replacement
      parms._sample_rate = 0.632f;
      parms._balance_classes = true;
      parms._seed = (1L << 32) | 2;
      // Build a first model; all remaining models should be equal
      DRFModel drf = null;
      try {
        drf = new DRF(parms).trainModel().get();
        // JUnit convention: expected value first, actual value second.
        assertEquals(parms._ntrees, drf._output._ntrees);
        mses[i] = drf._output._training_metrics.mse();
      } finally {
        // Delete the model even when an assertion above fails.
        if (drf != null) {
          drf.delete();
        }
      }
    }
  } finally {
    if (tfr != null) {
      tfr.remove();
    }
    // Always pair Scope.enter() with Scope.exit(), including on failure.
    Scope.exit();
  }
  for (int i = 0; i < mses.length; ++i) {
    Log.info("trial: " + i + " -> MSE: " + mses[i]);
  }
  for (int i = 0; i < mses.length; ++i) {
    // check for the same result on 1 nodes and 5 nodes
    assertEquals(0.20377446328850304, mses[i], 1e-4);
  }
}
use of water.fvec.RebalanceDataSet in project h2o-3 by h2oai.
the class DRFTest method testReproducibility.
/**
 * Trains the same single-tree DRF model {@code N} times with a fixed seed on the
 * covtype sample (rebalanced to 256 chunks) and checks that every run produces the
 * same training MSE as the first run.
 */
@Test
public void testReproducibility() {
  Frame tfr = null;
  final int N = 5;
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
    // rebalance to 256 chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    // The original frame is no longer needed; continue with the rebalanced copy.
    tfr.delete();
    tfr = DKV.get(dest).get();
    for (int i = 0; i < N; ++i) {
      DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
      parms._train = tfr._key;
      parms._response_column = "C55";
      parms._nbins = 1000;
      parms._ntrees = 1;
      parms._max_depth = 8;
      parms._mtries = -1;
      parms._min_rows = 10;
      parms._seed = 1234;
      // Build a first model; all remaining models should be equal
      DRFModel drf = null;
      try {
        drf = new DRF(parms).trainModel().get();
        // JUnit convention: expected value first, actual value second.
        assertEquals(parms._ntrees, drf._output._ntrees);
        // Training MSE taken from the last entry of the model's scoring history.
        mses[i] = drf._output._scored_train[drf._output._scored_train.length - 1]._mse;
      } finally {
        // Delete the model even when an assertion above fails.
        if (drf != null) {
          drf.delete();
        }
      }
    }
  } finally {
    if (tfr != null) {
      tfr.remove();
    }
    // Always pair Scope.enter() with Scope.exit(), including on failure.
    Scope.exit();
  }
  for (int i = 0; i < mses.length; ++i) {
    Log.info("trial: " + i + " -> MSE: " + mses[i]);
  }
  // The first run is the reference; every other run must match it.
  for (double mse : mses) {
    assertEquals(mses[0], mse, 1e-15);
  }
}
use of water.fvec.RebalanceDataSet in project h2o-2 by h2oai.
the class DeepLearning method reBalance.
/**
 * Rebalance a frame for load balancing.
 *
 * <p>The target chunk count is {@code 4 * H2O.NUMCPUS} per participating node, capped
 * by the number of rows. When {@code reproducible} is set, the frame is always
 * rebalanced into a single chunk. Otherwise a frame that already has more chunks than
 * the target is returned unchanged.
 *
 * @param fr Input frame
 * @param local whether to only create enough chunks to max out all cores on one node only
 * @return the input frame itself when no rebalancing is needed, otherwise the
 *         rebalanced frame fetched under a new system key
 */
private Frame reBalance(final Frame fr, boolean local) {
  final int nodeCount = local ? 1 : H2O.CLOUD.size();
  int chunks = (int) Math.min(4 * H2O.NUMCPUS * nodeCount, fr.numRows());
  final int existingChunks = fr.anyVec().nChunks();

  if (reproducible) {
    Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
    chunks = 1;
  } else if (existingChunks > chunks) {
    Log.info("Dataset already contains " + existingChunks + " chunks. No need to rebalance.");
    return fr;
  }

  if (!quiet_mode) {
    Log.info("ReBalancing dataset into (at least) " + chunks + " chunks.");
  }

  // Derive the destination key from the source key when there is one.
  final String balancedName;
  if (fr._key == null) {
    balancedName = Key.rand();
  } else {
    balancedName = fr._key.toString() + ".balanced";
  }
  final Key newKey = Key.makeSystem(balancedName);

  // Run the rebalance task and block until it has finished.
  final RebalanceDataSet task = new RebalanceDataSet(fr, newKey, chunks);
  H2O.submitTask(task);
  task.join();
  return UKV.get(newKey);
}
Aggregations