Use of water.fvec.Frame in project h2o-3 by h2oai: class DMatrix, method transpose.
/**
 * Transpose the Frame as if it was a matrix (i.e. rows become columns).
 * Must be all numeric, currently will fail if there are too many rows ( >= ~.5M).
 * Result will be put into a new Vector Group and will be balanced so that each vec will have
 * (4*num cpus in the cluster) chunks.
 *
 * @param src frame to transpose; must contain only numeric columns
 * @return the transposed frame, backed by a new Vector Group
 */
public static Frame transpose(Frame src) {
  // The row count must fit into an int, since rows become columns of the result
  if (src.numRows() != (int) src.numRows())
    throw H2O.unimpl();
  int nchunks = Math.max(1, src.numCols() / 10000);
  long[] espc = new long[nchunks + 1];
  int rpc = (src.numCols() / nchunks);
  int rem = (src.numCols() % nchunks);
  Arrays.fill(espc, rpc);
  for (int i = 0; i < rem; ++i) ++espc[i];
  // Turn per-chunk row counts into a cumulative element-start-per-chunk (espc) layout
  long sum = 0;
  for (int i = 0; i < espc.length; ++i) {
    long s = espc[i];
    espc[i] = sum;
    sum += s;
  }
  Key key = Vec.newKey();
  int rowLayout = Vec.ESPC.rowLayout(key, espc);
  return transpose(src, new Frame(new Vec(key, rowLayout).makeZeros((int) src.numRows())));
}
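A minimal usage sketch, not taken from the h2o-3 sources: it builds a tiny all-numeric Frame and transposes it with the method above. It assumes the method lives in water.util.DMatrix, that Vec.makeVec(double[], Key) is available for building small vecs, and that Scope.track accepts a Frame; treat all of those as assumptions to verify against your h2o-3 version.

// Hypothetical sketch only: transpose a tiny all-numeric frame with DMatrix.transpose.
Scope.enter();
try {
  Vec v0 = Vec.makeVec(new double[] { 1, 2, 3 }, Vec.newKey()); // assumed test helper
  Vec v1 = Vec.makeVec(new double[] { 4, 5, 6 }, Vec.newKey());
  Frame small = new Frame(Key.<Frame>make("small"), new String[] { "c0", "c1" }, new Vec[] { v0, v1 });
  Scope.track(small);
  Frame t = DMatrix.transpose(small); // 3 rows x 2 cols in, 2 rows x 3 cols out
  Scope.track(t);
  System.out.println(t.toString());
} finally {
  Scope.exit();
}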
Use of water.fvec.Frame in project h2o-3 by h2oai: class GainsLift, method init.
private void init(Job job) throws IllegalArgumentException {
  if (_labels == null || _preds == null)
    throw new IllegalArgumentException("Missing actualLabels or predictedProbs!");
  _labels = _labels.toCategoricalVec();
  if (_labels.length() != _preds.length())
    throw new IllegalArgumentException("Both arguments must have the same length (" + _labels.length() + "!=" + _preds.length() + ")!");
  if (!_labels.isInt())
    throw new IllegalArgumentException("Actual column must be integer class labels!");
  if (_labels.cardinality() != -1 && _labels.cardinality() != 2)
    throw new IllegalArgumentException("Actual column must contain binary class labels, but found cardinality " + _labels.cardinality() + "!");
  if (_preds.isCategorical())
    throw new IllegalArgumentException("Predicted probabilities cannot be class labels, expect probabilities.");
  if (_weights != null && !_weights.isNumeric())
    throw new IllegalArgumentException("Observation weights must be numeric.");
  // The vectors are from different groups => align them, and make sure the aligned copies are deleted after computation
  if (!_labels.group().equals(_preds.group())) {
    _preds = _labels.align(_preds);
    Scope.track(_preds);
    if (_weights != null) {
      _weights = _labels.align(_weights);
      Scope.track(_weights);
    }
  }
  boolean fast = false;
  if (fast) {
    // FAST VERSION: single-pass, only works with the specific pre-computed quantiles from rollup stats
    assert (_groups == 10);
    assert (Arrays.equals(Vec.PERCENTILES, // Vec.PERCENTILES indices 0..16
        new double[] { 0.001, 0.01, 0.1, 0.2, 0.25, 0.3, 1.0 / 3.0, 0.4, 0.5, 0.6, 2.0 / 3.0, 0.7, 0.75, 0.8, 0.9, 0.99, 0.999 }));
    // HACK: hardcoded quantiles for simplicity (0.9, 0.8, ..., 0.1, 0)
    double[] rq = _preds.pctiles(); // might do a full pass over the Vec
    _quantiles = new double[] { rq[14], rq[13], rq[11], rq[9], rq[8], rq[7], rq[5], rq[3], rq[2], 0 };
  } else {
    // ACCURATE VERSION: multi-pass
    Frame fr = null;
    QuantileModel qm = null;
    try {
      QuantileModel.QuantileParameters qp = new QuantileModel.QuantileParameters();
      if (_weights == null) {
        fr = new Frame(Key.<Frame>make(), new String[] { "predictions" }, new Vec[] { _preds });
      } else {
        fr = new Frame(Key.<Frame>make(), new String[] { "predictions", "weights" }, new Vec[] { _preds, _weights });
        qp._weights_column = "weights";
      }
      DKV.put(fr);
      qp._train = fr._key;
      if (_groups > 0) {
        qp._probs = new double[_groups];
        for (int i = 0; i < _groups; ++i) {
          // This is 0.9, 0.8, 0.7, 0.6, ..., 0.1, 0 for 10 groups
          qp._probs[i] = (_groups - i - 1.) / _groups;
        }
      } else {
        qp._probs = new double[] { 0.99, 0.98, 0.97, 0.96, 0.95, 0.9, 0.85, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0 };
      }
      qm = job != null && !job.isDone() ? new Quantile(qp, job).trainModelNested(null) : new Quantile(qp).trainModel().get();
      _quantiles = qm._output._quantiles[0];
      // Keep only the unique quantile values, in descending order
      TreeSet<Double> hs = new TreeSet<>();
      for (double d : _quantiles) hs.add(d);
      _quantiles = new double[hs.size()];
      Iterator<Double> it = hs.descendingIterator();
      int i = 0;
      while (it.hasNext()) _quantiles[i++] = it.next();
    } finally {
      if (qm != null)
        qm.remove();
      if (fr != null)
        DKV.remove(fr._key);
    }
  }
}
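The probability grid that the accurate path passes to the Quantile model is just evenly spaced group boundaries. A standalone illustration of that arithmetic in plain Java (no h2o types involved):

// Standalone illustration of the probability grid built above for _groups = 10:
// probs[i] = (groups - i - 1) / groups  =>  0.9, 0.8, ..., 0.1, 0.0
int groups = 10;
double[] probs = new double[groups];
for (int i = 0; i < groups; ++i)
  probs[i] = (groups - i - 1.) / groups;
System.out.println(java.util.Arrays.toString(probs)); // [0.9, 0.8, ..., 0.1, 0.0]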
Use of water.fvec.Frame in project h2o-3 by h2oai: class GBMGridTest, method testDuplicatesCarsGrid.
//@Ignore("PUBDEV-1643")
@Test
public void testDuplicatesCarsGrid() {
  Grid grid = null;
  Frame fr = null;
  Vec old = null;
  try {
    fr = parse_test_file("smalldata/junit/cars_20mpg.csv");
    // Remove unique id
    fr.remove("name").remove();
    old = fr.remove("economy");
    // Move the response to the last column
    fr.add("economy", old);
    DKV.put(fr);
    // Setup a hyperparameter search space containing only duplicate values
    HashMap<String, Object[]> hyperParms = new HashMap<String, Object[]>() {
      {
        put("_distribution", new DistributionFamily[] { DistributionFamily.gaussian });
        put("_ntrees", new Integer[] { 5, 5 });
        put("_max_depth", new Integer[] { 2, 2 });
        put("_learn_rate", new Double[] { .1, .1 });
      }
    };
    // Fire off a grid search
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = fr._key;
    params._response_column = "economy";
    Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
    grid = gs.get();
    // Check that duplicate models have not been constructed
    Model[] models = grid.getModels();
    assertTrue("Number of returned models has to be > 0", models.length > 0);
    // But all of them should be the same model
    Key<Model> modelKey = models[0]._key;
    for (Model m : models) {
      assertTrue("Number of constructed models has to be equal to 1", modelKey == m._key);
    }
  } finally {
    if (old != null) {
      old.remove();
    }
    if (fr != null) {
      fr.remove();
    }
    if (grid != null) {
      grid.remove();
    }
  }
}
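For context, the three duplicate-valued lists above span 2 x 2 x 2 = 8 raw hyperparameter combinations but only one distinct combination, which is why the grid is expected to hold a single model key. A plain-Java sketch of that counting (illustrative only, not how GridSearch deduplicates internally):

// Count raw vs. distinct hyperparameter combinations for the map used above (illustrative only).
java.util.Map<String, Object[]> hp = new java.util.HashMap<>();
hp.put("_ntrees", new Integer[] { 5, 5 });
hp.put("_max_depth", new Integer[] { 2, 2 });
hp.put("_learn_rate", new Double[] { .1, .1 });
long total = 1, distinct = 1;
for (Object[] vals : hp.values()) {
  total *= vals.length;
  distinct *= new java.util.HashSet<>(java.util.Arrays.asList(vals)).size();
}
System.out.println(total + " raw combinations, " + distinct + " distinct"); // 8 raw combinations, 1 distinct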
Use of water.fvec.Frame in project h2o-3 by h2oai: class GBMMissingTest, method run.
@Test
public void run() {
  long seed = 1234;
  GBMModel mymodel = null;
  Frame train = null;
  Frame test = null;
  Frame data = null;
  GBMModel.GBMParameters p;
  Log.info("");
  Log.info("STARTING.");
  Log.info("Using seed " + seed);
  StringBuilder sb = new StringBuilder();
  double sumerr = 0;
  Map<Double, Double> map = new TreeMap<>();
  for (double missing_fraction : new double[] { 0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99 }) {
    double err = 0;
    try {
      Scope.enter();
      NFSFileVec nfs = TestUtil.makeNfsFileVec("smalldata/junit/weather.csv");
      data = ParseDataset.parse(Key.make("data.hex"), nfs._key);
      Log.info("FrameSplitting");
      // Create holdout test data on clean data (before adding missing values)
      FrameSplitter fs = new FrameSplitter(data, new double[] { 0.75f }, generateNumKeys(data._key, 2), null);
      H2O.submitTask(fs); //.join();
      Frame[] train_test = fs.getResult();
      train = train_test[0];
      test = train_test[1];
      Log.info("Done...");
      // Add missing values to the training data (excluding the response)
      if (missing_fraction > 0) {
        Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
        // Exclude the response
        frtmp.remove(frtmp.numCols() - 1);
        // Need to put the frame (to be modified) into DKV for MissingInserter to pick it up
        DKV.put(frtmp._key, frtmp);
        FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
        // MissingInserter is non-blocking, must block here explicitly
        j.execImpl().get();
        // Delete the frame header (not the data)
        DKV.remove(frtmp._key);
      }
      // Build a regularized GBM model with polluted training data, score on the clean validation set
      p = new GBMModel.GBMParameters();
      p._train = train._key;
      p._valid = test._key;
      p._response_column = train._names[train.numCols() - 1];
      // Only for weather data
      p._ignored_columns = new String[] { train._names[1], train._names[22] };
      p._seed = seed;
      // Convert response to categorical
      int ri = train.numCols() - 1;
      int ci = test.find(p._response_column);
      Scope.track(train.replace(ri, train.vecs()[ri].toCategoricalVec()));
      Scope.track(test.replace(ci, test.vecs()[ci].toCategoricalVec()));
      DKV.put(train);
      DKV.put(test);
      GBM gbm = new GBM(p);
      Log.info("Starting with " + missing_fraction * 100 + "% missing values added.");
      mymodel = gbm.trainModel().get();
      // Extract the scoring on the validation set from the model
      err = ((ModelMetricsBinomial) mymodel._output._validation_metrics).logloss();
      Frame train_preds = mymodel.score(train);
      Assert.assertTrue(mymodel.testJavaScoring(train, train_preds, 1e-15));
      train_preds.remove();
      Log.info("Missing " + missing_fraction * 100 + "% -> logloss: " + err);
    } catch (Throwable t) {
      t.printStackTrace();
      err = 100;
    } finally {
      Scope.exit();
      // Cleanup
      if (mymodel != null) {
        mymodel.delete();
      }
      if (train != null)
        train.delete();
      if (test != null)
        test.delete();
      if (data != null)
        data.delete();
    }
    map.put(missing_fraction, err);
    sumerr += err;
  }
  sb.append("missing fraction --> Error\n");
  for (String s : Arrays.toString(map.entrySet().toArray()).split(",")) sb.append(s.replace("=", " --> ")).append("\n");
  sb.append('\n');
  sb.append("Sum Err: ").append(sumerr).append("\n");
  Log.info(sb.toString());
}
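What MissingInserter does conceptually is blank out roughly missing_fraction of the non-response values, reproducibly for a given seed. A plain-Java illustration of the idea (not the chunk-parallel FrameUtils.MissingInserter implementation):

// Illustrative only: randomly blank out ~missing_fraction of the entries in one column,
// reproducibly for a given seed (the real work is done chunk-parallel by MissingInserter).
double[] col = { 13.4, 22.9, 25.7, 18.0, 32.3, 21.5 };
double missing_fraction = 0.5;
java.util.Random rng = new java.util.Random(1234);
for (int r = 0; r < col.length; ++r)
  if (rng.nextDouble() < missing_fraction)
    col[r] = Double.NaN; // NA in a numeric h2o Vec is represented as NaN
System.out.println(java.util.Arrays.toString(col));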
Use of water.fvec.Frame in project h2o-3 by h2oai: class GBMTest, method testReprodubilityAirlineSingleNode.
@Test
public void testReprodubilityAirlineSingleNode() {
  Frame tfr = null;
  final int N = 10;
  double[] mses = new double[N];
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // Rebalance to a fixed number of chunks
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
    H2O.submitTask(rb);
    rb.join();
    tfr.delete();
    tfr = DKV.get(dest).get();
    // DKV.put(tfr);
    // Drop unused and response-leaking columns
    for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
      tfr.remove(s).remove();
    }
    DKV.put(tfr);
    for (int i = 0; i < N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "IsDepDelayed";
      parms._nbins = 10;
      parms._nbins_cats = 500;
      parms._ntrees = 7;
      parms._max_depth = 5;
      parms._min_rows = 10;
      parms._distribution = DistributionFamily.bernoulli;
      parms._balance_classes = true;
      parms._seed = 0;
      parms._build_tree_one_node = true;
      // Build the first model; all remaining models should be equal
      GBMModel gbm = new GBM(parms).trainModel().get();
      assertEquals(gbm._output._ntrees, parms._ntrees);
      mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
      gbm.delete();
    }
  } finally {
    if (tfr != null)
      tfr.remove();
  }
  Scope.exit();
  System.out.println("MSE");
  for (double d : mses) System.out.println(d);
  // Check for the same result on 1 node and on 5 nodes (will only work with enough chunks)
  for (double mse : mses)
    assertEquals(0.21694215729861027, mse, 1e-8);
}
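A slightly stricter variant of the final check compares every run against the first instead of a hard-coded constant, so the assertion does not depend on one platform's exact MSE. A small sketch of that, illustrative only:

// Illustrative only: require identical training MSE across all repeated runs.
double[] mses = { 0.21694215729861027, 0.21694215729861027, 0.21694215729861027 };
for (double mse : mses)
  org.junit.Assert.assertEquals("GBM should be reproducible with a fixed seed", mses[0], mse, 0.0);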