use of water.fvec.Vec in project h2o-3 by h2oai.
the class AUC2 method perfectAUC.
// ==========
/**
 * Given the probabilities of a 1, and the actuals (0/1), report the perfect
 * AUC found by sorting the entire dataset. Every row is copied into an
 * in-memory array of {@code Pair}s, so this is expensive and only works for
 * small data (probably caps out at about 10M rows).
 *
 * @param vprob predicted probability of class 1 for each row; all values must lie in [0, 1]
 * @param vacts actual labels for each row; must be integer-valued and lie in [0, 1]
 * @return the value computed by {@code perfectAUC(Pair[])} over all rows
 * @throws IllegalArgumentException if the actuals are not 0/1 integers, the
 *         probabilities fall outside [0, 1], the two vectors differ in length,
 *         or there are more rows than fit in a Java array
 */
public static double perfectAUC(Vec vprob, Vec vacts) {
  if (vacts.min() < 0 || vacts.max() > 1 || !vacts.isInt()) {
    throw new IllegalArgumentException("Actuals are either 0 or 1");
  }
  if (vprob.min() < 0 || vprob.max() > 1) {
    throw new IllegalArgumentException("Probabilities are between 0 and 1");
  }
  final long nrows = vprob.length();
  // Rows are paired up by index, so both vectors must describe the same rows.
  if (nrows != vacts.length()) {
    throw new IllegalArgumentException(
        "Probabilities and actuals must have the same length (" + nrows + "!=" + vacts.length() + ")");
  }
  // Guard the narrowing cast below: a silent truncation would score only a
  // prefix of the data (or fail with a negative array size).
  if (nrows > Integer.MAX_VALUE) {
    throw new IllegalArgumentException("Too many rows (" + nrows + ") to sort in memory");
  }
  // Horrible data replication into array of structs, to sort.
  Pair[] ps = new Pair[(int) nrows];
  Vec.Reader rprob = vprob.new Reader();
  Vec.Reader racts = vacts.new Reader();
  for (int i = 0; i < ps.length; i++) {
    ps[i] = new Pair(rprob.at(i), (byte) racts.at8(i));
  }
  return perfectAUC(ps);
}
use of water.fvec.Vec in project h2o-3 by h2oai.
the class ConfusionMatrix method buildCM.
/**
 * Build a confusion matrix from a column of actual classes and a column of
 * predicted classes. The predictions are first adapted to the domain of the
 * actuals, so the resulting matrix is sized and labeled by the actuals'
 * levels (which may include levels that were never predicted). The counting
 * itself, including how missing values are treated, is delegated to
 * {@code CMBuilder}.
 *
 * @param actuals categorical vector holding the true classes
 * @param predictions categorical vector holding the predicted classes
 * @return a confusion matrix built from the {@code CMBuilder} counts and the actuals' domain
 * @throws IllegalArgumentException if either vector is not categorical
 */
public static ConfusionMatrix buildCM(Vec actuals, Vec predictions) {
  if (!actuals.isCategorical()) {
    throw new IllegalArgumentException("actuals must be categorical.");
  }
  if (!predictions.isCategorical()) {
    throw new IllegalArgumentException("predictions must be categorical.");
  }
  // Everything created while adapting/counting is scoped and released on exit.
  Scope.enter();
  try {
    final String[] levels = actuals.domain();
    final Vec remapped = predictions.adaptTo(levels);
    final CMBuilder counts = new CMBuilder(levels.length).doAll(actuals, remapped);
    return new ConfusionMatrix(counts._arr, levels);
  } finally {
    Scope.exit();
  }
}
use of water.fvec.Vec in project h2o-3 by h2oai.
the class DMatrix method transpose.
/**
 * Transpose the Frame as if it was a matrix (i.e. rows become columns).
 * The source is expected to be all numeric. The row count of {@code src}
 * must fit in an {@code int}, since it becomes the column count of the result.
 * The result is allocated in a new Vec key; its rows (one per source column)
 * are split into roughly equal chunks, one chunk per 10000 source columns
 * (at least one), and then filled in by {@code transpose(Frame, Frame)}.
 *
 * @param src frame to transpose
 * @return whatever {@code transpose(Frame, Frame)} returns for {@code src} and the freshly allocated zero frame
 */
public static Frame transpose(Frame src) {
  final long srcRows = src.numRows();
  if ((int) srcRows != srcRows) {
    throw H2O.unimpl();
  }
  final int srcCols = src.numCols();
  final int chunkCount = Math.max(1, srcCols / 10000);
  final int baseRows = srcCols / chunkCount;
  final int extraRows = srcCols % chunkCount;
  // espc[c] is the first row of chunk c; the final entry is the total row count.
  // The first `extraRows` chunks each take one additional row.
  final long[] espc = new long[chunkCount + 1];
  long offset = 0;
  for (int c = 0; c < chunkCount; c++) {
    espc[c] = offset;
    offset += baseRows + (c < extraRows ? 1 : 0);
  }
  espc[chunkCount] = offset;
  Key key = Vec.newKey();
  int rowLayout = Vec.ESPC.rowLayout(key, espc);
  Vec template = new Vec(key, rowLayout);
  Frame target = new Frame(template.makeZeros((int) srcRows));
  return transpose(src, target);
}
use of water.fvec.Vec in project h2o-3 by h2oai.
the class GainsLift method init.
/**
 * Validates the label, prediction and (optional) weight vectors, aligns them
 * to the labels' vector group if needed, and computes the descending list of
 * distinct prediction quantiles stored in {@code _quantiles}.
 *
 * @param job optional enclosing job; when non-null and not yet done, the
 *            quantile model is trained nested inside it, otherwise a
 *            stand-alone quantile job is run and awaited
 * @throws IllegalArgumentException if labels or predictions are missing, the
 *         lengths differ, the labels are not binary integer classes, the
 *         predictions are categorical, or the weights are not numeric
 */
private void init(Job job) throws IllegalArgumentException {
  // Check for missing inputs BEFORE dereferencing them: converting first would
  // turn a missing label column into a NullPointerException and would allocate
  // a categorical copy even when the predictions are missing.
  if (_labels == null || _preds == null) {
    throw new IllegalArgumentException("Missing actualLabels or predictedProbs!");
  }
  _labels = _labels.toCategoricalVec();
  if (_labels.length() != _preds.length()) {
    throw new IllegalArgumentException("Both arguments must have the same length (" + _labels.length() + "!=" + _preds.length() + ")!");
  }
  if (!_labels.isInt()) {
    throw new IllegalArgumentException("Actual column must be integer class labels!");
  }
  if (_labels.cardinality() != -1 && _labels.cardinality() != 2) {
    throw new IllegalArgumentException("Actual column must contain binary class labels, but found cardinality " + _labels.cardinality() + "!");
  }
  if (_preds.isCategorical()) {
    throw new IllegalArgumentException("Predicted probabilities cannot be class labels, expect probabilities.");
  }
  if (_weights != null && !_weights.isNumeric()) {
    throw new IllegalArgumentException("Observation weights must be numeric.");
  }
  // The vectors are from different groups => align them, but properly delete it after computation
  if (!_labels.group().equals(_preds.group())) {
    _preds = _labels.align(_preds);
    Scope.track(_preds);
    if (_weights != null) {
      _weights = _labels.align(_weights);
      Scope.track(_weights);
    }
  }
  // Compile-time toggle: the fast path is currently disabled.
  boolean fast = false;
  if (fast) {
    // FAST VERSION: single-pass, only works with the specific pre-computed quantiles from rollupstats
    assert (_groups == 10);
    // Indices 0..16 of Vec.PERCENTILES must match the values below for the hardcoded picks to be valid.
    assert (Arrays.equals(Vec.PERCENTILES,
        new double[] { 0.001, 0.01, 0.1, 0.2, 0.25, 0.3, 1.0 / 3.0, 0.4, 0.5, 0.6, 2.0 / 3.0, 0.7, 0.75, 0.8, 0.9, 0.99, 0.999 }));
    //HACK: hardcoded quantiles for simplicity (0.9,0.8,...,0.1,0)
    //might do a full pass over the Vec
    double[] rq = _preds.pctiles();
    _quantiles = new double[] { rq[14], rq[13], rq[11], rq[9], rq[8], rq[7], rq[5], rq[3], rq[2], 0 };
  } else {
    // ACCURATE VERSION: multi-pass
    Frame fr = null;
    QuantileModel qm = null;
    try {
      QuantileModel.QuantileParameters qp = new QuantileModel.QuantileParameters();
      if (_weights == null) {
        fr = new Frame(Key.<Frame>make(), new String[] { "predictions" }, new Vec[] { _preds });
      } else {
        fr = new Frame(Key.<Frame>make(), new String[] { "predictions", "weights" }, new Vec[] { _preds, _weights });
        qp._weights_column = "weights";
      }
      // The quantile builder looks its training frame up by key, so publish it.
      DKV.put(fr);
      qp._train = fr._key;
      if (_groups > 0) {
        qp._probs = new double[_groups];
        for (int i = 0; i < _groups; ++i) {
          // This is 0.9, 0.8, 0.7, 0.6, ..., 0.1, 0 for 10 groups
          qp._probs[i] = (_groups - i - 1.) / _groups;
        }
      } else {
        qp._probs = new double[] { 0.99, 0.98, 0.97, 0.96, 0.95, 0.9, 0.85, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0 };
      }
      qm = job != null && !job.isDone() ? new Quantile(qp, job).trainModelNested(null) : new Quantile(qp).trainModel().get();
      _quantiles = qm._output._quantiles[0];
      // Keep only the distinct quantile values, ordered from largest to smallest.
      TreeSet<Double> hs = new TreeSet<>();
      for (double d : _quantiles) {
        hs.add(d);
      }
      _quantiles = new double[hs.size()];
      Iterator<Double> it = hs.descendingIterator();
      int i = 0;
      while (it.hasNext()) {
        _quantiles[i++] = it.next();
      }
    } finally {
      // Drop the temporary model and the frame key; the underlying Vecs are not removed here.
      if (qm != null) {
        qm.remove();
      }
      if (fr != null) {
        DKV.remove(fr._key);
      }
    }
  }
}
use of water.fvec.Vec in project h2o-3 by h2oai.
the class GBMGridTest method testDuplicatesCarsGrid.
//@Ignore("PUBDEV-1643")
/**
 * Runs a GBM grid search whose hyper-parameter space lists every value twice
 * and checks that all models returned by the grid share a single key, i.e.
 * that the duplicated combinations did not produce distinct models.
 */
@Test
public void testDuplicatesCarsGrid() {
  Grid grid = null;
  Frame fr = null;
  Vec old = null;
  try {
    fr = parse_test_file("smalldata/junit/cars_20mpg.csv");
    // Remove unique id
    fr.remove("name").remove();
    old = fr.remove("economy");
    // response to last column
    fr.add("economy", old);
    DKV.put(fr);
    // Setup hyperparameter search space; every value is deliberately duplicated.
    // A plain map is used instead of double-brace initialization so that no
    // anonymous subclass capturing this test instance is handed to the grid search.
    HashMap<String, Object[]> hyperParms = new HashMap<String, Object[]>();
    hyperParms.put("_distribution", new DistributionFamily[] { DistributionFamily.gaussian });
    hyperParms.put("_ntrees", new Integer[] { 5, 5 });
    hyperParms.put("_max_depth", new Integer[] { 2, 2 });
    hyperParms.put("_learn_rate", new Double[] { .1, .1 });
    // Fire off a grid search
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = fr._key;
    params._response_column = "economy";
    Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
    grid = gs.get();
    // Check that duplicate model have not been constructed
    Model[] models = grid.getModels();
    assertTrue("Number of returned models has to be > 0", models.length > 0);
    // But all of them should be the same model; compare keys by value rather
    // than by reference so the check does not depend on key interning.
    Key<Model> modelKey = models[0]._key;
    for (Model m : models) {
      assertTrue("Number of constructed models has to be equal to 1", modelKey.equals(m._key));
    }
  } finally {
    if (old != null) {
      old.remove();
    }
    if (fr != null) {
      fr.remove();
    }
    if (grid != null) {
      grid.remove();
    }
  }
}
Aggregations