Search in sources :

Example 26 with Frame

use of water.fvec.Frame in project h2o-3 by h2oai.

the class DMatrix method transpose.

/**
 * Transposes the given Frame as if it were a matrix, i.e. rows become columns.
 *
 * <p>The source row count must fit in an {@code int}; otherwise {@code H2O.unimpl()} is thrown.
 * The original documentation additionally states that the frame must be all numeric and that
 * roughly half a million rows or more will fail.
 *
 * <p>The target frame is built on a freshly created Vec key. It has one zero-filled vec per
 * source row, and each vec has as many rows as the source has columns. Those rows are spread over
 * {@code max(1, numCols / 10000)} chunks whose sizes differ by at most one row.
 *
 * @param src the frame to transpose
 * @return the result of {@code transpose(src, target)} for the newly allocated target frame
 */
public static Frame transpose(Frame src) {
    final long rowCount = src.numRows();
    if (rowCount != (int) rowCount)
        throw H2O.unimpl();
    final int colCount = src.numCols();
    final int chunkCount = Math.max(1, colCount / 10000);
    final int baseRows = colCount / chunkCount;
    final int extraRows = colCount % chunkCount;
    // espc[c] is the first row of chunk c; the last entry is the total row count.
    // The first `extraRows` chunks each take one additional row.
    long[] espc = new long[chunkCount + 1];
    for (int c = 0; c < chunkCount; c++) {
        espc[c + 1] = espc[c] + baseRows + (c < extraRows ? 1 : 0);
    }
    Key key = Vec.newKey();
    int rowLayout = Vec.ESPC.rowLayout(key, espc);
    Vec template = new Vec(key, rowLayout);
    Frame target = new Frame(template.makeZeros((int) rowCount));
    return transpose(src, target);
}
Also used : Frame(water.fvec.Frame) Vec(water.fvec.Vec)

Example 27 with Frame

use of water.fvec.Frame in project h2o-3 by h2oai.

the class GainsLift method init.

/**
 * Validates the label, prediction and optional weight vectors, then computes the descending
 * list of distinct quantile thresholds of the predictions and stores it in {@code _quantiles}.
 *
 * <p>The null check runs before any method is called on {@code _labels}, so a missing vector is
 * reported as an {@link IllegalArgumentException} rather than a {@link NullPointerException}.
 *
 * @param job optional enclosing job; when non-null and not done, the quantile model is trained
 *            nested inside it, otherwise a standalone Quantile job is run and awaited
 * @throws IllegalArgumentException if a required vector is missing, the lengths differ, the
 *         labels are not binary integer classes, the predictions are categorical, or the
 *         weights are not numeric
 */
private void init(Job job) throws IllegalArgumentException {
    // Validate presence first: the conversion below dereferences _labels.
    if (_labels == null || _preds == null)
        throw new IllegalArgumentException("Missing actualLabels or predictedProbs!");
    _labels = _labels.toCategoricalVec();
    if (_labels.length() != _preds.length())
        throw new IllegalArgumentException("Both arguments must have the same length (" + _labels.length() + "!=" + _preds.length() + ")!");
    if (!_labels.isInt())
        throw new IllegalArgumentException("Actual column must be integer class labels!");
    if (_labels.cardinality() != -1 && _labels.cardinality() != 2)
        throw new IllegalArgumentException("Actual column must contain binary class labels, but found cardinality " + _labels.cardinality() + "!");
    if (_preds.isCategorical())
        throw new IllegalArgumentException("Predicted probabilities cannot be class labels, expect probabilities.");
    if (_weights != null && !_weights.isNumeric())
        throw new IllegalArgumentException("Observation weights must be numeric.");
    // The vectors are from different groups => align them, but properly delete it after computation
    if (!_labels.group().equals(_preds.group())) {
        _preds = _labels.align(_preds);
        Scope.track(_preds);
        if (_weights != null) {
            _weights = _labels.align(_weights);
            Scope.track(_weights);
        }
    }
    // The single-pass branch is disabled; it is kept for reference only.
    boolean fast = false;
    if (fast) {
        // FAST VERSION: single-pass, only works with the specific pre-computed quantiles from rollupstats
        assert (_groups == 10);
        // Index:        0      1     2    3    4     5    6          7    8    9    10         11   12    13   14   15    16
        assert (Arrays.equals(Vec.PERCENTILES,
        new double[] { 0.001, 0.01, 0.1, 0.2, 0.25, 0.3, 1.0 / 3.0, 0.4, 0.5, 0.6, 2.0 / 3.0, 0.7, 0.75, 0.8, 0.9, 0.99, 0.999 }));
        //HACK: hardcoded quantiles for simplicity (0.9,0.8,...,0.1,0)
        //might do a full pass over the Vec
        double[] rq = _preds.pctiles();
        _quantiles = new double[] { rq[14], rq[13], rq[11], rq[9], rq[8], rq[7], rq[5], rq[3], rq[2], 0 };
    } else {
        // ACCURATE VERSION: multi-pass
        Frame fr = null;
        QuantileModel qm = null;
        try {
            QuantileModel.QuantileParameters qp = new QuantileModel.QuantileParameters();
            if (_weights == null) {
                fr = new Frame(Key.<Frame>make(), new String[] { "predictions" }, new Vec[] { _preds });
            } else {
                fr = new Frame(Key.<Frame>make(), new String[] { "predictions", "weights" }, new Vec[] { _preds, _weights });
                qp._weights_column = "weights";
            }
            // The temporary frame is published so it can be referenced by key as the training frame.
            DKV.put(fr);
            qp._train = fr._key;
            if (_groups > 0) {
                qp._probs = new double[_groups];
                for (int i = 0; i < _groups; ++i) {
                    // This is 0.9, 0.8, 0.7, 0.6, ..., 0.1, 0 for 10 groups
                    qp._probs[i] = (_groups - i - 1.) / _groups;
                }
            } else {
                qp._probs = new double[] { 0.99, 0.98, 0.97, 0.96, 0.95, 0.9, 0.85, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0 };
            }
            qm = job != null && !job.isDone() ? new Quantile(qp, job).trainModelNested(null) : new Quantile(qp).trainModel().get();
            _quantiles = qm._output._quantiles[0];
            // Deduplicate the thresholds and store them in descending order.
            TreeSet<Double> hs = new TreeSet<>();
            for (double d : _quantiles) hs.add(d);
            _quantiles = new double[hs.size()];
            Iterator<Double> it = hs.descendingIterator();
            int i = 0;
            while (it.hasNext()) _quantiles[i++] = it.next();
        } finally {
            // Remove the temporary model and the frame header (by key) even if training failed.
            if (qm != null)
                qm.remove();
            if (fr != null)
                DKV.remove(fr._key);
        }
    }
}
Also used : Frame(water.fvec.Frame) PrettyPrint(water.util.PrettyPrint) QuantileModel(hex.quantile.QuantileModel) Vec(water.fvec.Vec) TreeSet(java.util.TreeSet) Quantile(hex.quantile.Quantile)

Example 28 with Frame

use of water.fvec.Frame in project h2o-3 by h2oai.

the class GBMGridTest method testDuplicatesCarsGrid.

//@Ignore("PUBDEV-1643")
/**
 * Runs a GBM grid search over a hyper-parameter space in which every dimension lists the same
 * value twice, and checks that all models returned by the grid share a single key.
 */
@Test
public void testDuplicatesCarsGrid() {
    Grid searchResult = null;
    Frame cars = null;
    Vec response = null;
    try {
        cars = parse_test_file("smalldata/junit/cars_20mpg.csv");
        // Drop the unique id column
        cars.remove("name").remove();
        // Move the response to the last column
        response = cars.remove("economy");
        cars.add("economy", response);
        DKV.put(cars);
        // Search space: each hyper-parameter carries the same value twice
        HashMap<String, Object[]> searchSpace = new HashMap<String, Object[]>() {

            {
                put("_distribution", new DistributionFamily[] { DistributionFamily.gaussian });
                put("_ntrees", new Integer[] { 5, 5 });
                put("_max_depth", new Integer[] { 2, 2 });
                put("_learn_rate", new Double[] { .1, .1 });
            }
        };
        // Launch the grid search and wait for it to finish
        GBMModel.GBMParameters gbmParams = new GBMModel.GBMParameters();
        gbmParams._train = cars._key;
        gbmParams._response_column = "economy";
        Job<Grid> searchJob = GridSearch.startGridSearch(null, gbmParams, searchSpace);
        searchResult = searchJob.get();
        // No duplicate models may have been built
        Model[] built = searchResult.getModels();
        assertTrue("Number of returned models has to be > 0", built.length > 0);
        // Every returned model must carry the key of the first one
        Key<Model> firstKey = built[0]._key;
        for (int idx = 0; idx < built.length; idx++) {
            assertTrue("Number of constructed models has to be equal to 1", firstKey == built[idx]._key);
        }
    } finally {
        if (response != null) {
            response.remove();
        }
        if (cars != null) {
            cars.remove();
        }
        if (searchResult != null) {
            searchResult.remove();
        }
    }
}
Also used : Frame(water.fvec.Frame) HashMap(java.util.HashMap) Grid(hex.grid.Grid) Vec(water.fvec.Vec) Model(hex.Model) Test(org.junit.Test)

Example 29 with Frame

use of water.fvec.Frame in project h2o-3 by h2oai.

the class GBMMissingTest method run.

/**
 * For each of several missing-value fractions: parses the weather data set, splits it into a
 * training and a hold-out frame, injects missing values into the training predictors, trains a
 * GBM and records the validation logloss. A summary of fraction-to-error pairs and the summed
 * error is logged at the end.
 *
 * <p>Any Throwable raised inside an iteration (including a failed assertion) is printed and
 * recorded as an error of 100; it is not rethrown, so it does not fail this test.
 */
@Test
public void run() {
    long seed = 1234;
    // NOTE(review): these references are declared outside the loop and never reset to null, so
    // the finally block may call delete() on objects left over from a previous iteration when
    // the current iteration fails early.
    GBMModel mymodel = null;
    Frame train = null;
    Frame test = null;
    Frame data = null;
    GBMModel.GBMParameters p;
    Log.info("");
    Log.info("STARTING.");
    Log.info("Using seed " + seed);
    StringBuilder sb = new StringBuilder();
    double sumerr = 0;
    // TreeMap keeps the results ordered by missing fraction for the final report
    Map<Double, Double> map = new TreeMap<>();
    for (double missing_fraction : new double[] { 0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99 }) {
        double err = 0;
        try {
            Scope.enter();
            NFSFileVec nfs = TestUtil.makeNfsFileVec("smalldata/junit/weather.csv");
            data = ParseDataset.parse(Key.make("data.hex"), nfs._key);
            Log.info("FrameSplitting");
            // Create holdout test data on clean data (before adding missing values)
            // (ratio array holds 0.75 -- presumably a 75/25 train/test split; confirm against FrameSplitter)
            FrameSplitter fs = new FrameSplitter(data, new double[] { 0.75f }, generateNumKeys(data._key, 2), null);
            //.join();
            H2O.submitTask(fs);
            Frame[] train_test = fs.getResult();
            train = train_test[0];
            test = train_test[1];
            Log.info("Done...");
            // add missing values to the training data (excluding the response)
            if (missing_fraction > 0) {
                // Temporary frame built from train's own names and vecs
                Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
                //exclude the response
                frtmp.remove(frtmp.numCols() - 1);
                //need to put the frame (to be modified) into DKV for MissingInserter to pick up
                DKV.put(frtmp._key, frtmp);
                FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
                //MissingInserter is non-blocking, must block here explicitly
                j.execImpl().get();
                //Delete the frame header (not the data)
                DKV.remove(frtmp._key);
            }
            // Build a regularized GBM model with polluted training data, score on clean validation set
            p = new GBMModel.GBMParameters();
            p._train = train._key;
            p._valid = test._key;
            // The response is the last column of the training frame
            p._response_column = train._names[train.numCols() - 1];
            //only for weather data
            p._ignored_columns = new String[] { train._names[1], train._names[22] };
            p._seed = seed;
            // Convert response to categorical
            int ri = train.numCols() - 1;
            int ci = test.find(p._response_column);
            // The value returned by replace() is handed to Scope.track (presumably the replaced vec, so it is cleaned up on Scope.exit -- confirm)
            Scope.track(train.replace(ri, train.vecs()[ri].toCategoricalVec()));
            Scope.track(test.replace(ci, test.vecs()[ci].toCategoricalVec()));
            // Re-publish both frames after their response columns were replaced
            DKV.put(train);
            DKV.put(test);
            GBM gbm = new GBM(p);
            Log.info("Starting with " + missing_fraction * 100 + "% missing values added.");
            mymodel = gbm.trainModel().get();
            // Extract the scoring on validation set from the model
            err = ((ModelMetricsBinomial) mymodel._output._validation_metrics).logloss();
            Frame train_preds = mymodel.score(train);
            Assert.assertTrue(mymodel.testJavaScoring(train, train_preds, 1e-15));
            train_preds.remove();
            Log.info("Missing " + missing_fraction * 100 + "% -> logloss: " + err);
        } catch (Throwable t) {
            // Failures (including the assertion above) are recorded as err = 100 instead of failing the test
            t.printStackTrace();
            err = 100;
        } finally {
            Scope.exit();
            // cleanup
            if (mymodel != null) {
                mymodel.delete();
            }
            if (train != null)
                train.delete();
            if (test != null)
                test.delete();
            if (data != null)
                data.delete();
        }
        map.put(missing_fraction, err);
        sumerr += err;
    }
    // Build the report: one "fraction --> error" line per map entry, then the summed error
    sb.append("missing fraction --> Error\n");
    for (String s : Arrays.toString(map.entrySet().toArray()).split(",")) sb.append(s.replace("=", " --> ")).append("\n");
    sb.append('\n');
    sb.append("Sum Err: ").append(sumerr).append("\n");
    Log.info(sb.toString());
}
Also used : FrameUtils(water.util.FrameUtils) Frame(water.fvec.Frame) NFSFileVec(water.fvec.NFSFileVec) TreeMap(java.util.TreeMap) FrameSplitter(hex.FrameSplitter) Test(org.junit.Test)

Example 30 with Frame

use of water.fvec.Frame in project h2o-3 by h2oai.

the class GBMTest method testReprodubilityAirlineSingleNode.

/**
 * Trains the same GBM configuration N times on a rebalanced airlines frame and checks that every
 * run ends with the same final training MSE.
 *
 * <p>The scope opened by {@code Scope.enter()} is closed in a {@code finally} block, and each
 * trained model is deleted in a {@code finally} block, so a failing run does not leave the scope
 * open or a model behind.
 */
@Test
public void testReprodubilityAirlineSingleNode() {
    Frame tfr = null;
    final int N = 10;
    double[] mses = new double[N];
    Scope.enter();
    try {
        // Load data, hack frames
        tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
        // rebalance to fixed number of chunks
        Key dest = Key.make("df.rebalanced.hex");
        RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
        H2O.submitTask(rb);
        rb.join();
        // Swap the parsed frame for its rebalanced copy
        tfr.delete();
        tfr = DKV.get(dest).get();
        // Drop the listed columns from the frame before training
        for (String s : new String[] { "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) {
            tfr.remove(s).remove();
        }
        DKV.put(tfr);
        for (int i = 0; i < N; ++i) {
            GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
            parms._train = tfr._key;
            parms._response_column = "IsDepDelayed";
            parms._nbins = 10;
            parms._nbins_cats = 500;
            parms._ntrees = 7;
            parms._max_depth = 5;
            parms._min_rows = 10;
            parms._distribution = DistributionFamily.bernoulli;
            parms._balance_classes = true;
            parms._seed = 0;
            parms._build_tree_one_node = true;
            // Every run uses identical parameters; all resulting models should be equal
            GBMModel gbm = new GBM(parms).trainModel().get();
            try {
                assertEquals(parms._ntrees, gbm._output._ntrees);
                mses[i] = gbm._output._scored_train[gbm._output._scored_train.length - 1]._mse;
            } finally {
                // Delete the model even when the assertion above fails
                gbm.delete();
            }
        }
    } finally {
        try {
            if (tfr != null)
                tfr.remove();
        } finally {
            // Always close the scope opened before the try block
            Scope.exit();
        }
    }
    System.out.println("MSE");
    for (double d : mses) System.out.println(d);
    //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
    for (double mse : mses) {
        assertEquals(0.21694215729861027, mse, 1e-8);
    }
}
Also used : Frame(water.fvec.Frame) RebalanceDataSet(water.fvec.RebalanceDataSet) Test(org.junit.Test)

Aggregations

Frame (water.fvec.Frame)782 Test (org.junit.Test)435 Vec (water.fvec.Vec)215 ValFrame (water.rapids.vals.ValFrame)132 NFSFileVec (water.fvec.NFSFileVec)66 Val (water.rapids.Val)65 SplitFrame (hex.SplitFrame)59 Key (water.Key)56 DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters)54 Chunk (water.fvec.Chunk)50 NewChunk (water.fvec.NewChunk)37 MRTask (water.MRTask)33 ShuffleSplitFrame (hex.splitframe.ShuffleSplitFrame)31 Ignore (org.junit.Ignore)28 Random (java.util.Random)26 File (java.io.File)25 BufferedString (water.parser.BufferedString)21 H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException)19 HashMap (java.util.HashMap)17 hex (hex)16