Search in sources :

Example 56 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class DRFGridTest method testDuplicatesCarsGrid.

/**
 * Checks that a DRF grid search over a hyperparameter space made only of
 * duplicated values ends up with one distinct model: every model returned by
 * the grid must carry the same key as the first one.
 *
 * NOTE(review): this excerpt looks truncated -- the initializer braces of the
 * anonymous HashMap, several closing braces and the bodies of the guards in
 * the finally block are not visible here.
 */
public void testDuplicatesCarsGrid() {
    Grid grid = null;
    Frame fr = null;
    Vec old = null;
    try {
        fr = parse_test_file("smalldata/junit/cars_20mpg.csv");
        // Detach the economy column from the frame ...
        old = fr.remove("economy");
        // ... and add it back so that the response is the last column
        fr.add("economy", old);
        // Set up the hyperparameter search space; every parameter lists the
        // same value twice, so all grid points describe the same model
        HashMap<String, Object[]> hyperParms = new HashMap<String, Object[]>() {

                put("_ntrees", new Integer[] { 5, 5 });
                put("_max_depth", new Integer[] { 2, 2 });
                put("_mtries", new Integer[] { -1, -1 });
                put("_sample_rate", new Double[] { .1, .1 });
        // Fire off a grid search
        DRFModel.DRFParameters params = new DRFModel.DRFParameters();
        params._train = fr._key;
        params._response_column = "economy";
        // Get the Grid for this modeling class and frame (blocks until the search job is done)
        Job<Grid> gs = GridSearch.startGridSearch(null, params, hyperParms);
        grid = gs.get();
        // Check that duplicate models have not been constructed
        Model[] models = grid.getModels();
        assertTrue("Number of returned models has to be > 0", models.length > 0);
        // But all of them should be the same model, i.e. share the key of the first one
        Key<Model> modelKey = models[0]._key;
        for (Model m : models) {
            // Reference (==) comparison of the keys.
            // NOTE(review): presumably relies on keys being interned -- confirm.
            assertTrue("Number of constructed models has to be equal to 1", modelKey == m._key);
    } finally {
        // Null-guards for the vec, the frame and the grid; what is done with them
        // is not visible in this excerpt (presumably cleanup -- confirm).
        if (old != null) {
        if (fr != null) {
        if (grid != null) {
Also used : Frame(water.fvec.Frame) HashMap(java.util.HashMap) Grid(hex.grid.Grid) Vec(water.fvec.Vec) Model(hex.Model) Test(org.junit.Test)

Example 57 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class DRFTest method testNfoldsConsecutiveModelsSame.

/**
 * Trains two DRF models one after the other with the very same parameters
 * (3-fold cross-validation, fixed seed) and asserts that their cross-validation
 * AUC, MSE and logloss agree to within 1e-12.
 *
 * NOTE(review): this excerpt looks truncated -- closing braces and the bodies
 * of the guards in the finally block are not visible here.
 */
public void testNfoldsConsecutiveModelsSame() {
    Frame tfr = null;
    Vec old = null;
    DRFModel drf1 = null;
    DRFModel drf2 = null;
    try {
        tfr = parse_test_file("smalldata/junit/cars_20mpg.csv");
        // Detach the economy_20mpg column from the frame ...
        old = tfr.remove("economy_20mpg");
        // ... and add it back, converted to a categorical vec, as the last column (the response)
        tfr.add("economy_20mpg", VecUtils.toCategoricalVec(old));
        DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
        parms._train = tfr._key;
        parms._response_column = "economy_20mpg";
        parms._min_rows = 2;
        parms._max_depth = 2;
        parms._nfolds = 3;
        parms._ntrees = 3;
        // Same seed for both runs
        parms._seed = 77777;
        // Build two models from the identical parameter object
        drf1 = new DRF(parms).trainModel().get();
        drf2 = new DRF(parms).trainModel().get();
        // The cross-validation metrics are read as binomial metrics
        ModelMetricsBinomial mm1 = (ModelMetricsBinomial) drf1._output._cross_validation_metrics;
        ModelMetricsBinomial mm2 = (ModelMetricsBinomial) drf2._output._cross_validation_metrics;
        // Both runs must produce the same metrics
        assertEquals(mm1.auc_obj()._auc, mm2.auc_obj()._auc, 1e-12);
        assertEquals(mm1.mse(), mm2.mse(), 1e-12);
        assertEquals(mm1.logloss(), mm2.logloss(), 1e-12);
    } finally {
        // Null-guards for the frame, the detached vec and both models; what is done
        // with them is not visible in this excerpt (presumably cleanup -- confirm).
        if (tfr != null)
        if (old != null)
        if (drf1 != null) {
        if (drf2 != null) {
Also used : Frame(water.fvec.Frame) SplitFrame(hex.SplitFrame) Vec(water.fvec.Vec) ModelMetricsBinomial(hex.ModelMetricsBinomial) Test(org.junit.Test)

Example 58 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class Merge method merge.

/**
 * Single-threaded driver logic.  Merge left and right frames based on common columns.
 *
 * Visible steps: sanity-check and adjust id_maps; build a RadixOrder index for each
 * side via createIndex; create BinaryMerge RPCs for the left-only MSB ranges (when
 * allLeft) and for the overlapping MSB range; walk the sorted-index header/batch keys
 * of both sides; size the result from the BinaryMerge outputs; build the espc row
 * layout and constant Vecs; wrap them in the returned Frame.
 *
 * NOTE(review): this excerpt looks truncated -- closing braces and several statements
 * (e.g. whatever fills bmList, waits on the futures, removes the DKV keys and runs the
 * ChunkStitcher) are not visible here, so those steps are not described in detail.
 *
 * @param leftFrame left frame
 * @param riteFrame right frame
 * @param leftCols  indices of the join columns in leftFrame
 * @param riteCols  indices of the join columns in riteFrame; a zero-length array means
 *                  there is no right side to join on (hasRite is false)
 * @param allLeft   when true, BinaryMerge tasks are also created for left MSBs that lie
 *                  outside the right key range; it is also handed to the BinaryMerge
 *                  tasks of the overlapping range
 * @param id_maps   per-join-column int maps; entries larger than the max of the matching
 *                  right column are overwritten with -1 (the array is mutated)
 * @return a new Frame holding leftFrame's columns followed by riteFrame's columns from
 *         index numJoinCols onwards
 */
public static Frame merge(final Frame leftFrame, final Frame riteFrame, final int[] leftCols, final int[] riteCols, boolean allLeft, int[][] id_maps) {
    final boolean hasRite = riteCols.length > 0;
    // for now to save a deep branch later
    for (int i = 0; i < id_maps.length; i++) {
        if (id_maps[i] == null)
        assert id_maps[i].length >= leftFrame.vec(leftCols[i]).max() + 1;
        if (!hasRite)
        int right_max = (int) riteFrame.vec(riteCols[i]).max();
        for (int j = 0; j < id_maps[i].length; j++) {
            assert id_maps[i][j] >= 0;
            // mapped ids above the right column's max are marked with -1
            if (id_maps[i][j] > right_max)
                id_maps[i][j] = -1;
    // Running 3 consecutive times on an idle cluster showed that running left
    // and right in parallel was a little slower (97s) than one by one (89s).
    // TODO: retest in future
    RadixOrder leftIndex = createIndex(true, leftFrame, leftCols, id_maps);
    RadixOrder riteIndex = createIndex(false, riteFrame, riteCols, id_maps);
    // TODO: start merging before all indexes had been created. Use callback?
    System.out.print("Making BinaryMerge RPC calls ... ");
    long t0 = System.nanoTime();
    ArrayList<BinaryMerge> bmList = new ArrayList<>();
    Futures fs = new Futures();
    final int leftShift = leftIndex._shift[0];
    final long leftBase = leftIndex._base[0];
    // without a right side, fall back to shift -1 and the left base
    final int riteShift = hasRite ? riteIndex._shift[0] : -1;
    final long riteBase = hasRite ? riteIndex._base[0] : leftBase;
    // which leftMSB does the overlap start
    long leftMSBfrom = (riteBase - leftBase) >> leftShift;
    // deal with the left range below the right minimum, if any
    if (leftBase < riteBase) {
        // deal with the range of the left below the start of the right, if any
        assert leftMSBfrom >= 0;
        if (leftMSBfrom > 255) {
            // The left range ends before the right range starts.  So every left row is a no-match to the right
            // so that the loop below runs for all MSBs (0-255) to fetch the left rows only
            leftMSBfrom = 256;
        // BinaryMerge (if _allLeft)
        if (allLeft)
            for (int leftMSB = 0; leftMSB < leftMSBfrom; leftMSB++) {
                BinaryMerge bm = new BinaryMerge(new BinaryMerge.FFSB(leftFrame, leftMSB, leftShift, leftIndex._bytesUsed, leftIndex._base), new BinaryMerge.FFSB(riteFrame, /*rightMSB*/
                -1, riteShift, riteIndex._bytesUsed, riteIndex._base), true);
                fs.add(new RPC<>(SplitByMSBLocal.ownerOfMSB(leftMSB), bm).call());
    } else {
        // completely ignore right MSBs below the left base
        assert leftMSBfrom <= 0;
        leftMSBfrom = 0;
    // which leftMSB does the overlap end
    long leftMSBto = (riteBase + (256L << riteShift) - 1 - leftBase) >> leftShift;
    // deal with the left range above the right maximum, if any
    if ((leftBase + (256L << leftShift)) > (riteBase + (256L << riteShift))) {
        assert leftMSBto <= 255;
        if (leftMSBto < 0) {
            // The left range starts after the right range ends.  So every left row
            // is a no-match to the right
            // all MSBs (0-255) need to fetch the left rows only
            leftMSBto = -1;
        // run the merge for the whole lefts that start after the last right
        if (allLeft)
            for (int leftMSB = (int) leftMSBto + 1; leftMSB <= 255; leftMSB++) {
                BinaryMerge bm = new BinaryMerge(new BinaryMerge.FFSB(leftFrame, leftMSB, leftShift, leftIndex._bytesUsed, leftIndex._base), new BinaryMerge.FFSB(riteFrame, /*rightMSB*/
                -1, riteShift, riteIndex._bytesUsed, riteIndex._base), true);
                fs.add(new RPC<>(SplitByMSBLocal.ownerOfMSB(leftMSB), bm).call());
    } else {
        // completely ignore right MSBs after the right peak
        assert leftMSBto >= 255;
        leftMSBto = 255;
    // the overlapped region; i.e. between [ max(leftMin,rightMin), min(leftMax, rightMax) ]
    for (int leftMSB = (int) leftMSBfrom; leftMSB <= leftMSBto; leftMSB++) {
        assert leftMSB >= 0;
        assert leftMSB <= 255;
        // calculate the key values at the bin extents:  [leftFrom,leftTo] in terms of keys
        // -1 for leading NA spot
        long leftFrom = (((long) leftMSB) << leftShift) - 1 + leftBase;
        // -1 for leading NA spot and another -1 to get last of previous bin
        long leftTo = (((long) leftMSB + 1) << leftShift) - 1 + leftBase - 1;
        // which right bins do these left extents occur in (could span multiple, and fall in the middle)
        // +1 again for the leading NA spot
        int rightMSBfrom = (int) ((leftFrom - riteBase + 1) >> riteShift);
        int rightMSBto = (int) ((leftTo - riteBase + 1) >> riteShift);
        // the non-matching part of this region will have been dealt with above when allLeft==true
        // clamp the right bin range to [0, 255]
        if (rightMSBfrom < 0)
            rightMSBfrom = 0;
        assert rightMSBfrom <= 255;
        if (rightMSBto > 255)
            rightMSBto = 255;
        assert rightMSBto >= rightMSBfrom;
        for (int rightMSB = rightMSBfrom; rightMSB <= rightMSBto; rightMSB++) {
            BinaryMerge bm = new BinaryMerge(new BinaryMerge.FFSB(leftFrame, leftMSB, leftShift, leftIndex._bytesUsed, leftIndex._base), new BinaryMerge.FFSB(riteFrame, rightMSB, riteShift, riteIndex._bytesUsed, riteIndex._base), allLeft);
            // TODO: choose the bigger side to execute on (where that side of index
            // already is) to minimize transfer.  within BinaryMerge it will
            // recalculate the extents in terms of keys and bsearch for them within
            // the (then local) both sides
            // for now the task is sent to the owner of the right MSB
            H2ONode node = SplitByMSBLocal.ownerOfMSB(rightMSB);
            fs.add(new RPC<>(node, bm).call());
    System.out.println("took: " + String.format("%.3f", (System.nanoTime() - t0) / 1e9));
    t0 = System.nanoTime();
    System.out.println("Sending BinaryMerge async RPC calls in a queue ... ");
    System.out.println("took: " + (System.nanoTime() - t0) / 1e9);
    System.out.print("Removing DKV keys of left and right index.  ... ");
    // TODO: In future we won't delete but rather persist them as index on the table
    // Explicitly deleting here (rather than Arno's cleanUp) to reveal if we're not removing keys early enough elsewhere
    t0 = System.nanoTime();
    // visit the header key of every MSB (0-255) for both sides, and each of its batch keys
    for (int msb = 0; msb < 256; msb++) {
        for (int isLeft = 0; isLeft < 2; isLeft++) {
            Key k = getSortedOXHeaderKey(isLeft != 0, msb);
            SingleThreadRadixOrder.OXHeader oxheader = DKV.getGet(k);
            if (oxheader != null) {
                for (int b = 0; b < oxheader._nBatch; ++b) {
                    k = SplitByMSBLocal.getSortedOXbatchKey(isLeft != 0, msb, b);
    System.out.println("took: " + (System.nanoTime() - t0) / 1e9);
    System.out.print("Allocating and populating chunk info (e.g. size and batch number) ...");
    t0 = System.nanoTime();
    // first pass: total number of result chunks and result rows over all non-empty merges
    long ansN = 0;
    int numChunks = 0;
    for (BinaryMerge thisbm : bmList) if (thisbm._numRowsInResult > 0) {
        numChunks += thisbm._chunkSizes.length;
        ansN += thisbm._numRowsInResult;
    long[] chunkSizes = new long[numChunks];
    // using too much space repeating the same value here, but, limited
    int[] chunkLeftMSB = new int[numChunks];
    int[] chunkRightMSB = new int[numChunks];
    int[] chunkBatch = new int[numChunks];
    // second pass: record size, left/right MSB and batch number of every result chunk
    int k = 0;
    for (BinaryMerge thisbm : bmList) {
        if (thisbm._numRowsInResult == 0)
        int[] thisChunkSizes = thisbm._chunkSizes;
        for (int j = 0; j < thisChunkSizes.length; j++) {
            chunkSizes[k] = thisChunkSizes[j];
            chunkLeftMSB[k] = thisbm._leftSB._msb;
            chunkRightMSB[k] = thisbm._riteSB._msb;
            chunkBatch[k] = j;
    System.out.println("took: " + (System.nanoTime() - t0) / 1e9);
    // Now we can stitch together the final frame from the raw chunks that were
    // put into the store
    System.out.print("Allocating and populated espc ...");
    t0 = System.nanoTime();
    // espc[i] is the running sum of the chunk sizes before chunk i; the last entry is the total
    long[] espc = new long[chunkSizes.length + 1];
    int i = 0;
    long sum = 0;
    for (long s : chunkSizes) {
        espc[i++] = sum;
        sum += s;
    espc[espc.length - 1] = sum;
    System.out.println("took: " + (System.nanoTime() - t0) / 1e9);
    assert (sum == ansN);
    System.out.print("Allocating dummy vecs/chunks of the final frame ...");
    t0 = System.nanoTime();
    int numJoinCols = hasRite ? leftIndex._bytesUsed.length : 0;
    int numLeftCols = leftFrame.numCols();
    // the right frame's first numJoinCols columns are not repeated in the result
    int numColsInResult = numLeftCols + riteFrame.numCols() - numJoinCols;
    final byte[] types = new byte[numColsInResult];
    final String[][] doms = new String[numColsInResult][];
    final String[] names = new String[numColsInResult];
    // all left columns come first ...
    for (int j = 0; j < numLeftCols; j++) {
        types[j] = leftFrame.vec(j).get_type();
        doms[j] =[j];
        names[j] = leftFrame.names()[j];
    // ... followed by the right columns from index numJoinCols onwards
    for (int j = 0; j < riteFrame.numCols() - numJoinCols; j++) {
        types[numLeftCols + j] = riteFrame.vec(j + numJoinCols).get_type();
        doms[numLeftCols + j] =[j + numJoinCols];
        names[numLeftCols + j] = riteFrame.names()[j + numJoinCols];
    Key<Vec> key = Vec.newKey();
    // constant (0) vecs with the espc row layout, one per result column
    Vec[] vecs = new Vec(key, Vec.ESPC.rowLayout(key, espc)).makeCons(numColsInResult, 0, doms, types);
    System.out.println("took: " + (System.nanoTime() - t0) / 1e9);
    System.out.print("Finally stitch together by overwriting dummies ...");
    t0 = System.nanoTime();
    Frame fr = new Frame(names, vecs);
    ChunkStitcher ff = new ChunkStitcher(chunkSizes, chunkLeftMSB, chunkRightMSB, chunkBatch);
    System.out.println("took: " + (System.nanoTime() - t0) / 1e9);
    return fr;
Also used : Frame(water.fvec.Frame) ArrayList(java.util.ArrayList) Vec(water.fvec.Vec) SingleThreadRadixOrder.getSortedOXHeaderKey(water.rapids.SingleThreadRadixOrder.getSortedOXHeaderKey)

Example 59 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class Session method end.

/**
 * Normal session exit.  Returned Frames are fully deep-copied, and are the responsibility of the caller to delete.
 * Returned Frames have their refcnts currently up by 1 (for the returned value itself).
 */
public Val end(Val returning) {
    // NOTE(review): this excerpt looks truncated -- closing braces, the condition of the
    // "if" inside the loop over vecs and the statements behind the "No more temp frames" /
    // "No longer tracking globals" comments are not visible here.
    // Remove all temp frames
    Futures fs = new Futures();
    for (Frame fr : FRAMES.values()) {
        // Remove internal Vecs one by one
        fs = downRefCnt(fr, fs);
        // Shallow remove, internal Vecs removed 1-by-1
        DKV.remove(fr._key, fs);
    // No more temp frames
    // (disappearing) session.
    // Only a returned Frame needs fixing up; any other value (or null) is handed back as is
    if (returning != null && returning.isFrame()) {
        Frame fr = returning.getFrame();
        Vec[] vecs = fr.vecs();
        for (int i = 0; i < vecs.length; i++) {
            // Returning frame has refcnt +1, lower it now; should go to zero internal refcnts.
            _addRefCnt(vecs[i], -1);
            if (// Copy if shared with globals
                fr.replace(i, vecs[i].makeCopy());
    // No longer tracking globals
    return returning;
Also used : Frame(water.fvec.Frame) Futures(water.Futures) Vec(water.fvec.Vec)

Example 60 with Vec

use of water.fvec.Vec in project h2o-3 by h2oai.

the class SingleThreadRadixOrder method compute2.

/**
 * Gathers the _o (long) and _x (key byte) batches of one MSB value from every node of
 * the cloud into batches allocated on this node, following the global chunk order of
 * the frame; copies them into _otmp/_xtmp; radix-sorts them via run(...); and finally
 * puts an OXHeader plus one sorted OXbatch per batch into the DKV.
 *
 * NOTE(review): this excerpt looks truncated -- closing braces and several statements
 * (e.g. the bodies of the null / zero-row guards and the advance of targetBatch) are
 * not visible here.
 */
public void compute2() {
    keytmp = new byte[_keySize];
    counts = new long[_keySize][256];
    Key k;
    // one header per node of the cloud, fetched from the DKV below
    SplitByMSBLocal.MSBNodeHeader[] MSBnodeHeader = new SplitByMSBLocal.MSBNodeHeader[H2O.CLOUD.size()];
    long numRows = 0;
    for (int n = 0; n < H2O.CLOUD.size(); n++) {
        //"Getting MSB " + MSBvalue + " Node Header from node " + n + "/" + H2O.CLOUD.size() + " for Frame " + _fr._key);
        k = SplitByMSBLocal.getMSBNodeHeaderKey(_isLeft, _MSBvalue, n);
        MSBnodeHeader[n] = DKV.getGet(k);
        if (MSBnodeHeader[n] == null)
        // This numRows is split into nbatch batches on that node.
        numRows += ArrayUtils.sum(MSBnodeHeader[n]._MSBnodeChunkCounts);
    // This header has the counts of each chunk (the ordered chunk numbers on that node)
    if (numRows == 0) {
    // Allocate final _o and _x for this MSB which is gathered together on this
    // node from the other nodes.
    // TO DO: as Arno suggested, wrap up into class for fixed width batching
    // (to save espc overhead)
    // at least one batch.
    int nbatch = (int) ((numRows - 1) / _batchSize + 1);
    // the size of the last batch (could be batchSize, too if happens to be
    // exact multiple of batchSize)
    int lastSize = (int) (numRows - (nbatch - 1) * _batchSize);
    _o = new long[nbatch][];
    _x = new byte[nbatch][];
    int b;
    // all batches but the last are full sized
    for (b = 0; b < nbatch - 1; b++) {
        // TO DO?: use MemoryManager.malloc8()
        _o[b] = new long[_batchSize];
        _x[b] = new byte[_batchSize * _keySize];
    // the last batch holds the remaining lastSize rows
    _o[b] = new long[lastSize];
    _x[b] = new byte[lastSize * _keySize];
    SplitByMSBLocal.OXbatch[] ox = new SplitByMSBLocal.OXbatch[H2O.CLOUD.size()];
    // which batch of OX are we on from that node?  Initialized to 0.
    int[] oxBatchNum = new int[H2O.CLOUD.size()];
    for (int node = 0; node < H2O.CLOUD.size(); node++) {
        //TO DO: why is this serial?  Relying on
        k = SplitByMSBLocal.getNodeOXbatchKey(_isLeft, _MSBvalue, node, /*batch=*/
        // assert k.home();   // TODO: PUBDEV-3074
        // get the first batch for each node for this MSB
        ox[node] = DKV.getGet(k);
    // read position (in rows) within the current source batch of each node
    int[] oxOffset = new int[H2O.CLOUD.size()];
    // that node has n chunks and which of those are we currently on?
    int[] oxChunkIdx = new int[H2O.CLOUD.size()];
    int targetBatch = 0, targetOffset = 0, targetBatchRemaining = _batchSize;
    final Vec vec = _fr.anyVec();
    assert vec != null;
    for (int c = 0; c < vec.nChunks(); c++) {
        // each chunk in the column may be on different nodes
        int fromNode = vec.chunkKey(c).home_node().index();
        // high value
        if (MSBnodeHeader[fromNode] == null)
        // magically this works, given the outer for loop through global
        // chunk.  Relies on LINE_ANCHOR_1 above.
        int numRowsToCopy = MSBnodeHeader[fromNode]._MSBnodeChunkCounts[oxChunkIdx[fromNode]++];
        // _MSBnodeChunkCounts is a vector of the number of contributions from
        // each Vec chunk.  Since each chunk's length is an int, this count must
        // be less than that, so an int suffices.  The set of data corresponding
        // to the Vec chunk contributions is stored packed in batched vectors _o and _x.
        // at most batchSize remaining.  No need to actually put the number of rows left in here
        int sourceBatchRemaining = _batchSize - oxOffset[fromNode];
        while (numRowsToCopy > 0) {
            // No need for class now, as this is a bit different to the other batch copier. Two isn't too bad.
            // copy as many rows as the chunk, the source batch and the target batch all allow
            int thisCopy = Math.min(numRowsToCopy, Math.min(sourceBatchRemaining, targetBatchRemaining));
            System.arraycopy(ox[fromNode]._o, oxOffset[fromNode], _o[targetBatch], targetOffset, thisCopy);
            // _x holds _keySize bytes per row, hence the scaled offsets and length
            System.arraycopy(ox[fromNode]._x, oxOffset[fromNode] * _keySize, _x[targetBatch], targetOffset * _keySize, thisCopy * _keySize);
            numRowsToCopy -= thisCopy;
            oxOffset[fromNode] += thisCopy;
            sourceBatchRemaining -= thisCopy;
            targetOffset += thisCopy;
            targetBatchRemaining -= thisCopy;
            if (sourceBatchRemaining == 0) {
                // fetch the next batch :
                k = SplitByMSBLocal.getNodeOXbatchKey(_isLeft, _MSBvalue, fromNode, ++oxBatchNum[fromNode]);
                assert k.home();
                ox[fromNode] = DKV.getGet(k);
                if (ox[fromNode] == null) {
                    // if the last chunksworth fills a batchsize exactly, the getGet above will have returned null.
                    // TODO: Check with Cliff that a known fetch of a non-existent key is ok e.g. won't cause a delay/block? If ok, leave as good check.
                    int numNonZero = 0;
                    for (int tmp : MSBnodeHeader[fromNode]._MSBnodeChunkCounts) if (tmp > 0)
                    assert oxBatchNum[fromNode] == numNonZero;
                    assert ArrayUtils.sum(MSBnodeHeader[fromNode]._MSBnodeChunkCounts) % _batchSize == 0;
                oxOffset[fromNode] = 0;
                sourceBatchRemaining = _batchSize;
            if (targetBatchRemaining == 0) {
                targetOffset = 0;
                targetBatchRemaining = _batchSize;
    // We now have _o and _x collated from all the contributing nodes, in the correct original order.
    // TODO save this allocation and reuse per thread?  Or will heap just take care of it. Time this allocation and copy as step 1 anyway.
    _xtmp = new byte[_x.length][];
    _otmp = new long[_o.length][];
    // i.e. aligned batch size between x and o (think 20 bytes keys and 8 bytes of long in o)
    assert _x.length == _o.length;
    // maybe that needs target to be allocated first
    // _xtmp/_otmp start out as per-batch copies of _x/_o
    for (int i = 0; i < _x.length; i++) {
        _xtmp[i] = Arrays.copyOf(_x[i], _x[i].length);
        _otmp[i] = Arrays.copyOf(_o[i], _o[i].length);
    //        Perhaps iterating this task through the largest bins first will help java reuse heap.
    assert (_o != null);
    assert (numRows > 0);
    // The main work. Radix sort this batch ...
    // if keySize is 6 bytes, first byte is byte 5
    run(0, numRows, _keySize - 1);
    // don't need to clear these now using private transient
    // _counts = null;
    // keytmp = null;
    //_nGroup = null;
    // tell the world how many batches and rows for this MSB
    OXHeader msbh = new OXHeader(_o.length, numRows, _batchSize);
    Futures fs = new Futures();
    DKV.put(getSortedOXHeaderKey(_isLeft, _MSBvalue), msbh, fs, true);
    assert _o.length == _x.length;
    // publish every sorted batch under its own key
    for (b = 0; b < _o.length; b++) {
        SplitByMSBLocal.OXbatch tmp = new SplitByMSBLocal.OXbatch(_o[b], _x[b]);
        Value v = new Value(SplitByMSBLocal.getSortedOXbatchKey(_isLeft, _MSBvalue, b), tmp);
        // the OXbatchKey's on this node will be reused for the new keys
        DKV.put(v._key, v, fs, true);
    // TODO: check numRows is the total of the _x[b] lengths
Also used : Vec(water.fvec.Vec)


Vec (water.fvec.Vec)280 Frame (water.fvec.Frame)213 Test (org.junit.Test)82 NFSFileVec (water.fvec.NFSFileVec)48 ValFrame (water.rapids.vals.ValFrame)47 Chunk (water.fvec.Chunk)30 Random (java.util.Random)25 NewChunk (water.fvec.NewChunk)23 DeepLearningParameters (hex.deeplearning.DeepLearningModel.DeepLearningParameters)22 Key (water.Key)21 MRTask (water.MRTask)17 Val (water.rapids.Val)14 File ( ArrayList (java.util.ArrayList)11 Futures (water.Futures)11 H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException)11 ValNum (water.rapids.vals.ValNum)11 ShuffleSplitFrame (hex.splitframe.ShuffleSplitFrame)10 BufferedString (water.parser.BufferedString)10 AppendableVec (water.fvec.AppendableVec)9