Search in sources :

Example 36 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class ShuffleSplitFrameTest method testShuffleSplitWithMultipleColumns.

/**
 * Makes sure that the rows of the split frames are preserved (including UUID).
 *
 * <p>A four-column frame (string, parsed number, in-chunk row index, UUID) is derived from a
 * temporary string frame; the checker task passed to {@code testScenario} then re-derives the
 * expected values from the string column and compares them with the other three columns.
 */
@Test
public void testShuffleSplitWithMultipleColumns() {
    final long[] layout = ar(2L, 2L, 3L);
    final String[][] cells = ar(ar("1", "2"), ar(null, "3"), ar("4", "5", "6"));
    Frame f = null;
    final Frame source = createFrame("test1.hex", layout, cells);
    try {
        final MRTask builder = new MRTask() {

            @Override
            public void map(Chunk[] in, NewChunk[] out) {
                final Chunk text = in[0];
                for (int row = 0; row < text._len; row++) {
                    BufferedString str = text.atStr(new BufferedString(), row);
                    // A missing string is encoded as 0 in the numeric and UUID columns.
                    int parsed;
                    if (str == null) {
                        parsed = 0;
                    } else {
                        parsed = Integer.parseInt(str.toString());
                    }
                    out[0].addStr(str);
                    out[1].addNum(parsed);
                    out[2].addNum(row);
                    out[3].addUUID(row, parsed);
                }
            }
        };
        f = builder.doAll(new byte[] { Vec.T_STR, Vec.T_NUM, Vec.T_NUM, Vec.T_UUID }, source).outputFrame();
    } finally {
        // The source frame is only needed to build f.
        source.delete();
    }
    final MRTask rowChecker = new MRTask() {

        @Override
        public void map(Chunk[] cols) {
            final Chunk text = cols[0];
            for (int row = 0; row < text._len; row++) {
                BufferedString str = text.atStr(new BufferedString(), row);
                int wantVal;
                if (str == null) {
                    wantVal = 0;
                } else {
                    wantVal = Integer.parseInt(str.toString());
                }
                int wantIndex = (int) cols[2].atd(row);
                // Number column and both UUID halves must still agree with the string column.
                Assert.assertEquals((double) wantVal, cols[1].atd(row), 0.00001);
                Assert.assertEquals(wantIndex, (int) cols[3].at16l(row));
                Assert.assertEquals(wantVal, (int) cols[3].at16h(row));
            }
        }
    };
    testScenario(f, flat(cells), rowChecker);
}
Also used : FrameTestUtil.createFrame(water.fvec.FrameTestUtil.createFrame) Frame(water.fvec.Frame) MRTask(water.MRTask) BufferedString(water.parser.BufferedString) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) NewChunk(water.fvec.NewChunk) Test(org.junit.Test)

Example 37 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class AstUniOp method exec.

/**
 * Applies this unary operator to the operand in {@code args[1]}.
 *
 * <ul>
 *   <li>{@code Val.NUM}: returns a {@code ValNum} holding {@code op(number)}.</li>
 *   <li>{@code Val.FRM}: every column must be numeric; returns a new frame with {@code op}
 *       applied to every cell and each column renamed to {@code str() + "(" + name + ")"}.</li>
 *   <li>{@code Val.ROW}: returns a new row with {@code op} applied to every element and a copy
 *       of the original names.</li>
 * </ul>
 *
 * @param args operator arguments; only {@code args[1]} is read here
 * @return the transformed value, of the same kind as the operand
 * @throws IllegalArgumentException if a frame operand contains a non-numeric column
 */
@Override
public Val exec(Val... args) {
    Val val = args[1];
    switch(val.type()) {
        case Val.NUM:
            return new ValNum(op(val.getNum()));
        case Val.FRM:
            Frame fr = val.getFrame();
            for (int i = 0; i < fr.numCols(); i++) {
                if (!fr.vec(i).isNumeric()) {
                    throw new IllegalArgumentException("Operator " + str() + "() cannot be applied to non-numeric column " + fr.name(i));
                }
            }
            // Rename each column to `op(colName)`. For example, a column named "income" that had
            // a log transformation would now be named `log(income)`.
            String[] newNames = new String[fr.numCols()];
            for (int i = 0; i < newNames.length; i++) {
                newNames[i] = str() + "(" + fr.name(i) + ")";
            }
            return new ValFrame(new MRTask() {

                @Override
                public void map(Chunk[] cs, NewChunk[] ncs) {
                    for (int col = 0; col < cs.length; col++) {
                        Chunk c = cs[col];
                        NewChunk nc = ncs[col];
                        for (int i = 0; i < c._len; i++) {
                            nc.addNum(op(c.atd(i)));
                        }
                    }
                }
            }.doAll(fr.numCols(), Vec.T_NUM, fr).outputFrame(newNames, null));
        case Val.ROW:
            // Fetch the row once instead of on every loop iteration.
            double[] row = val.getRow();
            double[] ds = new double[row.length];
            for (int i = 0; i < ds.length; ++i) {
                ds[i] = op(row[i]);
            }
            // NOTE(review): assumes a row may carry no names (getNames() == null) -- confirm against
            // ValRow; the guard is harmless if names are always present.
            String[] rowNames = ((ValRow) val).getNames();
            String[] names = rowNames == null ? null : rowNames.clone();
            return new ValRow(ds, names);
        default:
            throw H2O.unimpl("unop unimpl: " + val.getClass());
    }
}
Also used : Val(water.rapids.Val) ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) ValNum(water.rapids.vals.ValNum) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) NewChunk(water.fvec.NewChunk) ValFrame(water.rapids.vals.ValFrame) ValRow(water.rapids.vals.ValRow) MRTask(water.MRTask)

Example 38 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class AstHist method apply.

/**
 * Builds a histogram of a single, non-constant numeric column.
 *
 * <p>{@code asts[1]} evaluates to the frame holding the column. {@code asts[2]} selects the
 * binning: an {@code AstStr} names a rule ("sturges", "rice", "sqrt", "doane", "scott", "fd";
 * any other string falls back to sturges), an {@code AstNumList} supplies explicit breaks, and
 * an {@code AstNum} supplies the number of breaks.
 *
 * <p>The result has the columns "breaks", "counts", "mids_true" and "mids". Row {@code r} holds
 * break {@code r} together with the count and mids of the bin ending at that break, so the very
 * first row has NA in every column except "breaks".
 *
 * @param env  evaluation environment used to execute the argument ASTs
 * @param stk  stack helper that tracks the input frame
 * @param asts operator arguments as described above
 * @return a frame value with one row per break
 * @throws IllegalArgumentException if the input is not exactly one numeric, non-constant column
 */
@Override
public ValFrame apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    // stack is [ ..., ary, breaks]
    Frame f = stk.track(asts[1].exec(env)).getFrame();
    if (f.numCols() != 1)
        throw new IllegalArgumentException("Hist only applies to single numeric columns.");
    Vec vec = f.anyVec();
    if (!vec.isNumeric())
        throw new IllegalArgumentException("Hist only applies to single numeric columns.");
    //TODO Add case when vec is a constant numeric
    if (vec.isConst())
        throw new IllegalArgumentException("Hist does not apply to constant numeric columns.");

    // handle the breaks
    AstRoot a = asts[2];
    String algo = null;
    int numBreaks = -1;
    double[] breaks = null;
    if (a instanceof AstStr)
        algo = a.str().toLowerCase();
    else if (a instanceof AstNumList)
        breaks = ((AstNumList) a).expand();
    else if (a instanceof AstNum)
        numBreaks = (int) a.exec(env).getNum();

    AstHist.HistTask t;
    double h;
    double x1 = vec.max();
    double x0 = vec.min();
    if (breaks != null)
        t = new AstHist.HistTask(breaks, -1, -1).doAll(vec);
    else if (algo != null) {
        switch(algo) {
            case "rice":
                numBreaks = rice(vec);
                h = (x1 - x0) / numBreaks;
                break;
            case "sqrt":
                numBreaks = sqrt(vec);
                h = (x1 - x0) / numBreaks;
                break;
            case "doane":
                numBreaks = doane(vec);
                h = (x1 - x0) / numBreaks;
                break;
            case "scott":
                // special bin width computation
                h = scotts_h(vec);
                numBreaks = scott(vec, h);
                break;
            case "fd":
                // special bin width computation
                h = fds_h(vec);
                numBreaks = fd(vec, h);
                break;
            case "sturges":
            default:
                // just do sturges even if junk passed in
                numBreaks = sturges(vec);
                h = (x1 - x0) / numBreaks;
        }
        t = new AstHist.HistTask(computeCuts(vec, numBreaks), h, x0).doAll(vec);
    } else {
        h = (x1 - x0) / numBreaks;
        t = new AstHist.HistTask(computeCuts(vec, numBreaks), h, x0).doAll(vec);
    }

    // wanna make a new frame here [breaks,counts,mids]
    final double[] brks = t._breaks;
    final long[] cnts = t._counts;
    final double[] mids_true = t._mids;
    final double[] mids = new double[brks.length - 1];
    for (int i = 1; i < brks.length; ++i) {
        mids[i - 1] = .5 * (brks[i - 1] + brks[i]);
    }

    // layoutVec only provides the row layout (one row per break) for the output frame.
    Vec layoutVec = Vec.makeZero(brks.length);
    Frame fr2;
    try {
        fr2 = new MRTask() {

            @Override
            public void map(Chunk[] c, NewChunk[] nc) {
                int start = (int) c[0].start();
                for (int i = 0; i < c[0]._len; ++i) {
                    int row = i + start;
                    nc[0].addNum(brks[row]);
                    // Only the first row of the whole frame has no preceding bin. Testing the
                    // global row (not the in-chunk index) keeps the first row of later chunks
                    // from being blanked out.
                    if (row == 0) {
                        nc[1].addNA();
                        nc[2].addNA();
                        nc[3].addNA();
                    } else {
                        nc[1].addNum(cnts[row - 1]);
                        nc[2].addNum(mids_true[row - 1]);
                        nc[3].addNum(mids[row - 1]);
                    }
                }
            }
        }.doAll(4, Vec.T_NUM, new Frame(layoutVec)).outputFrame(null, new String[] { "breaks", "counts", "mids_true", "mids" }, null);
    } finally {
        // Drop the temporary layout vec even when the task fails.
        layoutVec.remove();
    }
    return new ValFrame(fr2);
}
Also used : ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) NewChunk(water.fvec.NewChunk) ValFrame(water.rapids.vals.ValFrame) AstNum(water.rapids.ast.params.AstNum) Vec(water.fvec.Vec) AstStr(water.rapids.ast.params.AstStr) MRTask(water.MRTask) AstRoot(water.rapids.ast.AstRoot) AstNumList(water.rapids.ast.params.AstNumList)

Example 39 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class AstKFold method stratifiedKFoldColumn.

/**
 * Builds a fold-assignment column for {@code y} that is stratified by class.
 *
 * @param y      response column; must be categorical or an integer-valued numeric column
 * @param nfolds number of folds
 * @param seed   base seed from which one seed per class is derived
 * @return the second vec of the working frame, holding the fold id written for each row
 * @throws IllegalArgumentException if {@code y} is neither categorical nor integer numeric
 */
public static Vec stratifiedKFoldColumn(Vec y, final int nfolds, final long seed) {
    boolean stratifiable = y.isCategorical() || (y.isNumeric() && y.isInt());
    if (!stratifiable) {
        throw new IllegalArgumentException("stratification only applies to integer and categorical columns. Got: " + y.get_type_str());
    }
    final long[] classes = new VecUtils.CollectDomain().doAll(y).domain();
    final int nClass = y.isNumeric() ? classes.length : y.domain().length;

    // Each class gets its own seed, which the map call below uses to draw that class's folds.
    final long[] seeds = new long[nClass];
    for (int cls = 0; cls < nClass; ++cls) {
        seeds[cls] = getRNG(seed + cls).nextLong();
    }

    // Working frame: column 0 is the response, column 1 is the zero-filled fold column.
    Frame work = new Frame(y, y.makeZero());
    return new MRTask() {

        private int getFoldId(long absoluteRow, long seed) {
            return Math.abs(getRNG(absoluteRow + seed).nextInt()) % nfolds;
        }

        // Fills the fold column (cs[1]) from the response column (cs[0]):
        //   * For every (testFold, classLabel) pair, scan all rows of the chunk.
        //   * A row whose response equals classLabel is written only when testFold is the
        //     fold id drawn for that row with the class's own seed.
        //
        //   Why this balances labels per fold:
        //      Think of one independent KFold column per class. The outer loop picks out
        //      only the holdout rows of each fold, and the class loop narrows those to rows
        //      of a single class, so the assignment depends neither on how rows are spread
        //      over chunks nor on the class mix inside a chunk.
        //
        //   Cost: nfolds*nClass passes over every Chunk, which for "reasonable"
        //   classification problems can mean around 100 passes per Chunk.
        @Override
        public void map(Chunk[] cs) {
            final Chunk response = cs[0];
            final Chunk foldCol = cs[1];
            final long offset = response.start();
            for (int testFold = 0; testFold < nfolds; ++testFold) {
                for (int classLabel = 0; classLabel < nClass; ++classLabel) {
                    for (int row = 0; row < response._len; ++row) {
                        if (response.isNA(row)) {
                            // missing response gets spread around
                            if ((offset + row) % nfolds == testFold) {
                                foldCol.set(row, testFold);
                            }
                            continue;
                        }
                        long actual = response.at8(row);
                        long wanted = classes == null ? classLabel : classes[classLabel];
                        if (actual != wanted) {
                            continue;
                        }
                        if (testFold == getFoldId(offset + row, seeds[classLabel])) {
                            foldCol.set(row, testFold);
                        }
                    }
                }
            }
        }
    }.doAll(work)._fr.vec(1);
}
Also used : ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) MRTask(water.MRTask) Chunk(water.fvec.Chunk)

Example 40 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class AstUnique method apply.

/**
 * Returns the unique values of a single-column frame as a new single-column frame.
 *
 * <p>For a categorical column the result is the sequence {@code 0 .. domain.length - 1} carrying
 * the same domain. For any other column the distinct values are collected with a
 * {@code UniqTask} and written into a new vec of the same type.
 *
 * @param env  evaluation environment used to execute the argument AST
 * @param stk  stack helper that tracks the input frame
 * @param asts operator arguments; {@code asts[1]} evaluates to the input frame
 * @return a frame value wrapping the vec of unique values
 * @throws IllegalArgumentException if the input frame does not have exactly one column
 */
@Override
public ValFrame apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    // Validate the shape before touching column 0, so a frame without columns gets the
    // intended error message instead of failing on the vec access.
    if (fr.numCols() != 1) {
        throw new IllegalArgumentException("Unique applies to a single column only.");
    }
    Vec vec0 = fr.vec(0);
    Vec v;
    if (vec0.isCategorical()) {
        v = Vec.makeSeq(0, (long) vec0.domain().length, true);
        v.setDomain(vec0.domain());
        DKV.put(v);
    } else {
        UniqTask t = new UniqTask().doAll(fr);
        int nUniq = t._uniq.size();
        final AstGroup.G[] uniq = t._uniq.keySet().toArray(new AstGroup.G[nUniq]);
        v = Vec.makeZero(nUniq, vec0.get_type());
        new MRTask() {

            @Override
            public void map(Chunk c) {
                // Global row index = chunk offset + in-chunk index; row r receives uniq[r].
                int start = (int) c.start();
                for (int i = 0; i < c._len; ++i) {
                    c.set(i, uniq[i + start]._gs[0]);
                }
            }
        }.doAll(v);
    }
    return new ValFrame(new Frame(v));
}
Also used : ValFrame(water.rapids.vals.ValFrame) ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) AstGroup(water.rapids.ast.prims.mungers.AstGroup) Vec(water.fvec.Vec) MRTask(water.MRTask) Chunk(water.fvec.Chunk)

Aggregations

Chunk (water.fvec.Chunk) 74, Frame (water.fvec.Frame) 50, NewChunk (water.fvec.NewChunk) 36, MRTask (water.MRTask) 33, Vec (water.fvec.Vec) 30, ValFrame (water.rapids.vals.ValFrame) 26, C0DChunk (water.fvec.C0DChunk) 7, BufferedString (water.parser.BufferedString) 7, Random (java.util.Random) 6, Test (org.junit.Test) 5, MRTask2 (water.MRTask2) 4, Val (water.rapids.Val) 4, Key (water.Key) 3, H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException) 3, AstRoot (water.rapids.ast.AstRoot) 3, AstNumList (water.rapids.ast.params.AstNumList) 3, File (java.io.File) 2, IOException (java.io.IOException) 2, ValNum (water.rapids.vals.ValNum) 2, PrettyPrint (water.util.PrettyPrint) 2