Search in sources :

Example 26 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstUniOp method exec.

/**
 * Applies this unary operator to its single operand, read from {@code args[1]}.
 *
 * <ul>
 *   <li>number: returns a {@code ValNum} holding {@code op(value)};</li>
 *   <li>frame: every column must be numeric; returns a new frame of the same width whose
 *       cells are {@code op(cell)} and whose columns are named {@code str() + "(" + name + ")"};</li>
 *   <li>row: returns a {@code ValRow} with {@code op} applied element-wise and a copy of the
 *       original names.</li>
 * </ul>
 *
 * @param args call arguments; only index 1 is read here
 * @return the transformed value, of the same kind as the operand
 * @throws IllegalArgumentException if a frame operand contains a non-numeric column
 */
@Override
public Val exec(Val... args) {
    final Val operand = args[1];
    switch (operand.type()) {
        case Val.NUM: {
            return new ValNum(op(operand.getNum()));
        }
        case Val.FRM: {
            final Frame frame = operand.getFrame();
            final int ncols = frame.numCols();
            // Reject the whole frame up front if any column is non-numeric.
            for (int c = 0; c < ncols; c++) {
                if (!frame.vec(c).isNumeric()) {
                    throw new IllegalArgumentException("Operator " + str() + "() cannot be applied to non-numeric column " + frame.name(c));
                }
            }
            // Wrap each column name in the operator name, e.g. "income" becomes "log(income)".
            final String[] newNames = new String[ncols];
            for (int c = 0; c < ncols; c++) {
                newNames[c] = str() + "(" + frame.name(c) + ")";
            }
            return new ValFrame(new MRTask() {

                @Override
                public void map(Chunk[] cs, NewChunk[] ncs) {
                    // One output chunk per input chunk; transform cell by cell.
                    for (int c = 0; c < cs.length; c++) {
                        final Chunk in = cs[c];
                        final NewChunk out = ncs[c];
                        for (int r = 0; r < in._len; r++) {
                            out.addNum(op(in.atd(r)));
                        }
                    }
                }
            }.doAll(ncols, Vec.T_NUM, frame).outputFrame(newNames, null));
        }
        case Val.ROW: {
            final double[] input = operand.getRow();
            final double[] output = new double[input.length];
            for (int k = 0; k < output.length; ++k) {
                output[k] = op(input[k]);
            }
            final String[] rowNames = ((ValRow) operand).getNames().clone();
            return new ValRow(output, rowNames);
        }
        default:
            throw H2O.unimpl("unop unimpl: " + operand.getClass());
    }
}
Also used : Val(water.rapids.Val) ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) ValNum(water.rapids.vals.ValNum) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) NewChunk(water.fvec.NewChunk) ValFrame(water.rapids.vals.ValFrame) ValRow(water.rapids.vals.ValRow) MRTask(water.MRTask)

Example 27 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstHist method apply.

/**
 * Computes a histogram of a single, non-constant numeric column.
 *
 * <p>{@code asts[1]} evaluates to the frame holding the column. {@code asts[2]} selects the
 * binning: a string names an algorithm ("sturges", "rice", "sqrt", "doane", "scott", "fd";
 * any other string falls back to Sturges), a number list gives explicit break points, and a
 * single number gives the number of breaks.
 *
 * <p>The result is a four-column frame {@code breaks, counts, mids_true, mids} with one row
 * per break. The first row carries only a break value; its other three cells are NA. Row
 * {@code r > 0} describes the bin between break {@code r-1} and break {@code r}. {@code mids}
 * is the arithmetic midpoint of the two breaks; {@code mids_true} is whatever
 * {@code HistTask._mids} holds (NOTE(review): presumably a data-driven midpoint - confirm
 * against HistTask).
 *
 * @throws IllegalArgumentException if the frame is not exactly one numeric, non-constant column
 */
@Override
public ValFrame apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    // stack is [ ..., ary, breaks]
    Frame f = stk.track(asts[1].exec(env)).getFrame();
    if (f.numCols() != 1)
        throw new IllegalArgumentException("Hist only applies to single numeric columns.");
    Vec vec = f.anyVec();
    if (!vec.isNumeric())
        throw new IllegalArgumentException("Hist only applies to single numeric columns.");
    //TODO Add case when vec is a constant numeric
    if (vec.isConst())
        throw new IllegalArgumentException("Hist does not apply to constant numeric columns.");

    // handle the breaks argument
    AstRoot a = asts[2];
    String algo = null;
    int numBreaks = -1;
    double[] breaks = null;
    if (a instanceof AstStr)
        // Locale.ROOT keeps the algorithm lookup independent of the JVM default locale
        // (e.g. a Turkish locale lower-cases "RICE" to a dotless-i form that matches no case).
        algo = a.str().toLowerCase(java.util.Locale.ROOT);
    else if (a instanceof AstNumList)
        breaks = ((AstNumList) a).expand();
    else if (a instanceof AstNum)
        numBreaks = (int) a.exec(env).getNum();

    AstHist.HistTask t;
    double h;
    double x1 = vec.max();
    double x0 = vec.min();
    if (breaks != null)
        t = new AstHist.HistTask(breaks, -1, -1).doAll(vec);
    else if (algo != null) {
        switch(algo) {
            case "sturges":
                numBreaks = sturges(vec);
                h = (x1 - x0) / numBreaks;
                break;
            case "rice":
                numBreaks = rice(vec);
                h = (x1 - x0) / numBreaks;
                break;
            case "sqrt":
                numBreaks = sqrt(vec);
                h = (x1 - x0) / numBreaks;
                break;
            case "doane":
                numBreaks = doane(vec);
                h = (x1 - x0) / numBreaks;
                break;
            case "scott":
                // special bin width computation: width first, break count derived from it
                h = scotts_h(vec);
                numBreaks = scott(vec, h);
                break;
            case "fd":
                // special bin width computation: width first, break count derived from it
                h = fds_h(vec);
                numBreaks = fd(vec, h);
                break;
            default:
                // just do sturges even if junk passed in
                numBreaks = sturges(vec);
                h = (x1 - x0) / numBreaks;
        }
        t = new AstHist.HistTask(computeCuts(vec, numBreaks), h, x0).doAll(vec);
    } else {
        h = (x1 - x0) / numBreaks;
        t = new AstHist.HistTask(computeCuts(vec, numBreaks), h, x0).doAll(vec);
    }

    // wanna make a new frame here [breaks,counts,mids_true,mids]
    final double[] brks = t._breaks;
    final long[] cnts = t._counts;
    final double[] mids_true = t._mids;
    final double[] mids = new double[brks.length - 1];
    for (int i = 1; i < brks.length; ++i) mids[i - 1] = .5 * (brks[i - 1] + brks[i]);

    // Temporary all-zero Vec used only to give the output frame one row per break.
    Vec layoutVec = Vec.makeZero(brks.length);
    Frame fr2;
    try {
        fr2 = new MRTask() {

            @Override
            public void map(Chunk[] c, NewChunk[] nc) {
                int start = (int) c[0].start();
                for (int i = 0; i < c[0]._len; ++i) {
                    // Global row index: only the very first break has no bin to its left.
                    // Testing the chunk-local index instead would blank the first row of
                    // every chunk when the layout spans more than one chunk.
                    int row = i + start;
                    nc[0].addNum(brks[row]);
                    if (row == 0) {
                        nc[1].addNA();
                        nc[2].addNA();
                        nc[3].addNA();
                    } else {
                        nc[1].addNum(cnts[row - 1]);
                        nc[2].addNum(mids_true[row - 1]);
                        nc[3].addNum(mids[row - 1]);
                    }
                }
            }
        }.doAll(4, Vec.T_NUM, new Frame(layoutVec)).outputFrame(null, new String[] { "breaks", "counts", "mids_true", "mids" }, null);
    } finally {
        // Always drop the temporary layout Vec, even if the task above failed.
        layoutVec.remove();
    }
    return new ValFrame(fr2);
}
Also used : ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) NewChunk(water.fvec.NewChunk) ValFrame(water.rapids.vals.ValFrame) AstNum(water.rapids.ast.params.AstNum) Vec(water.fvec.Vec) AstStr(water.rapids.ast.params.AstStr) MRTask(water.MRTask) AstRoot(water.rapids.ast.AstRoot) AstNumList(water.rapids.ast.params.AstNumList)

Example 28 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstKFold method stratifiedKFoldColumn.

/**
 * Builds a fold-assignment column for {@code y} in which folds are drawn separately for each
 * class label, using one derived seed per class.
 *
 * @param y      response column; must be categorical or integer-valued numeric
 * @param nfolds number of folds
 * @param seed   base seed; class {@code i} uses a seed drawn from {@code getRNG(seed + i)}
 * @return the second Vec of the working frame (created with {@code y.makeZero()}), holding
 *         the fold id written for each row
 * @throws IllegalArgumentException if {@code y} is neither categorical nor integer numeric
 */
public static Vec stratifiedKFoldColumn(Vec y, final int nfolds, final long seed) {
    if (!y.isCategorical() && !(y.isNumeric() && y.isInt())) {
        throw new IllegalArgumentException("stratification only applies to integer and categorical columns. Got: " + y.get_type_str());
    }
    final long[] classes = new VecUtils.CollectDomain().doAll(y).domain();
    final int nClass;
    if (y.isNumeric()) {
        nClass = classes.length;
    } else {
        nClass = y.domain().length;
    }
    // One independent seed per class, so each class gets its own fold sequence.
    final long[] seeds = new long[nClass];
    for (int cls = 0; cls < nClass; ++cls) {
        seeds[cls] = getRNG(seed + cls).nextLong();
    }
    return new MRTask() {

        // Fold for a row: derived from the absolute row number and the seed of the row's class.
        private int getFoldId(long absoluteRow, long classSeed) {
            int draw = getRNG(absoluteRow + classSeed).nextInt();
            return Math.abs(draw) % nfolds;
        }

        // chks[0] is the response, chks[1] the fold column being filled in.
        //
        // For every (testFold, classLabel) pair the response chunk is scanned once; a row's
        // fold cell is written only when the row carries that class label and its per-class
        // fold draw equals testFold. Because the draw depends only on the absolute row number
        // and the class seed, the outcome does not depend on how rows are split into chunks
        // or on the class mix inside a chunk.
        //
        // Rows with a missing response are spread round-robin by absolute row number.
        //
        // Cost: nfolds*nClass passes over each chunk.
        @Override
        public void map(Chunk[] chks) {
            final Chunk response = chks[0];
            final Chunk foldCol = chks[1];
            final long start = response.start();
            for (int testFold = 0; testFold < nfolds; ++testFold) {
                for (int classLabel = 0; classLabel < nClass; ++classLabel) {
                    for (int row = 0; row < response._len; ++row) {
                        if (response.isNA(row)) {
                            // missing response gets spread around
                            if ((start + row) % nfolds == testFold) {
                                foldCol.set(row, testFold);
                            }
                            continue;
                        }
                        final long label = response.at8(row);
                        final long wanted = classes == null ? classLabel : classes[classLabel];
                        if (label != wanted) {
                            continue;
                        }
                        if (testFold == getFoldId(start + row, seeds[classLabel])) {
                            foldCol.set(row, testFold);
                        }
                    }
                }
            }
        }
    }.doAll(new Frame(y, y.makeZero()))._fr.vec(1);
}
Also used : ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) MRTask(water.MRTask) Chunk(water.fvec.Chunk)

Example 29 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstUnique method apply.

/**
 * Returns a one-column frame of the distinct values of a single-column frame.
 *
 * <p>For a categorical column the result is a sequence Vec built by
 * {@code Vec.makeSeq(0, domain.length, true)} with the column's domain attached, so it is
 * derived from the domain rather than from a scan of the data (NOTE(review): this presumably
 * also reports levels that never occur in the rows - confirm). For any other column the keys
 * collected by {@code UniqTask} are copied into a new Vec of the same type.
 *
 * @param asts {@code asts[1]} evaluates to the input frame
 * @return a frame wrapping the Vec of unique values
 * @throws IllegalArgumentException if the input frame does not have exactly one column
 */
@Override
public ValFrame apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    // Validate the shape before touching column 0, so a frame with no columns reports the
    // intended argument error instead of failing inside fr.vec(0).
    if (fr.numCols() != 1)
        throw new IllegalArgumentException("Unique applies to a single column only.");
    Vec vec0 = fr.vec(0);
    Vec v;
    if (vec0.isCategorical()) {
        v = Vec.makeSeq(0, (long) vec0.domain().length, true);
        v.setDomain(vec0.domain());
        // Store the Vec again after attaching the domain.
        DKV.put(v);
    } else {
        UniqTask t = new UniqTask().doAll(fr);
        int nUniq = t._uniq.size();
        final AstGroup.G[] uniq = t._uniq.keySet().toArray(new AstGroup.G[nUniq]);
        v = Vec.makeZero(nUniq, vec0.get_type());
        // Fill the zeroed Vec in place: row r receives the first group value of uniq[r].
        new MRTask() {

            @Override
            public void map(Chunk c) {
                int start = (int) c.start();
                for (int i = 0; i < c._len; ++i) c.set(i, uniq[i + start]._gs[0]);
            }
        }.doAll(v);
    }
    return new ValFrame(new Frame(v));
}
Also used : ValFrame(water.rapids.vals.ValFrame) ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) AstGroup(water.rapids.ast.prims.mungers.AstGroup) Vec(water.fvec.Vec) MRTask(water.MRTask) Chunk(water.fvec.Chunk)

Example 30 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstApply method rowwise.

// --------------------------------------------------------------------------
// Break each row into its own Row, then execute the function passing the
// 1 argument.  All rows are handled independently inside an MRTask.
//
// The function is first called once on row 0 of the frame to learn how many
// values it returns; that count fixes the number of output columns.
private ValFrame rowwise(Env env, Frame fr, final AstPrimitive fun) {
    final String[] names = fr._names;
    // Current execution scope; needed to lookup variables
    final AstFunction scope = env._scope;

    // Probe with the first row of the frame to determine the size of the output.
    final int ncols = fr.numCols();
    double[] firstRow = new double[ncols];
    for (int c = 0; c < ncols; ++c) {
        firstRow[c] = fr.vec(c).at(0);
    }
    AstRoot[] probeArgs = new AstRoot[] { fun, new AstRow(firstRow, fr.names()) };
    int noutputs = fun.apply(env, env.stk(), probeArgs).getRow().length;

    Frame res = new MRTask() {

        @Override
        public void map(Chunk[] chks, NewChunk[] nc) {
            // Working row; refilled for every input row
            double[] rowBuf = new double[chks.length];
            // Call arguments wrap the working row, so they are built once and reused
            AstRoot[] callArgs = new AstRoot[] { fun, new AstRow(rowBuf, names) };
            // One session and environment per map call, reused for every row
            Session session = new Session();
            Env rowEnv = new Env(session);
            // For proper namespace lookup
            rowEnv._scope = scope;
            for (int r = 0; r < chks[0]._len; r++) {
                // Fill the row
                for (int c = 0; c < chks.length; c++) {
                    rowBuf[c] = chks[c].atd(r);
                }
                try (Env.StackHelp rowStk = rowEnv.stk()) {
                    // Make the call per-row
                    double[] result = fun.apply(rowEnv, rowStk, callArgs).getRow();
                    for (int k = 0; k < nc.length; ++k) {
                        nc[k].addNum(result[k]);
                    }
                }
            }
            // Mostly for the sanity checks
            session.end(null);
        }
    }.doAll(noutputs, Vec.T_NUM, fr).outputFrame();
    return new ValFrame(res);
}
Also used : ValFrame(water.rapids.vals.ValFrame) ValFrame(water.rapids.vals.ValFrame) MRTask(water.MRTask) AstRoot(water.rapids.ast.AstRoot)

Aggregations

MRTask (water.MRTask)55 ValFrame (water.rapids.vals.ValFrame)37 Chunk (water.fvec.Chunk)33 Frame (water.fvec.Frame)33 NewChunk (water.fvec.NewChunk)23 Vec (water.fvec.Vec)17 BufferedString (water.parser.BufferedString)9 ValNum (water.rapids.vals.ValNum)6 Val (water.rapids.Val)5 AstRoot (water.rapids.ast.AstRoot)4 AstNumList (water.rapids.ast.params.AstNumList)4 Key (water.Key)3 Test (org.junit.Test)2 Futures (water.Futures)2 AstNum (water.rapids.ast.params.AstNum)2 AstStr (water.rapids.ast.params.AstStr)2 AstStrList (water.rapids.ast.params.AstStrList)2 AstGroup (water.rapids.ast.prims.mungers.AstGroup)2 ValRow (water.rapids.vals.ValRow)2 DataInfo (hex.DataInfo)1