Search in sources :

Example 36 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class TransformWrappedVec method makeVec.

/**
 * Copies the rows of this vec into a freshly produced numeric output vec
 * by running an MRTask over it, then calls {@code remove()} on this object
 * and returns the copy.
 *
 * @return a vec taken from the task's output frame (a single
 *         {@code Vec.T_NUM} output column is requested)
 */
public Vec makeVec() {
    final MRTask copyTask = new MRTask() {

        @Override
        public void map(Chunk src, NewChunk dst) {
            // Transfer every row of the input chunk, [0, _len), to the output chunk.
            src.extractRows(dst, 0, src._len);
        }
    };
    final Vec materialized = copyTask.doAll(Vec.T_NUM, this).outputFrame().anyVec();
    // The copy is complete at this point, so this object can be removed.
    remove();
    return materialized;
}
Also used : MRTask(water.MRTask)

Example 37 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstCorrelation method array.

/**
 * Matrix correlation.  Computes the correlation between every column of
 * {@code frx} and every column of {@code fry}.  The result is a matrix of
 * correlations that is {@code frx.numCols()} wide and {@code fry.numCols()}
 * tall; when both frames have exactly one column a scalar is returned.
 *
 * <p>For {@code Mode.Everything} and {@code Mode.AllObs} the input columns
 * are used as they are.  For any other mode (complete observations) every
 * row containing an NA in any X or Y column is first dropped into a
 * temporary frame, and the statistics are computed on that frame.
 *
 * @param frx  frame holding the X columns
 * @param fry  frame holding the Y columns
 * @param mode NA-handling mode
 * @return a {@code ValNum} for the 1x1 case, otherwise a {@code ValFrame}
 * @throws IllegalArgumentException if {@code mode} is {@code Mode.AllObs}
 *         and any X or Y column contains NAs
 */
private Val array(Frame frx, Frame fry, Mode mode) {
    Vec[] vecxs = frx.vecs();
    Vec[] vecys = fry.vecs();
    if (mode.equals(Mode.Everything) || mode.equals(Mode.AllObs)) {
        if (mode.equals(Mode.AllObs)) {
            // 'all.obs' rejects NAs on either side of the correlation, not only in X.
            for (Vec v : vecxs) {
                if (v.naCnt() != 0)
                    throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
            }
            for (Vec v : vecys) {
                if (v.naCnt() != 0)
                    throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
            }
        }
        return correlationMatrix(frx, vecxs, vecys, fry.numRows(), fry._names);
    }
    //if (mode.equals(Mode.CompleteObs))
    //Omit NA rows between X and Y.
    //This will help with cov, sigma & mean calculations later as we only want to calculate cov, sigma, & mean
    //for rows with no NAs
    final int ncolx = vecxs.length;
    // Build the combined X+Y frame once and reuse it for types, names and domains.
    final Frame frxy = new Frame(frx).add(fry);
    final Frame frxy_naomit = new MRTask() {

        // Appends one input row to the output chunks, using the add method matching each chunk kind.
        private void copyRow(int row, Chunk[] cs, NewChunk[] ncs) {
            for (int i = 0; i < cs.length; ++i) {
                if (cs[i] instanceof CStrChunk)
                    ncs[i].addStr(cs[i], row);
                else if (cs[i] instanceof C16Chunk)
                    ncs[i].addUUID(cs[i], row);
                else if (cs[i].hasFloat())
                    ncs[i].addNum(cs[i].atd(row));
                else
                    ncs[i].addNum(cs[i].at8(row), 0);
            }
        }

        @Override
        public void map(Chunk[] cs, NewChunk[] ncs) {
            int col;
            for (int row = 0; row < cs[0]._len; ++row) {
                // Stop at the first NA; reaching cs.length means the row is NA-free.
                for (col = 0; col < cs.length; ++col) if (cs[col].isNA(row))
                    break;
                if (col == cs.length)
                    copyRow(row, cs, ncs);
            }
        }
    }.doAll(frxy.types(), frxy).outputFrame(frxy.names(), frxy.domains());
    try {
        //Collect new vecs that do not contain NA rows
        final Frame xs_naomit = frxy_naomit.subframe(0, ncolx);
        final Frame ys_naomit = frxy_naomit.subframe(ncolx, frxy_naomit.vecs().length);
        return correlationMatrix(xs_naomit, xs_naomit.vecs(), ys_naomit.vecs(), frxy_naomit.numRows(), ys_naomit._names);
    } finally {
        // The NA-free copy is only an intermediate; the results live in freshly made
        // vecs (or a scalar), so release the temporary frame instead of leaking it.
        frxy_naomit.delete();
    }
}

/**
 * Computes the correlations of all X columns against each Y column.
 * One {@code CoVarTask} is forked per Y column; while the tasks run the
 * standard deviations are gathered, and each task's {@code _covs} is then
 * divided by {@code nrows - 1} and by {@code sigma_y * sigma_x}.
 *
 * @param xs     frame holding the X columns (added to every task's frame)
 * @param vecxs  the X vecs, in column order
 * @param vecys  the Y vecs, in column order
 * @param nrows  row count used for the {@code nrows - 1} divisor
 * @param names  column names for the resulting frame
 * @return a {@code ValNum} for the 1x1 case, otherwise a {@code ValFrame}
 */
private Val correlationMatrix(Frame xs, Vec[] vecxs, Vec[] vecys, long nrows, String[] names) {
    final int ncolx = vecxs.length;
    final int ncoly = vecys.length;
    //Set up CoVarTask
    CoVarTask[] cvs = new CoVarTask[ncoly];
    //Get mean of x vecs
    double[] xmeans = new double[ncolx];
    for (int x = 0; x < ncolx; x++) {
        xmeans[x] = vecxs[x].mean();
    }
    //Set up double arrays to capture sd(x), sd(y) and sd(x) * sd(y)
    double[] sigmay = new double[ncoly];
    double[] sigmax = new double[ncolx];
    double[][] denom = new double[ncoly][ncolx];
    // Launch tasks; each does all Xs vs one Y
    for (int y = 0; y < ncoly; y++) {
        //Get covariance between x and y
        cvs[y] = new CoVarTask(vecys[y].mean(), xmeans).dfork(new Frame(vecys[y]).add(xs));
        //Get sigma of y vecs
        sigmay[y] = vecys[y].sigma();
    }
    //Get sigma of x vecs
    for (int x = 0; x < ncolx; x++) {
        sigmax[x] = vecxs[x].sigma();
    }
    //Denominator for correlation calculation is sigma_y * sigma_x (All x sigmas vs one Y)
    for (int y = 0; y < ncoly; y++) {
        for (int x = 0; x < ncolx; x++) {
            denom[y][x] = sigmay[y] * sigmax[x];
        }
    }
    // 1-col returns scalar
    if (ncolx == 1 && ncoly == 1) {
        return new ValNum((cvs[0].getResult()._covs[0] / (nrows - 1)) / denom[0][0]);
    }
    //Gather final result, which is the correlation coefficient per column
    Vec[] res = new Vec[ncoly];
    Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
    for (int y = 0; y < ncoly; y++) {
        res[y] = Vec.makeVec(ArrayUtils.div(ArrayUtils.div(cvs[y].getResult()._covs, (nrows - 1)), denom[y]), keys[y]);
    }
    return new ValFrame(new Frame(names, res));
}
Also used : ValFrame(water.rapids.vals.ValFrame) ValNum(water.rapids.vals.ValNum) ValFrame(water.rapids.vals.ValFrame) MRTask(water.MRTask) Key(water.Key)

Example 38 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstImpute method apply.

/**
 * Rapids primitive:
 * {@code (h2o.impute data col method combine_method groupby groupByFrame values)}.
 *
 * <p>Replaces NAs in {@code data} in place.  Without group-by columns (and
 * without a group-by frame) each column is filled with a single value; with
 * them, the fill value is looked up per group.
 *
 * @param env  execution environment
 * @param stk  stack helper used to track temporary frames
 * @param asts arguments: [1] frame, [2] column index or -1 for all columns,
 *             [3] method name, [4] combine method for the median,
 *             [5] group-by columns, [6] precomputed group-by frame or "_",
 *             [7] optional explicit fill values
 * @return the per-column fill values ({@code ValNums}) when no grouping is
 *         used, otherwise the frame of per-group imputation values
 * @throws IllegalArgumentException on an out-of-range column, an unknown
 *         method, an unknown group-by column name, a malformed group-by
 *         argument, or an ambiguous group-by frame
 */
@Override
public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    // Argument parsing and sanity checking
    // Whole frame being imputed
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    // Column within frame being imputed
    final int col = (int) asts[2].exec(env).getNum();
    // Reject values below -1 as well, so a negative index never reaches fr.vec(col).
    if (col < -1 || col >= fr.numCols())
        throw new IllegalArgumentException("Column not -1 or in range 0 to " + fr.numCols());
    final boolean doAllVecs = col == -1;
    final Vec vec = doAllVecs ? null : fr.vec(col);
    // Technique used for imputation
    AstRoot method = null;
    boolean ffill0 = false, bfill0 = false;
    switch(asts[3].exec(env).getStr().toUpperCase()) {
        case "MEAN":
            method = new AstMean();
            break;
        case "MEDIAN":
            method = new AstMedian();
            break;
        case "MODE":
            method = new AstMode();
            break;
        case "FFILL":
            ffill0 = true;
            break;
        case "BFILL":
            bfill0 = true;
            break;
        default:
            throw new IllegalArgumentException("Method must be one of mean, median or mode");
    }
    // Only for median, how is the median computed on even sample sizes?
    QuantileModel.CombineMethod combine = QuantileModel.CombineMethod.valueOf(asts[4].exec(env).getStr().toUpperCase());
    // Group-by columns.  Empty is allowed, and perfectly normal.
    AstRoot ast = asts[5];
    AstNumList by2;
    if (ast instanceof AstNumList)
        by2 = (AstNumList) ast;
    else if (ast instanceof AstNum)
        by2 = new AstNumList(((AstNum) ast).getNum());
    else if (ast instanceof AstStrList) {
        String[] names = ((AstStrList) ast)._strs;
        double[] list = new double[names.length];
        int i = 0;
        for (String name : names) {
            int idx = fr.find(name);
            // Fail early on a name that is not in the frame rather than carrying a negative index forward.
            if (idx < 0)
                throw new IllegalArgumentException("Group-by column '" + name + "' not found in frame");
            list[i++] = idx;
        }
        Arrays.sort(list);
        by2 = new AstNumList(list);
    } else
        throw new IllegalArgumentException("Requires a number-list, but found a " + ast.getClass());
    Frame groupByFrame = asts[6].str().equals("_") ? null : stk.track(asts[6].exec(env)).getFrame();
    AstRoot vals = asts[7];
    AstNumList values;
    if (vals instanceof AstNumList)
        values = (AstNumList) vals;
    else if (vals instanceof AstNum)
        values = new AstNumList(((AstNum) vals).getNum());
    else
        values = null;
    boolean doGrpBy = !by2.isEmpty() || groupByFrame != null;
    // Compute the imputed value per-group.  Empty groups are allowed and OK.
    IcedHashMap<AstGroup.G, Freezable[]> group_impute_map;
    if (!doGrpBy) {
        // Skip the grouping work
        if (ffill0 || bfill0) {
            // do a forward/backward fill on the NA
            // TODO: requires chk.previousNonNA and chk.nextNonNA style methods (which may go across chk boundaries)
            throw H2O.unimpl("No ffill or bfill imputation supported");
        } else {
            final double[] res = values == null ? new double[fr.numCols()] : values.expand();
            if (values == null) {
                // fill up res if no values supplied user, common case
                if (doAllVecs) {
                    for (int i = 0; i < res.length; ++i) if (fr.vec(i).isNumeric() || fr.vec(i).isCategorical())
                        res[i] = fr.vec(i).isNumeric() ? fr.vec(i).mean() : ArrayUtils.maxIndex(fr.vec(i).bins());
                } else {
                    // NaN marks "leave this column alone" for the fill task below.
                    Arrays.fill(res, Double.NaN);
                    if (method instanceof AstMean)
                        res[col] = vec.mean();
                    if (method instanceof AstMedian)
                        res[col] = AstMedian.median(new Frame(vec), combine);
                    if (method instanceof AstMode)
                        res[col] = AstMode.mode(vec);
                }
            }
            new MRTask() {

                @Override
                public void map(Chunk[] cs) {
                    int len = cs[0]._len;
                    // run down each chk
                    for (int c = 0; c < cs.length; ++c) if (!Double.isNaN(res[c]))
                        for (int row = 0; row < len; ++row) if (cs[c].isNA(row))
                            cs[c].set(row, res[c]);
                }
            }.doAll(fr);
            return new ValNums(res);
        }
    } else {
        Frame imputes = groupByFrame;
        if (imputes == null) {
            // Build and run a GroupBy command
            AstGroup ast_grp = new AstGroup();
            // simple case where user specified a column... col == -1 means do all columns
            if (doAllVecs) {
                AstRoot[] aggs = new AstRoot[(int) (3 + 3 * (fr.numCols() - by2.cnt()))];
                aggs[0] = ast_grp;
                aggs[1] = new AstFrame(fr);
                aggs[2] = by2;
                int c = 3;
                for (int i = 0; i < fr.numCols(); ++i) {
                    if (!by2.has(i) && (fr.vec(i).isCategorical() || fr.vec(i).isNumeric())) {
                        aggs[c] = fr.vec(i).isNumeric() ? new AstMean() : new AstMode();
                        aggs[c + 1] = new AstNumList(i, i + 1);
                        aggs[c + 2] = new AstStr("rm");
                        c += 3;
                    }
                }
                // Columns that are neither numeric nor categorical are skipped above, which
                // leaves unused trailing slots; drop them so no null entries are passed on.
                if (c < aggs.length)
                    aggs = Arrays.copyOf(aggs, c);
                imputes = ast_grp.apply(env, stk, aggs).getFrame();
            } else
                imputes = ast_grp.apply(env, stk, new AstRoot[] { ast_grp, new AstFrame(fr), by2, /**/
                method, new AstNumList(col, col + 1), new AstStr("rm") }).getFrame();
        }
        // >2 makes it ambiguous which columns are groupby cols and which are aggs, throw IAE
        if (by2.isEmpty() && imputes.numCols() > 2)
            throw new IllegalArgumentException("Ambiguous group-by frame. Supply the `by` columns to proceed.");
        final int[] bycols0 = ArrayUtils.seq(0, Math.max((int) by2.cnt(), 1));
        group_impute_map = new Gather(by2.expand4(), bycols0, fr.numCols(), col).doAll(imputes)._group_impute_map;
        // Now walk over the data, replace NAs with the imputed results
        final IcedHashMap<AstGroup.G, Freezable[]> final_group_impute_map = group_impute_map;
        if (by2.isEmpty()) {
            // No explicit `by` list: map the leading columns of the group-by frame back to columns of fr by name.
            int[] byCols = new int[imputes.numCols() - 1];
            for (int i = 0; i < byCols.length; ++i) byCols[i] = fr.find(imputes.name(i));
            by2 = new AstNumList(byCols);
        }
        final int[] bycols = by2.expand4();
        new MRTask() {

            @Override
            public void map(Chunk[] cs) {
                Set<Integer> _bycolz = new HashSet<>();
                for (int b : bycols) _bycolz.add(b);
                AstGroup.G g = new AstGroup.G(bycols.length, null);
                // For every NA cell outside the group-by columns, look up the row's group and take that column's value.
                for (int row = 0; row < cs[0]._len; row++) for (int c = 0; c < cs.length; ++c) if (!_bycolz.contains(c))
                    if (cs[c].isNA(row))
                        cs[c].set(row, ((IcedDouble) final_group_impute_map.get(g.fill(row, cs, bycols))[c])._val);
            }
        }.doAll(fr);
        return new ValFrame(imputes);
    }
}
Also used : AstStrList(water.rapids.ast.params.AstStrList) IcedDouble(water.util.IcedDouble) Frame(water.fvec.Frame) ValFrame(water.rapids.vals.ValFrame) AstFrame(water.rapids.ast.AstFrame) HashSet(java.util.HashSet) Set(java.util.Set) ValFrame(water.rapids.vals.ValFrame) AstGroup(water.rapids.ast.prims.mungers.AstGroup) MRTask(water.MRTask) AstStr(water.rapids.ast.params.AstStr) AstRoot(water.rapids.ast.AstRoot) AstNumList(water.rapids.ast.params.AstNumList) Chunk(water.fvec.Chunk) QuantileModel(hex.quantile.QuantileModel) AstMedian(water.rapids.ast.prims.reducers.AstMedian) AstNum(water.rapids.ast.params.AstNum) AstFrame(water.rapids.ast.AstFrame) Vec(water.fvec.Vec) ValNums(water.rapids.vals.ValNums) AstMean(water.rapids.ast.prims.reducers.AstMean)

Example 39 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class AstTable method fast_table.

// -------------------------------------------------------------------------
/**
 * Fast path for tabulating a single integer column.
 *
 * <p>Counts occurrences with a dense array indexed by {@code value - min},
 * then emits one output row per value whose count is positive.
 *
 * @param v1      the column to tabulate
 * @param ncols   number of columns being tabulated; only 1 is handled here
 * @param colname name given to the value column of the result
 * @return a two-column frame (value, "Count"), or {@code null} when the fast
 *         path does not apply (more than one column, a non-integer column,
 *         or a value span above 1,000,000)
 */
private ValFrame fast_table(Vec v1, int ncols, String colname) {
    if (!(ncols == 1 && v1.isInt())) {
        return null;
    }
    final long hi = (long) v1.max();
    final long lo = (long) v1.min();
    final long span = hi - lo + 1;
    // Keep the counting array to a reasonable size, for performance
    if (span > 1000000) {
        return null;
    }
    // Pass 1: dense counting
    final AstTable.FastCnt counter = new AstTable.FastCnt(lo, (int) span).doAll(v1);
    final long[] counts = counter._cnts;
    final long base = counter._min;
    // Pass 2: walk a constant vec with one row per possible value and emit only non-zero counts
    final Vec layout = Vec.makeCon(0, counts.length);
    final byte[] outTypes = new byte[] { Vec.T_NUM, Vec.T_NUM };
    final String[] outNames = new String[] { colname, "Count" };
    final String[][] outDomains = new String[][] { v1.domain(), null };
    final Frame result = new MRTask() {

        @Override
        public void map(Chunk[] cs, NewChunk valueOut, NewChunk countOut) {
            final Chunk chk = cs[0];
            for (int r = 0; r < chk._len; ++r) {
                // Global row number doubles as the index into the counts array
                final int slot = (int) (r + chk.start());
                if (counts[slot] <= 0) {
                    continue;
                }
                valueOut.addNum(slot + base);
                countOut.addNum(counts[slot]);
            }
        }
    }.doAll(outTypes, layout).outputFrame(outNames, outDomains);
    layout.remove();
    return new ValFrame(result);
}
Also used : ValFrame(water.rapids.vals.ValFrame) ValFrame(water.rapids.vals.ValFrame) Frame(water.fvec.Frame) Vec(water.fvec.Vec) MRTask(water.MRTask) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) NewChunk(water.fvec.NewChunk)

Example 40 with MRTask

use of water.MRTask in project h2o-3 by h2oai.

the class DataInfoTestAdapt method checkFrame.

/**
 * Compares the dense rows extracted through {@code di} against the columns
 * of {@code gold}, cell by cell, and always drops interactions and removes
 * {@code di} afterwards.
 *
 * <p>A difference above 1e-12 throws, except when {@code skipMissing} is
 * false and the difference is below 10, in which case it is only printed.
 *
 * @param di          data info whose adapted frame is checked
 * @param gold        frame holding the expected values
 * @param skipMissing when true, rows flagged bad by the extracted row are skipped
 */
private void checkFrame(final DataInfo di, final Frame gold, final boolean skipMissing) {
    try {
        final int adaptedCols = di._adaptedFrame.numCols();
        final int goldCols = gold.numCols();
        // Adapted columns first, gold columns right after them
        final Vec[] combined = new Vec[adaptedCols + goldCols];
        System.arraycopy(di._adaptedFrame.vecs(), 0, combined, 0, adaptedCols);
        System.arraycopy(gold.vecs(), 0, combined, adaptedCols, goldCols);
        new MRTask() {

            @Override
            public void map(Chunk[] chks) {
                final int goldOffset = di._adaptedFrame.numCols();
                final DataInfo.Row denseRow = di.newDenseRow();
                for (int row = 0; row < chks[0]._len; ++row) {
                    di.extractDenseRow(chks, row, denseRow);
                    if (skipMissing && denseRow.isBad()) {
                        continue;
                    }
                    for (int j = 0; j < di.fullN(); ++j) {
                        final double expected = chks[goldOffset + j].atd(row);
                        final double actual = denseRow.get(j);
                        final double diff = Math.abs(expected - actual);
                        // Written as a negation so a NaN difference is treated as "no mismatch", as before
                        if (!(diff > 1e-12)) {
                            continue;
                        }
                        if (skipMissing || !(diff < 10)) {
                            throw new RuntimeException("bonk");
                        }
                        System.out.println("row mismatch: " + row + " column= " + j + "; diff= " + diff + " but not skipping missing, so due to discrepancies in taking mean on split frames");
                    }
                }
            }
        }.doAll(combined);
    } finally {
        di.dropInteractions();
        di.remove();
    }
}
Also used : Vec(water.fvec.Vec) MRTask(water.MRTask) Chunk(water.fvec.Chunk)

Aggregations

MRTask (water.MRTask)55 ValFrame (water.rapids.vals.ValFrame)37 Chunk (water.fvec.Chunk)33 Frame (water.fvec.Frame)33 NewChunk (water.fvec.NewChunk)23 Vec (water.fvec.Vec)17 BufferedString (water.parser.BufferedString)9 ValNum (water.rapids.vals.ValNum)6 Val (water.rapids.Val)5 AstRoot (water.rapids.ast.AstRoot)4 AstNumList (water.rapids.ast.params.AstNumList)4 Key (water.Key)3 Test (org.junit.Test)2 Futures (water.Futures)2 AstNum (water.rapids.ast.params.AstNum)2 AstStr (water.rapids.ast.params.AstStr)2 AstStrList (water.rapids.ast.params.AstStrList)2 AstGroup (water.rapids.ast.prims.mungers.AstGroup)2 ValRow (water.rapids.vals.ValRow)2 DataInfo (hex.DataInfo)1