Example 36 with MRTask

From class TransformWrappedVec, method makeVec.

public Vec makeVec() {
    Vec v = new MRTask() {

        public void map(Chunk c, NewChunk nc) {
            c.extractRows(nc, 0, c._len);
    }.doAll(Vec.T_NUM, this).outputFrame().anyVec();
    return v;
Example 37 with MRTask

From class AstCorrelation, method array.

// Matrix correlation.  Compute correlation between all columns from each Frame
// against each other.  Return a matrix of correlations which is frx.numCols
// wide and fry.numCols tall.
private Val array(Frame frx, Frame fry, Mode mode) {
    Vec[] vecxs = frx.vecs();
    int ncolx = vecxs.length;
    Vec[] vecys = fry.vecs();
    int ncoly = vecys.length;
    if (mode.equals(Mode.Everything) || mode.equals(Mode.AllObs)) {
        if (mode.equals(Mode.AllObs)) {
            for (Vec v : vecxs) if (v.naCnt() != 0)
                throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
        //Set up CoVarTask
        CoVarTask[] cvs = new CoVarTask[ncoly];
        //Get mean of x vecs
        double[] xmeans = new double[ncolx];
        for (int x = 0; x < ncolx; x++) {
            xmeans[x] = vecxs[x].mean();
        //Set up double arrays to capture sd(x), sd(y) and sd(x) * sd(y)
        double[] sigmay = new double[ncoly];
        double[] sigmax = new double[ncolx];
        double[][] denom = new double[ncoly][ncolx];
        // Launch tasks; each does all Xs vs one Y
        for (int y = 0; y < ncoly; y++) {
            //Get covariance between x and y
            cvs[y] = new CoVarTask(vecys[y].mean(), xmeans).dfork(new Frame(vecys[y]).add(frx));
            //Get sigma of y vecs
            sigmay[y] = vecys[y].sigma();
        //Get sigma of x vecs
        for (int x = 0; x < ncolx; x++) {
            sigmax[x] = vecxs[x].sigma();
        //Denominator for correlation calculation is sigma_y * sigma_x (All x sigmas vs one Y)
        for (int y = 0; y < ncoly; y++) {
            for (int x = 0; x < ncolx; x++) {
                denom[y][x] = sigmay[y] * sigmax[x];
        // 1-col returns scalar
        if (ncolx == 1 && ncoly == 1) {
            return new ValNum((cvs[0].getResult()._covs[0] / (fry.numRows() - 1)) / denom[0][0]);
        //Gather final result, which is the correlation coefficient per column
        Vec[] res = new Vec[ncoly];
        Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
        for (int y = 0; y < ncoly; y++) {
            res[y] = Vec.makeVec(ArrayUtils.div(ArrayUtils.div(cvs[y].getResult()._covs, (fry.numRows() - 1)), denom[y]), keys[y]);
        return new ValFrame(new Frame(fry._names, res));
    } else {
        //if (mode.equals(Mode.CompleteObs))
        //Omit NA rows between X and Y.
        //This will help with cov, sigma & mean calculations later as we only want to calculate cov, sigma, & mean
        //for rows with no NAs
        Frame frxy_naomit = new MRTask() {

            private void copyRow(int row, Chunk[] cs, NewChunk[] ncs) {
                for (int i = 0; i < cs.length; ++i) {
                    if (cs[i] instanceof CStrChunk)
                        ncs[i].addStr(cs[i], row);
                    else if (cs[i] instanceof C16Chunk)
                        ncs[i].addUUID(cs[i], row);
                    else if (cs[i].hasFloat())
                        ncs[i].addNum(cs[i].at8(row), 0);

            public void map(Chunk[] cs, NewChunk[] ncs) {
                int col;
                for (int row = 0; row < cs[0]._len; ++row) {
                    for (col = 0; col < cs.length; ++col) if (cs[col].isNA(row))
                    if (col == cs.length)
                        copyRow(row, cs, ncs);
        }.doAll(new Frame(frx).add(fry).types(), new Frame(frx).add(fry)).outputFrame(new Frame(frx).add(fry).names(), new Frame(frx).add(fry).domains());
        //Collect new vecs that do not contain NA rows
        Vec[] vecxs_naomit = frxy_naomit.subframe(0, ncolx).vecs();
        int ncolx_naomit = vecxs_naomit.length;
        Vec[] vecys_naomit = frxy_naomit.subframe(ncolx, frxy_naomit.vecs().length).vecs();
        int ncoly_naomit = vecys_naomit.length;
        //Set up CoVarTask
        CoVarTask[] cvs = new CoVarTask[ncoly_naomit];
        //Get mean of X vecs
        double[] xmeans = new double[ncolx_naomit];
        for (int x = 0; x < ncolx_naomit; x++) {
            xmeans[x] = vecxs_naomit[x].mean();
        //Set up double arrays to capture sd(x), sd(y) and sd(x) * sd(y)
        double[] sigmay = new double[ncoly_naomit];
        double[] sigmax = new double[ncolx_naomit];
        double[][] denom = new double[ncoly_naomit][ncolx_naomit];
        // Launch tasks; each does all Xs vs one Y
        for (int y = 0; y < ncoly_naomit; y++) {
            //Get covariance between x and y
            cvs[y] = new CoVarTask(vecys_naomit[y].mean(), xmeans).dfork(new Frame(vecys_naomit[y]).add(frxy_naomit.subframe(0, ncolx)));
            //Get sigma of y vecs
            sigmay[y] = vecys_naomit[y].sigma();
        //Get sigma of x vecs
        for (int x = 0; x < ncolx_naomit; x++) {
            sigmax[x] = vecxs_naomit[x].sigma();
        //Denominator for correlation calculation is sigma_y * sigma_x (All x sigmas vs one Y)
        for (int y = 0; y < ncoly_naomit; y++) {
            for (int x = 0; x < ncolx_naomit; x++) {
                denom[y][x] = sigmay[y] * sigmax[x];
        // 1-col returns scalar
        if (ncolx_naomit == 1 && ncoly_naomit == 1) {
            return new ValNum((cvs[0].getResult()._covs[0] / (frxy_naomit.numRows() - 1)) / denom[0][0]);
        //Gather final result, which is the correlation coefficient per column
        Vec[] res = new Vec[ncoly_naomit];
        Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly_naomit);
        for (int y = 0; y < ncoly_naomit; y++) {
            res[y] = Vec.makeVec(ArrayUtils.div(ArrayUtils.div(cvs[y].getResult()._covs, (frxy_naomit.numRows() - 1)), denom[y]), keys[y]);
        return new ValFrame(new Frame(frxy_naomit.subframe(ncolx, frxy_naomit.vecs().length)._names, res));
Example 38 with MRTask

From class AstImpute, method apply.

// (h2o.impute data col method combine_method groupby groupByFrame values)
public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    // Argument parsing and sanity checking
    // Whole frame being imputed
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    // Column within frame being imputed
    final int col = (int) asts[2].exec(env).getNum();
    if (col >= fr.numCols())
        throw new IllegalArgumentException("Column not -1 or in range 0 to " + fr.numCols());
    final boolean doAllVecs = col == -1;
    final Vec vec = doAllVecs ? null : fr.vec(col);
    // Technique used for imputation
    AstRoot method = null;
    boolean ffill0 = false, bfill0 = false;
    switch(asts[3].exec(env).getStr().toUpperCase()) {
        case "MEAN":
            method = new AstMean();
        case "MEDIAN":
            method = new AstMedian();
        case "MODE":
            method = new AstMode();
        case "FFILL":
            ffill0 = true;
        case "BFILL":
            bfill0 = true;
            throw new IllegalArgumentException("Method must be one of mean, median or mode");
    // Only for median, how is the median computed on even sample sizes?
    QuantileModel.CombineMethod combine = QuantileModel.CombineMethod.valueOf(asts[4].exec(env).getStr().toUpperCase());
    // Group-by columns.  Empty is allowed, and perfectly normal.
    AstRoot ast = asts[5];
    AstNumList by2;
    if (ast instanceof AstNumList)
        by2 = (AstNumList) ast;
    else if (ast instanceof AstNum)
        by2 = new AstNumList(((AstNum) ast).getNum());
    else if (ast instanceof AstStrList) {
        String[] names = ((AstStrList) ast)._strs;
        double[] list = new double[names.length];
        int i = 0;
        for (String name : ((AstStrList) ast)._strs) list[i++] = fr.find(name);
        by2 = new AstNumList(list);
    } else
        throw new IllegalArgumentException("Requires a number-list, but found a " + ast.getClass());
    Frame groupByFrame = asts[6].str().equals("_") ? null : stk.track(asts[6].exec(env)).getFrame();
    AstRoot vals = asts[7];
    AstNumList values;
    if (vals instanceof AstNumList)
        values = (AstNumList) vals;
    else if (vals instanceof AstNum)
        values = new AstNumList(((AstNum) vals).getNum());
        values = null;
    boolean doGrpBy = !by2.isEmpty() || groupByFrame != null;
    // Compute the imputed value per-group.  Empty groups are allowed and OK.
    IcedHashMap<AstGroup.G, Freezable[]> group_impute_map;
    if (!doGrpBy) {
        // Skip the grouping work
        if (ffill0 || bfill0) {
            // do a forward/backward fill on the NA
            // TODO: requires chk.previousNonNA and chk.nextNonNA style methods (which may go across chk boundaries)s
            final boolean ffill = ffill0;
            final boolean bfill = bfill0;
            throw H2O.unimpl("No ffill or bfill imputation supported");
        //        new MRTask() {
        //          @Override public void map(Chunk[] cs) {
        //            int len=cs[0]._len; // end of this chk
        //            long start=cs[0].start();  // absolute beginning of chk s.t. start-1 bleeds into previous chk
        //            long absEnd = start+len;   // absolute end of the chk s.t. absEnd+1 bleeds into next chk
        //            for(int c=0;c<cs.length;++c )
        //              for(int r=0;r<cs[0]._len;++r ) {
        //                if( cs[c].isNA(r) ) {
        //                  if( r > 0 && r < len-1 ) {
        //                    cs[c].set(r,ffill?)
        //                  }
        //                }
        //              }
        //          }
        //        }.doAll(doAllVecs?fr:new Frame(vec));
        //        return new ValNum(Double.NaN);
        } else {
            final double[] res = values == null ? new double[fr.numCols()] : values.expand();
            if (values == null) {
                // fill up res if no values supplied user, common case
                if (doAllVecs) {
                    for (int i = 0; i < res.length; ++i) if (fr.vec(i).isNumeric() || fr.vec(i).isCategorical())
                        res[i] = fr.vec(i).isNumeric() ? fr.vec(i).mean() : ArrayUtils.maxIndex(fr.vec(i).bins());
                } else {
                    Arrays.fill(res, Double.NaN);
                    if (method instanceof AstMean)
                        res[col] = vec.mean();
                    if (method instanceof AstMedian)
                        res[col] = AstMedian.median(new Frame(vec), combine);
                    if (method instanceof AstMode)
                        res[col] = AstMode.mode(vec);
            new MRTask() {

                public void map(Chunk[] cs) {
                    int len = cs[0]._len;
                    // run down each chk
                    for (int c = 0; c < cs.length; ++c) if (!Double.isNaN(res[c]))
                        for (int row = 0; row < len; ++row) if (cs[c].isNA(row))
                            cs[c].set(row, res[c]);
            return new ValNums(res);
    } else {
        if (col >= fr.numCols())
            throw new IllegalArgumentException("Column not -1 or in range 0 to " + fr.numCols());
        Frame imputes = groupByFrame;
        if (imputes == null) {
            // Build and run a GroupBy command
            AstGroup ast_grp = new AstGroup();
            // simple case where user specified a column... col == -1 means do all columns
            if (doAllVecs) {
                AstRoot[] aggs = new AstRoot[(int) (3 + 3 * (fr.numCols() - by2.cnt()))];
                aggs[0] = ast_grp;
                aggs[1] = new AstFrame(fr);
                aggs[2] = by2;
                int c = 3;
                for (int i = 0; i < fr.numCols(); ++i) {
                    if (!by2.has(i) && (fr.vec(i).isCategorical() || fr.vec(i).isNumeric())) {
                        aggs[c] = fr.vec(i).isNumeric() ? new AstMean() : new AstMode();
                        aggs[c + 1] = new AstNumList(i, i + 1);
                        aggs[c + 2] = new AstStr("rm");
                        c += 3;
                imputes = ast_grp.apply(env, stk, aggs).getFrame();
            } else
                imputes = ast_grp.apply(env, stk, new AstRoot[] { ast_grp, new AstFrame(fr), by2, /**/
                method, new AstNumList(col, col + 1), new AstStr("rm") }).getFrame();
        if (// >2 makes it ambiguous which columns are groupby cols and which are aggs, throw IAE
        by2.isEmpty() && imputes.numCols() > 2)
            throw new IllegalArgumentException("Ambiguous group-by frame. Supply the `by` columns to proceed.");
        final int[] bycols0 = ArrayUtils.seq(0, Math.max((int) by2.cnt(), 1));
        group_impute_map = new Gather(by2.expand4(), bycols0, fr.numCols(), col).doAll(imputes)._group_impute_map;
        // Now walk over the data, replace NAs with the imputed results
        final IcedHashMap<AstGroup.G, Freezable[]> final_group_impute_map = group_impute_map;
        if (by2.isEmpty()) {
            int[] byCols = new int[imputes.numCols() - 1];
            for (int i = 0; i < byCols.length; ++i) byCols[i] = fr.find(;
            by2 = new AstNumList(byCols);
        final int[] bycols = by2.expand4();
        new MRTask() {

            public void map(Chunk[] cs) {
                Set<Integer> _bycolz = new HashSet<>();
                for (int b : bycols) _bycolz.add(b);
                AstGroup.G g = new AstGroup.G(bycols.length, null);
                for (int row = 0; row < cs[0]._len; row++) for (int c = 0; c < cs.length; ++c) if (!_bycolz.contains(c))
                    if (cs[c].isNA(row))
                        cs[c].set(row, ((IcedDouble) final_group_impute_map.get(g.fill(row, cs, bycols))[c])._val);
        return new ValFrame(imputes);
Example 39 with MRTask

From class AstTable, method fast_table.

// -------------------------------------------------------------------------
// Fast-path for 1 integer column
private ValFrame fast_table(Vec v1, int ncols, String colname) {
    if (ncols != 1 || !v1.isInt())
        return null;
    long spanl = (long) v1.max() - (long) v1.min() + 1;
    // Cap at decent array size, for performance
    if (spanl > 1000000)
        return null;
    // First fast-pass counting
    AstTable.FastCnt fastCnt = new AstTable.FastCnt((long) v1.min(), (int) spanl).doAll(v1);
    final long[] cnts = fastCnt._cnts;
    final long minVal = fastCnt._min;
    // Second pass to build the result frame, skipping zeros
    Vec dataLayoutVec = Vec.makeCon(0, cnts.length);
    Frame fr = new MRTask() {

        public void map(Chunk[] cs, NewChunk nc0, NewChunk nc1) {
            final Chunk c = cs[0];
            for (int i = 0; i < c._len; ++i) {
                int idx = (int) (i + c.start());
                if (cnts[idx] > 0) {
                    nc0.addNum(idx + minVal);
    }.doAll(new byte[] { Vec.T_NUM, Vec.T_NUM }, dataLayoutVec).outputFrame(new String[] { colname, "Count" }, new String[][] { v1.domain(), null });
    return new ValFrame(fr);
Example 40 with MRTask

From class DataInfoTestAdapt, method checkFrame.

private void checkFrame(final DataInfo di, final Frame gold, final boolean skipMissing) {
    try {
        Vec[] vecs = new Vec[di._adaptedFrame.numCols() + gold.numCols()];
        System.arraycopy(di._adaptedFrame.vecs(), 0, vecs, 0, di._adaptedFrame.numCols());
        System.arraycopy(gold.vecs(), 0, vecs, di._adaptedFrame.numCols(), gold.numCols());
        new MRTask() {

            public void map(Chunk[] cs) {
                int off = di._adaptedFrame.numCols();
                DataInfo.Row r = di.newDenseRow();
                //          DataInfo.Row rows[] = di.extractSparseRows(cs);
                for (int i = 0; i < cs[0]._len; ++i) {
                    //            DataInfo.Row r = rows[i];
                    di.extractDenseRow(cs, i, r);
                    if (skipMissing && r.isBad())
                    for (int j = 0; j < di.fullN(); ++j) {
                        double goldValue = cs[off + j].atd(i);
                        // - (di._normSub[j - di.numStart()] * di._normMul[j-di.numStart()]);
                        double thisValue = r.get(j);
                        double diff = Math.abs(goldValue - thisValue);
                        if (diff > 1e-12) {
                            if (!skipMissing && diff < 10)
                                System.out.println("row mismatch: " + i + " column= " + j + "; diff= " + diff + " but not skipping missing, so due to discrepancies in taking mean on split frames");
                                throw new RuntimeException("bonk");
    } finally {
