Search in sources :

Example 41 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class VecUtils method stringToCategorical.

/**
 * Builds a categorical {@link Vec} out of a string {@link Vec}.
 *
 * <p>The string domain is collected first via {@code CollectStringVecDomain}; each non-NA row is
 * then replaced by the position of its string inside that domain, while NA rows stay NA. The
 * collected domain becomes the domain of the returned vector.
 *
 * FIXME: implement in more efficient way with Brandon's primitives for BufferedString manipulation
 *
 * @param vec a string {@link Vec}
 * @return a categorical {@link Vec}
 */
public static Vec stringToCategorical(Vec vec) {
    final String[] levels = new CollectStringVecDomain().domain(vec);
    MRTask task = new MRTask() {

        // Level string -> index into the collected domain; transient, so it is rebuilt in setupLocal().
        private transient java.util.HashMap<String, Integer> levelIndex;

        @Override
        protected void setupLocal() {
            final java.util.HashMap<String, Integer> index = new java.util.HashMap<>(levels.length);
            int code = 0;
            for (String level : levels) {
                // FIXME: boxing
                index.put(level, code++);
            }
            levelIndex = index;
        }

        @Override
        public void map(Chunk c, NewChunk nc) {
            // One reusable holder for the string read from each row.
            final BufferedString scratch = new BufferedString();
            for (int r = 0; r < c.len(); r++) {
                if (!c.isNA(r)) {
                    c.atStr(scratch, r);
                    final Integer code = levelIndex.get(scratch.bytesToString());
                    nc.addNum(code, 0);
                } else {
                    nc.addNA();
                }
            }
        }
    };
    // Run the task: one input vector, one categorical output vector
    task.doAll(new byte[] { Vec.T_CAT }, vec);
    // The single output column carries the collected domain
    return task.outputFrame(null, null, new String[][] { levels }).vec(0);
}
Also used : java.util(java.util) NonBlockingHashMap(water.nbhm.NonBlockingHashMap) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString) Chunk(water.fvec.Chunk) NewChunk(water.fvec.NewChunk) C0DChunk(water.fvec.C0DChunk) NewChunk(water.fvec.NewChunk)

Example 42 with Chunk

use of water.fvec.Chunk in project h2o-2 by h2oai.

the class Impute method serve.

/**
 * Fills in missing entries of {@code column} inside {@code source}, in place, and then redirects
 * to the Inspect2 page of the frame.
 *
 * <p>Two modes, selected by {@code group_by}:
 * <ul>
 *   <li>{@code group_by == null}: one replacement value is computed for the whole column from
 *       {@code method} (mean, median via {@code QuantilesPage}, or mode = most frequent level)
 *       and written into every missing row.</li>
 *   <li>otherwise: an expression is run through {@code Exec2} (a {@code ddply} over the group-by
 *       columns applying {@code method}), and each missing row receives the value looked up for
 *       its group; rows whose group has no value are left untouched.</li>
 * </ul>
 *
 * <p>A row is treated as missing when {@code isNA0} is true, or when the column is an enum and
 * the row's level string is literally {@code "NA"}.
 *
 * @return a redirect to Inspect2 on success (also returned immediately when {@code init()}
 *         returns true), or {@code Response.error(t)} for any {@code Throwable} thrown inside
 *         the try block
 */
@Override
protected Response serve() {
    // Early exit: when init() returns true nothing is imputed and we go straight to the redirect.
    if (init())
        return Inspect2.redirect(this, source._key.toString());
    // Index of the column to impute within the source frame.
    final int col_id = source.find(column);
    final int[] _cols = group_by;
    // Key that receives the ddply result in the grouped branch; removed in the finally block.
    final Key mykey = Key.make();
    try {
        if (group_by == null) {
            // just use "method" using the input "column"
            // Stays 0 if method matches none of the branches below.
            double _replace_val = 0;
            if (method == Method.mean) {
                _replace_val = column.mean();
            } else if (method == Method.median) {
                // Median is obtained by invoking a QuantilesPage on the same frame/column.
                QuantilesPage qp = new QuantilesPage();
                qp.source_key = source;
                qp.column = column;
                qp.invoke();
                _replace_val = qp.result;
            } else if (method == Method.mode) {
                // NOTE(review): dom is dereferenced below without a null check -- presumably
                // mode is only requested for columns that have a domain; confirm.
                String[] dom = column.domain();
                long[][] levels = new long[1][];
                levels[0] = new Vec.CollectDomain(column).doAll(new Frame(column)).domain();
                long[][] counts = new ASTTable.Tabularize(levels).doAll(column)._counts;
                // Pick the level with the highest count, skipping a level literally named "NA".
                long maxCounts = -1;
                int mode = -1;
                for (int i = 0; i < counts[0].length; ++i) {
                    if (counts[0][i] > maxCounts && !dom[i].equals("NA")) {
                        // check for "NA" in domain -- corner case from R
                        maxCounts = counts[0][i];
                        mode = i;
                    }
                }
                // If no level qualified, fall back to the index of the "NA" level.
                _replace_val = mode != -1 ? (double) mode : // could produce -1 if "NA" not in the domain -- that is we don't have the R corner case
                (double) Arrays.asList(dom).indexOf("NA");
                // OK to replace, since we're in the elif "mode" block
                // -1 means neither a real mode nor an "NA" level exists: use NaN so rows stay NA.
                if (_replace_val == -1)
                    _replace_val = Double.NaN;
            }
            // Final copy so the anonymous task below can capture it.
            final double rv = _replace_val;
            new MRTask2() {

                @Override
                public void map(Chunk[] cs) {
                    Chunk c = cs[col_id];
                    int rows = c.len();
                    for (int r = 0; r < rows; ++r) {
                        // Missing = real NA, or an enum whose level string is "NA".
                        if (c.isNA0(r) || (c._vec.isEnum() && c._vec.domain()[(int) c.at0(r)].equals("NA"))) {
                            // leave as NA if replace value is NA
                            if (!Double.isNaN(rv))
                                c.set0(r, rv);
                        }
                    }
                }
            }.doAll(source);
        } else {
            // collect the groups HashMap and the frame from the ddply.
            // create a vec of group IDs (each row is in some group)
            // MRTask over the rows
            // Define a function named "anonymous" that applies `method` to column col_id + 1 of its
            // argument (the expression uses a one-based column index -- presumably R-style; confirm).
            water.exec.Exec2.exec(Key.make().toString() + " = anonymous <- function(x) \n{\n " + method + "(x[," + (col_id + 1) + "])\n}").remove_and_unlock();
            // Run ddply over the group-by columns with that function; the result is assigned to mykey.
            Env env = water.exec.Exec2.exec(mykey.toString() + " = ddply(" + source._key.toString() + ", " + toAryString(_cols) + ", anonymous)");
            final Frame grp_replacement = new Frame(env.peekAry());
            env.remove_and_unlock();
            Log.info("GROUP TASK NUM COLS: " + grp_replacement.numCols());
            // GroupTask is built with the index of the last column of the ddply result and exposes
            // _grp2val (presumably group -> replacement value; confirm against GroupTask).
            final GroupTask grp2val = new GroupTask(grp_replacement.numCols() - 1).doAll(grp_replacement);
            new MRTask2() {

                @Override
                public void map(Chunk[] cs) {
                    Chunk c = cs[col_id];
                    int rows = cs[0].len();
                    for (int r = 0; r < rows; ++r) {
                        // Same missing-value test as in the ungrouped branch.
                        if (c.isNA0(r) || (c._vec.isEnum() && c._vec.domain()[(int) c.at0(r)].equals("NA"))) {
                            // Build this row's group from the group-by columns.
                            Group g = new Group(_cols.length);
                            g.fill(r, cs, _cols);
                            // No value recorded for this group: leave the row as it is.
                            if (grp2val._grp2val.get(g) == null)
                                continue;
                            double rv = grp2val._grp2val.get(g);
                            c.set0(r, rv);
                        }
                    }
                }
            }.doAll(source);
        }
        return Inspect2.redirect(this, source._key.toString());
    } catch (Throwable t) {
        // Any failure is reported back to the caller as an error response.
        return Response.error(t);
    } finally {
        // Delete frames
        // Runs on both the success and the error path.
        UKV.remove(mykey);
    }
}
Also used : Group(water.exec.ASTddply.Group) Frame(water.fvec.Frame) Chunk(water.fvec.Chunk) Env(water.exec.Env) Vec(water.fvec.Vec)

Example 43 with Chunk

use of water.fvec.Chunk in project h2o-2 by h2oai.

the class ChunkSummary method map.

/**
 * Tallies the given chunks by chunk type.
 *
 * <p>Resets {@code chunk_counts}, {@code chunk_byte_sizes} and {@code byte_size_per_node}, then
 * for every chunk whose simple class name equals {@code chunkTypes[j] + "Chunk"} increments the
 * count for slot {@code j}, adds the chunk's byte size to that slot, and adds it to the entry of
 * this node ({@code H2O.SELF.index()}).
 *
 * @param cs the chunks to summarize
 * @throws RuntimeException the result of {@code H2O.unimpl()} when a chunk's class matches no
 *         entry of {@code chunkTypes}
 */
@Override
public void map(Chunk[] cs) {
    final int typeCount = chunkTypes.length;
    chunk_counts = new long[typeCount];
    chunk_byte_sizes = new long[typeCount];
    byte_size_per_node = new long[H2O.CLOUD.size()];
    for (Chunk chunk : cs) {
        final String simpleName = chunk.getClass().getSimpleName();
        boolean matched = false;
        for (int t = 0; t < typeCount; ++t) {
            if (!simpleName.equals(chunkTypes[t] + "Chunk")) {
                continue;
            }
            matched = true;
            chunk_counts[t]++;
            chunk_byte_sizes[t] += chunk.byteSize();
            byte_size_per_node[H2O.SELF.index()] += chunk.byteSize();
        }
        // A chunk class missing from chunkTypes is treated as unsupported.
        if (!matched) {
            throw H2O.unimpl();
        }
    }
}
Also used : Chunk(water.fvec.Chunk) PrettyPrint(water.PrettyPrint)

Example 44 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class VecDataInputStream method fetchData.

/**
 * Makes the chunk that contains {@code position} the current buffer.
 *
 * <p>Stores the chunk's bytes in {@code _buffer}, the chunk's start in {@code _offset}, and the
 * position relative to that start in {@code _pos}.
 *
 * @param position absolute position inside {@code _v}
 */
private void fetchData(long position) {
    final Chunk chunk = _v.chunkForRow(position);
    _buffer = chunk.asBytes();
    _offset = chunk.start();
    // Position relative to the first element of the freshly loaded chunk.
    _pos = (int) (position - _offset);
    assert _buffer.length > 0;
}
Also used : Chunk(water.fvec.Chunk)

Example 45 with Chunk

use of water.fvec.Chunk in project h2o-3 by h2oai.

the class VecDataInputStream method read.

/**
 * Copies up to {@code length} bytes, starting at absolute {@code position} in {@code _v}, into
 * {@code buffer} beginning at index {@code offset}.
 *
 * <p>Bytes come from the cached {@code _buffer} when {@code inBuffer} reports the position is
 * covered by it; otherwise they are taken directly from the chunk holding that position. This
 * method itself does not reassign {@code _buffer} or {@code _offset}.
 *
 * @param position absolute start position inside {@code _v}
 * @param buffer destination array
 * @param offset first index in {@code buffer} to write to
 * @param length maximum number of bytes to copy
 * @return the number of bytes actually copied; less than {@code length} when the end of
 *         {@code _v} is reached first
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public int read(final long position, byte[] buffer, int offset, int length) throws IOException {
    int copied = 0;
    long cursor = position;
    while (copied < length && cursor < _v.length()) {
        final byte[] src;
        final int srcPos;
        if (!inBuffer(cursor)) {
            // Not covered by the cached buffer: read from the owning chunk.
            final Chunk chunk = _v.chunkForRow(cursor);
            src = chunk.asBytes();
            srcPos = (int) (cursor - chunk.start());
        } else {
            src = _buffer;
            srcPos = (int) (cursor - _offset);
        }
        // Copy no more than what is left in this source array or still requested.
        final int count = Math.min(src.length - srcPos, length - copied);
        System.arraycopy(src, srcPos, buffer, offset + copied, count);
        copied += count;
        cursor += count;
    }
    return copied;
}
Also used : Chunk(water.fvec.Chunk)

Aggregations

Chunk (water.fvec.Chunk)74 Frame (water.fvec.Frame)50 NewChunk (water.fvec.NewChunk)36 MRTask (water.MRTask)33 Vec (water.fvec.Vec)30 ValFrame (water.rapids.vals.ValFrame)26 C0DChunk (water.fvec.C0DChunk)7 BufferedString (water.parser.BufferedString)7 Random (java.util.Random)6 Test (org.junit.Test)5 MRTask2 (water.MRTask2)4 Val (water.rapids.Val)4 Key (water.Key)3 H2OIllegalArgumentException (water.exceptions.H2OIllegalArgumentException)3 AstRoot (water.rapids.ast.AstRoot)3 AstNumList (water.rapids.ast.params.AstNumList)3 File (java.io.File)2 IOException (java.io.IOException)2 ValNum (water.rapids.vals.ValNum)2 PrettyPrint (water.util.PrettyPrint)2