Search in sources :

Example 21 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

Shown in the method writeModelData of the class Word2VecMojoWriter.

/**
 * Writes the word2vec model payload: the vector size and vocabulary size as key/value
 * entries, the vocabulary as a text file (one word per line), and all vector components
 * as a single binary blob of consecutive floats.
 *
 * @throws IOException declared by the overridden method
 */
@Override
protected void writeModelData() throws IOException {
    writekv("vec_size", model._parms._vec_size);
    writekv("vocab_size", model._output._words.length);

    // Vocabulary: one entry per line, in the order the words are stored in the model output.
    startWritingTextFile("vocabulary");
    for (BufferedString entry : model._output._words) {
        writeln(entry.toString(), true);
    }
    finishWritingTextFile();

    // Vectors: 4 bytes per float, written through a ByteBuffer view over the backing array.
    final byte[] rawVectors = MemoryManager.malloc1(model._output._vecs.length * 4);
    final ByteBuffer floatSink = ByteBuffer.wrap(rawVectors);
    for (float component : model._output._vecs) {
        floatSink.putFloat(component);
    }
    // rawVectors is the very array wrapped above, so it now holds every component.
    writeblob("vectors", rawVectors);
}
Also used : BufferedString(water.parser.BufferedString) ByteBuffer(java.nio.ByteBuffer)

Example 22 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

Shown in the method read_impl of the class WordCountTask.

/**
 * Rebuilds the word-count map from the given buffer.
 * <p>
 * Entries are read in a loop: a length obtained via {@code ab.get2()}, then that many bytes
 * holding the word, then a count obtained via {@code ab.get8()}. A length value of 65535
 * terminates the map.
 *
 * @param ab buffer to read the serialized map from
 * @return this task, with {@code _counts} replaced by the freshly read map
 */
public final WordCountTask read_impl(AutoBuffer ab) {
    _counts = new IcedHashMap<>();
    int len;
    while ((len = ab.get2()) != 65535) {
        // Read until end-of-map marker
        byte[] bs = ab.getA1(len);
        long cnt = ab.get8();
        // Decode with an explicit charset: new String(byte[]) uses the platform default, which
        // would make the deserialized words depend on the JVM's locale settings.
        // NOTE(review): assumes the writer side emits UTF-8 bytes - confirm against write_impl.
        String word = new String(bs, java.nio.charset.StandardCharsets.UTF_8);
        _counts.put(new BufferedString(word), new IcedLong(cnt));
    }
    return this;
}
Also used : IcedLong(water.util.IcedLong) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString)

Example 23 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

Shown in the method handleReadingFromChunk of the class ExternalFrameReaderBackend.

/**
 * Internal method used on the h2o backend side to serve one chunk of a frame to a non-h2o client.
 * <p>
 * The request (frame key, chunk index, expected types, selected column indices) is read from
 * {@code initAb}. The reply consists of the chunk's row count, then every selected cell in
 * row-major order (an NA marker for missing cells), and finally a
 * {@code CONFIRM_READING_DONE} byte.
 *
 * @param channel socket channel originating from non-h2o node
 * @param initAb {@link AutoBuffer} containing information necessary for preparing backend for reading
 * @throws IOException declared for failures while writing to the channel
 */
static void handleReadingFromChunk(ByteChannel channel, AutoBuffer initAb) throws IOException {
    // Request header - the reads below must stay in this order.
    final String requestedFrameKey = initAb.getStr();
    final int requestedChunkIdx = initAb.getInt();
    final byte[] expectedTypes = initAb.getA1();
    assert expectedTypes != null : "Expected types can't be null";
    final int[] selectedColumnIndices = initAb.getA4();
    assert selectedColumnIndices != null : "Selected column indices can't be null";

    final Frame sourceFrame = DKV.getGet(requestedFrameKey);
    final Chunk[] chunks = ChunkUtils.getChunks(sourceFrame, requestedChunkIdx);

    // Announce how many rows will follow.
    final AutoBuffer ab = new AutoBuffer();
    ab.putInt(chunks[0]._len);
    writeToChannel(ab, channel);

    // Single scratch BufferedString shared by all string cells to avoid per-cell allocation.
    final BufferedString scratch = new BufferedString();
    for (int row = 0; row < chunks[0]._len; row++) {
        for (int col = 0; col < selectedColumnIndices.length; col++) {
            final Chunk cell = chunks[selectedColumnIndices[col]];
            if (cell.isNA(row)) {
                ExternalFrameUtils.sendNA(ab, channel, expectedTypes[col]);
                continue;
            }
            switch (expectedTypes[col]) {
                case EXPECTED_BOOL:
                    ExternalFrameUtils.sendBoolean(ab, channel, (byte) cell.at8(row));
                    break;
                case EXPECTED_BYTE:
                    ExternalFrameUtils.sendByte(ab, channel, (byte) cell.at8(row));
                    break;
                case EXPECTED_CHAR:
                    ExternalFrameUtils.sendChar(ab, channel, (char) cell.at8(row));
                    break;
                case EXPECTED_SHORT:
                    ExternalFrameUtils.sendShort(ab, channel, (short) cell.at8(row));
                    break;
                case EXPECTED_INT:
                    ExternalFrameUtils.sendInt(ab, channel, (int) cell.at8(row));
                    break;
                case EXPECTED_FLOAT:
                    ExternalFrameUtils.sendFloat(ab, channel, (float) cell.atd(row));
                    break;
                case EXPECTED_LONG:
                    ExternalFrameUtils.sendLong(ab, channel, cell.at8(row));
                    break;
                case EXPECTED_DOUBLE:
                    ExternalFrameUtils.sendDouble(ab, channel, cell.atd(row));
                    break;
                case EXPECTED_TIMESTAMP:
                    ExternalFrameUtils.sendTimestamp(ab, channel, cell.at8(row));
                    break;
                case EXPECTED_STRING:
                    // Categorical, string and UUID vectors are all delivered as strings.
                    if (cell.vec().isCategorical()) {
                        final int level = (int) cell.at8(row);
                        ExternalFrameUtils.sendString(ab, channel, cell.vec().domain()[level]);
                    } else if (cell.vec().isString()) {
                        ExternalFrameUtils.sendString(ab, channel, cell.atStr(scratch, row).toString());
                    } else if (cell.vec().isUUID()) {
                        final UUID uuid = new UUID(cell.at16h(row), cell.at16l(row));
                        ExternalFrameUtils.sendString(ab, channel, uuid.toString());
                    } else {
                        assert false : "Can never be here";
                    }
                    break;
            }
        }
    }

    // Tell the client that the whole chunk has been transferred.
    ab.put1(ExternalFrameHandler.CONFIRM_READING_DONE);
    writeToChannel(ab, channel);
}
Also used : Frame(water.fvec.Frame) BufferedString(water.parser.BufferedString) BufferedString(water.parser.BufferedString) Chunk(water.fvec.Chunk) UUID(java.util.UUID)

Example 24 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

Shown in the method sampleFrame of the class MRUtils.

/**
 * Sample rows from a frame.
 * <p>
 * Each row is kept when a random float drawn for it falls below {@code rows / fr.numRows()}.
 * In addition, the last row of a chunk is kept if nothing else in that chunk was selected.
 * If the resulting frame is nevertheless empty, the method calls itself again with
 * {@code seed + 1}.
 *
 * @param fr Input frame; {@code null} is returned unchanged
 * @param rows Approximate number of rows to sample (across all chunks); a non-positive value,
 *             or one giving a fraction of at least 1, returns {@code fr} itself
 * @param seed Seed for RNG
 * @return Sampled frame
 */
public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
    if (fr == null) {
        return null;
    }
    final float fraction;
    if (rows > 0) {
        fraction = (float) rows / fr.numRows();
    } else {
        fraction = 1.f;
    }
    if (fraction >= 1.f) {
        return fr;
    }

    // Derive the result key from the input key; frames without a key produce a keyless result.
    Key newKey = null;
    if (fr._key != null) {
        final String baseName = fr._key.toString();
        final String infix = baseName.contains("temporary") ? ".sample." : ".temporary.sample.";
        final String pctSuffix = PrettyPrint.formatPct(fraction).replace(" ", "");
        newKey = Key.make(baseName + infix + pctSuffix);
    }

    Frame sampled = new MRTask() {

        @Override
        public void map(Chunk[] cs, NewChunk[] ncs) {
            final Random rng = getRNG(0);
            final BufferedString scratch = new BufferedString();
            int kept = 0;
            for (int row = 0; row < cs[0]._len; row++) {
                // Re-seed per row with a value derived from the row's offset plus the chunk start.
                rng.setSeed(seed + row + cs[0].start());
                final boolean drawn = rng.nextFloat() < fraction;
                final boolean forcedLastRow = kept == 0 && row == cs[0]._len - 1;
                if (!drawn && !forcedLastRow) {
                    continue;
                }
                kept++;
                for (int col = 0; col < ncs.length; col++) {
                    if (cs[col].isNA(row)) {
                        ncs[col].addNA();
                    } else if (cs[col] instanceof CStrChunk) {
                        ncs[col].addStr(cs[col].atStr(scratch, row));
                    } else if (cs[col] instanceof C16Chunk) {
                        ncs[col].addUUID(cs[col].at16l(row), cs[col].at16h(row));
                    } else {
                        ncs[col].addNum(cs[col].atd(row));
                    }
                }
            }
        }
    }.doAll(fr.types(), fr).outputFrame(newKey, fr.names(), fr.domains());

    if (sampled.numRows() != 0) {
        return sampled;
    }
    Log.warn("You asked for " + rows + " rows (out of " + fr.numRows() + "), but you got none (seed=" + seed + ").");
    Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
    return sampleFrame(fr, rows, seed + 1);
}
Also used : Random(java.util.Random) BufferedString(water.parser.BufferedString)

Example 25 with BufferedString

use of water.parser.BufferedString in project h2o-3 by h2oai.

Shown in the method testWriting of the class ExternalFrameWriterClientTest.

/**
 * Writes one 1000-row chunk per connection (two connections per h2o node) through
 * {@link ExternalFrameWriterClient} and verifies the structure and selected cells of the
 * resulting frame.
 * <p>
 * Rows written per chunk: rows 0..996 hold {@code (i, true, "str_" + i, time)}, row 997 holds a
 * {@code null} string, row 998 holds a non-ASCII string, and row 999 is NA in every column.
 *
 * @throws IOException if any writer thread failed or the wait for the writers was interrupted
 */
@Test
public void testWriting() throws IOException {
    final String[] nodes = new String[H2O.CLOUD._memary.length];
    // get ip and ports of h2o nodes
    for (int i = 0; i < nodes.length; i++) {
        nodes[i] = H2O.CLOUD._memary[i].getIpPortString();
    }
    // we will open 2 connection per h2o node
    final String[] connStrings = ArrayUtils.join(nodes, nodes);
    // The api expects that empty frame has to be in the DKV before we start working with it
    final String frameName = "fr";
    String[] colNames = { "NUM", "BOOL", "STR", "TIMESTAMP" };
    // vector types are inferred from expected types
    final byte[] expectedTypes = ExternalFrameUtils.prepareExpectedTypes(new Class[] { Integer.class, Boolean.class, String.class, Timestamp.class });
    ChunkUtils.initFrame(frameName, colNames);
    // one chunk is written per connection
    final long[] rowsPerChunk = new long[connStrings.length];
    // Failure of each writer thread, if any. Safe to read after join(), which establishes
    // the required happens-before edge.
    final IOException[] failures = new IOException[connStrings.length];
    Thread[] threads = new Thread[connStrings.length];
    // open all connections in connStrings array
    for (int idx = 0; idx < connStrings.length; idx++) {
        final int currentIndex = idx;
        threads[idx] = new Thread() {

            @Override
            public void run() {
                // try-with-resources closes the socket even when a write fails half-way
                try (ByteChannel sock = ExternalFrameUtils.getConnection(connStrings[currentIndex])) {
                    ExternalFrameWriterClient writer = new ExternalFrameWriterClient(sock);
                    writer.createChunks(frameName, expectedTypes, currentIndex, 1000);
                    Timestamp time = new Timestamp(Calendar.getInstance().getTime().getTime());
                    for (int i = 0; i < 997; i++) {
                        writer.sendInt(i);
                        writer.sendBoolean(true);
                        writer.sendString("str_" + i);
                        writer.sendTimestamp(time);
                    }
                    // row 997: null string
                    writer.sendInt(0);
                    writer.sendBoolean(true);
                    writer.sendString(null);
                    writer.sendTimestamp(time);
                    // row 998: non-ASCII string
                    writer.sendInt(1);
                    writer.sendBoolean(true);
                    writer.sendString("€");
                    writer.sendTimestamp(time);
                    // row 999: send NA for all columns
                    writer.sendNA();
                    writer.sendNA();
                    writer.sendNA();
                    writer.sendNA();
                    writer.waitUntilAllWritten();
                } catch (IOException e) {
                    // Do not swallow: remember the root cause so the test can report it.
                    failures[currentIndex] = e;
                    return;
                }
                rowsPerChunk[currentIndex] = 1000;
            }
        };
        threads[idx].start();
    }
    // wait for all writer thread to finish
    for (Thread t : threads) {
        try {
            t.join();
        } catch (InterruptedException e) {
            // Restore the interrupt flag and abort; continuing would inspect half-written data.
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for writer threads", e);
        }
    }
    ChunkUtils.finalizeFrame(frameName, rowsPerChunk, ExternalFrameUtils.vecTypesFromExpectedTypes(expectedTypes), null);
    Frame frame = null;
    try {
        frame = DKV.getGet(frameName);
        // Surface the first writer failure here so that the finally block still removes the frame.
        for (IOException failure : failures) {
            if (failure != null) {
                throw failure;
            }
        }
        // assertEquals takes (expected, actual)
        assertEquals(connStrings.length, frame.anyVec().nChunks());
        assertEquals(4, frame._names.length);
        assertEquals(4, frame.numCols());
        assertEquals("NUM", frame._names[0]);
        assertEquals("BOOL", frame._names[1]);
        assertEquals("STR", frame._names[2]);
        assertEquals("TIMESTAMP", frame._names[3]);
        assertEquals(Vec.T_NUM, frame.vec(0).get_type());
        assertEquals(Vec.T_NUM, frame.vec(1).get_type());
        assertEquals(Vec.T_STR, frame.vec(2).get_type());
        assertEquals(Vec.T_TIME, frame.vec(3).get_type());
        assertEquals(1000 * connStrings.length, frame.numRows());
        // first row holds the first integer written
        assertEquals(0, frame.vec(0).at8(0));
        BufferedString buff = new BufferedString();
        assertEquals("str_996", frame.vec(2).atStr(buff, 996).toString());
        assertEquals(null, frame.vec(2).atStr(buff, 997));
        assertEquals("€", frame.vec(2).atStr(buff, 998).toString());
        // last row should be NA
        assertTrue(frame.vec(0).isNA(999));
        assertTrue(frame.vec(1).isNA(999));
        assertTrue(frame.vec(2).isNA(999));
        assertTrue(frame.vec(3).isNA(999));
    } finally {
        if (frame != null) {
            frame.remove();
        }
    }
}
Also used : Frame(water.fvec.Frame) BufferedString(water.parser.BufferedString) IOException(java.io.IOException) Timestamp(java.sql.Timestamp) ByteChannel(java.nio.channels.ByteChannel) BufferedString(water.parser.BufferedString) Test(org.junit.Test)

Aggregations

BufferedString (water.parser.BufferedString)43 Frame (water.fvec.Frame)12 Test (org.junit.Test)9 MRTask (water.MRTask)8 Vec (water.fvec.Vec)8 Chunk (water.fvec.Chunk)7 NewChunk (water.fvec.NewChunk)6 ValFrame (water.rapids.vals.ValFrame)5 IcedLong (water.util.IcedLong)5 IOException (java.io.IOException)2 ByteBuffer (java.nio.ByteBuffer)2 Random (java.util.Random)2 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)2 TestFrameBuilder (water.fvec.TestFrameBuilder)2 BackendModel (deepwater.backends.BackendModel)1 BackendParams (deepwater.backends.BackendParams)1 RuntimeOptions (deepwater.backends.RuntimeOptions)1 ImageDataSet (deepwater.datasets.ImageDataSet)1 GenModel (hex.genmodel.GenModel)1 EasyPredictModelWrapper (hex.genmodel.easy.EasyPredictModelWrapper)1