Use of water.parser.BufferedString in project h2o-3 by h2oai.
Class Word2VecMojoWriter, method writeModelData.
/**
 * Writes the Word2Vec model payload: two key/value entries ("vec_size" and
 * "vocab_size"), a text file named "vocabulary" with one entry per word, and a
 * binary blob named "vectors" holding every float of the embedding matrix.
 *
 * @throws IOException declared by the writer contract
 */
@Override
protected void writeModelData() throws IOException {
  writekv("vec_size", model._parms._vec_size);
  final int vocabSize = model._output._words.length;
  writekv("vocab_size", vocabSize);

  // Vocabulary: each word is emitted on its own line, in array order.
  startWritingTextFile("vocabulary");
  for (int wordIdx = 0; wordIdx < vocabSize; wordIdx++) {
    writeln(model._output._words[wordIdx].toString(), true);
  }
  finishWritingTextFile();

  // Vectors: 4 bytes per float, written in ByteBuffer's default byte order.
  final int floatCount = model._output._vecs.length;
  ByteBuffer buffer = ByteBuffer.wrap(MemoryManager.malloc1(floatCount * 4));
  for (int floatIdx = 0; floatIdx < floatCount; floatIdx++) {
    buffer.putFloat(model._output._vecs[floatIdx]);
  }
  writeblob("vectors", buffer.array());
}
Use of water.parser.BufferedString in project h2o-3 by h2oai.
Class WordCountTask, method read_impl.
/**
 * Rebuilds the word-count map from a serialized buffer.
 * <p>
 * The buffer is consumed as a sequence of records: a length read with
 * {@code get2()}, that many word bytes read with {@code getA1(len)}, and a
 * count read with {@code get8()}. Reading stops when the length value 65535
 * (the end-of-map marker) is encountered.
 *
 * @param ab buffer positioned at the start of the serialized map
 * @return this task, with {@code _counts} replaced by the deserialized map
 */
public final WordCountTask read_impl(AutoBuffer ab) {
  _counts = new IcedHashMap<>();
  int len;
  while ((len = ab.get2()) != 65535) {
    // Read until end-of-map marker
    byte[] bs = ab.getA1(len);
    long cnt = ab.get8();
    // Decode with an explicit charset: new String(byte[]) would use the platform
    // default, so the same bytes could decode differently from one JVM to another.
    // NOTE(review): assumes the writer side emitted UTF-8 bytes - confirm against write_impl.
    String word = new String(bs, java.nio.charset.StandardCharsets.UTF_8);
    _counts.put(new BufferedString(word), new IcedLong(cnt));
  }
  return this;
}
Use of water.parser.BufferedString in project h2o-3 by h2oai.
Class ExternalFrameReaderBackend, method handleReadingFromChunk.
/**
 * Internal method used on the H2O backend side to serve the content of one chunk
 * to a non-H2O environment.
 * <p>
 * The request header is read from {@code initAb} in this order: frame key, chunk
 * index, expected types (one per selected column) and selected column indices.
 * The reply starts with the number of rows, continues with every selected cell
 * in row-major order (NA cells are sent via {@code sendNA}), and ends with the
 * {@code CONFIRM_READING_DONE} marker.
 *
 * @param channel socket channel originating from the non-H2O node
 * @param initAb {@link AutoBuffer} containing the information needed to prepare the backend for reading
 * @throws IOException propagated from writing to the channel
 */
static void handleReadingFromChunk(ByteChannel channel, AutoBuffer initAb) throws IOException {
  // Request header - the read order below is the wire order.
  final String frameKey = initAb.getStr();
  final int chunkIdx = initAb.getInt();
  final byte[] expectedTypes = initAb.getA1();
  assert expectedTypes != null : "Expected types can't be null";
  final int[] selectedColumnIndices = initAb.getA4();
  assert selectedColumnIndices != null : "Selected column indices can't be null";

  final Frame frame = DKV.getGet(frameKey);
  final Chunk[] chunks = ChunkUtils.getChunks(frame, chunkIdx);

  // The reply starts with the row count.
  AutoBuffer ab = new AutoBuffer();
  final int rowCount = chunks[0]._len;
  ab.putInt(rowCount);
  writeToChannel(ab, channel);

  // One BufferedString shared by all string reads to avoid an allocation per cell.
  final BufferedString reusableStr = new BufferedString();
  for (int row = 0; row < rowCount; row++) {
    for (int col = 0; col < selectedColumnIndices.length; col++) {
      final Chunk chunk = chunks[selectedColumnIndices[col]];
      if (chunk.isNA(row)) {
        ExternalFrameUtils.sendNA(ab, channel, expectedTypes[col]);
        continue;
      }
      switch (expectedTypes[col]) {
        case EXPECTED_BOOL:
          ExternalFrameUtils.sendBoolean(ab, channel, (byte) chunk.at8(row));
          break;
        case EXPECTED_BYTE:
          ExternalFrameUtils.sendByte(ab, channel, (byte) chunk.at8(row));
          break;
        case EXPECTED_CHAR:
          ExternalFrameUtils.sendChar(ab, channel, (char) chunk.at8(row));
          break;
        case EXPECTED_SHORT:
          ExternalFrameUtils.sendShort(ab, channel, (short) chunk.at8(row));
          break;
        case EXPECTED_INT:
          ExternalFrameUtils.sendInt(ab, channel, (int) chunk.at8(row));
          break;
        case EXPECTED_FLOAT:
          ExternalFrameUtils.sendFloat(ab, channel, (float) chunk.atd(row));
          break;
        case EXPECTED_LONG:
          ExternalFrameUtils.sendLong(ab, channel, chunk.at8(row));
          break;
        case EXPECTED_DOUBLE:
          ExternalFrameUtils.sendDouble(ab, channel, chunk.atd(row));
          break;
        case EXPECTED_TIMESTAMP:
          ExternalFrameUtils.sendTimestamp(ab, channel, chunk.at8(row));
          break;
        case EXPECTED_STRING:
          // A string may be backed by a categorical, string or UUID vector.
          if (chunk.vec().isCategorical()) {
            final String level = chunk.vec().domain()[(int) chunk.at8(row)];
            ExternalFrameUtils.sendString(ab, channel, level);
          } else if (chunk.vec().isString()) {
            final String text = chunk.atStr(reusableStr, row).toString();
            ExternalFrameUtils.sendString(ab, channel, text);
          } else if (chunk.vec().isUUID()) {
            final UUID uuid = new UUID(chunk.at16h(row), chunk.at16l(row));
            ExternalFrameUtils.sendString(ab, channel, uuid.toString());
          } else {
            assert false : "Can never be here";
          }
          break;
      }
    }
  }

  // Tell the client that the whole chunk has been sent.
  ab.put1(ExternalFrameHandler.CONFIRM_READING_DONE);
  writeToChannel(ab, channel);
}
Use of water.parser.BufferedString in project h2o-3 by h2oai.
Class MRUtils, method sampleFrame.
/**
 * Sample rows from a frame.
 * <p>
 * Every row is kept with probability {@code rows / fr.numRows()}. The RNG is
 * re-seeded for each row from {@code seed}, the row's index within its chunk and
 * the chunk's start, and the last row of a chunk is kept when nothing else in
 * that chunk was. If the sampled frame still has no rows, the method calls
 * itself again with {@code seed + 1}.
 *
 * @param fr Input frame; {@code null} yields {@code null}
 * @param rows Approximate number of rows to sample (across all chunks); when it is
 *             not positive, or not smaller than the frame's row count, {@code fr}
 *             itself is returned
 * @param seed Seed for RNG
 * @return Sampled frame
 */
public static Frame sampleFrame(Frame fr, final long rows, final long seed) {
  if (fr == null) {
    return null;
  }

  final float fraction;
  if (rows > 0) {
    fraction = (float) rows / fr.numRows();
  } else {
    fraction = 1.f;
  }
  if (fraction >= 1.f) {
    return fr;
  }

  // Derive the output key from the input key; a frame without a key gets none.
  Key newKey = null;
  if (fr._key != null) {
    final String baseName = fr._key.toString();
    final String infix = baseName.contains("temporary") ? ".sample." : ".temporary.sample.";
    newKey = Key.make(baseName + infix + PrettyPrint.formatPct(fraction).replace(" ", ""));
  }

  Frame sampled = new MRTask() {
    @Override
    public void map(Chunk[] inputs, NewChunk[] outputs) {
      final Random rng = getRNG(0);
      final BufferedString scratch = new BufferedString();
      int kept = 0;
      for (int row = 0; row < inputs[0]._len; row++) {
        rng.setSeed(seed + row + inputs[0].start());
        final boolean drawn = rng.nextFloat() < fraction;
        // Keep the chunk's last row when nothing has been kept so far.
        final boolean forced = kept == 0 && row == inputs[0]._len - 1;
        if (!drawn && !forced) {
          continue;
        }
        kept++;
        for (int col = 0; col < outputs.length; col++) {
          final Chunk src = inputs[col];
          final NewChunk dst = outputs[col];
          if (src.isNA(row)) {
            dst.addNA();
          } else if (src instanceof CStrChunk) {
            dst.addStr(src.atStr(scratch, row));
          } else if (src instanceof C16Chunk) {
            dst.addUUID(src.at16l(row), src.at16h(row));
          } else {
            dst.addNum(src.atd(row));
          }
        }
      }
    }
  }.doAll(fr.types(), fr).outputFrame(newKey, fr.names(), fr.domains());

  if (sampled.numRows() != 0) {
    return sampled;
  }
  Log.warn("You asked for " + rows + " rows (out of " + fr.numRows() + "), but you got none (seed=" + seed + ").");
  Log.warn("Let's try again. You've gotta ask yourself a question: \"Do I feel lucky?\"");
  return sampleFrame(fr, rows, seed + 1);
}
Use of water.parser.BufferedString in project h2o-3 by h2oai.
Class ExternalFrameWriterClientTest, method testWriting.
/**
 * Writes a four-column frame (NUM, BOOL, STR, TIMESTAMP) through
 * {@link ExternalFrameWriterClient}, using two connections per H2O node and one
 * chunk of 1000 rows per connection, then checks the frame's shape, column types
 * and a few cell values including the null string, the empty string and the
 * all-NA last row.
 *
 * @throws IOException if a writer thread failed or the test was interrupted
 *                     while waiting for the writer threads
 */
@Test
public void testWriting() throws IOException {
  final String[] nodes = new String[H2O.CLOUD._memary.length];
  // get ip and ports of h2o nodes
  for (int i = 0; i < nodes.length; i++) {
    nodes[i] = H2O.CLOUD._memary[i].getIpPortString();
  }
  // we will open 2 connections per h2o node
  final String[] connStrings = ArrayUtils.join(nodes, nodes);
  // The api expects that empty frame has to be in the DKV before we start working with it
  final String frameName = "fr";
  String[] colNames = { "NUM", "BOOL", "STR", "TIMESTAMP" };
  // vector types are inferred from expected types
  final byte[] expectedTypes = ExternalFrameUtils.prepareExpectedTypes(new Class[] { Integer.class, Boolean.class, String.class, Timestamp.class });
  ChunkUtils.initFrame(frameName, colNames);
  // one chunk is written per connection
  final long[] rowsPerChunk = new long[connStrings.length];
  // Failure of each writer thread, if any. Each slot is written by exactly one
  // thread and read only after join(), so no extra synchronization is needed.
  final IOException[] failures = new IOException[connStrings.length];
  Thread[] threads = new Thread[connStrings.length];
  // open all connections in connStrings array
  for (int idx = 0; idx < connStrings.length; idx++) {
    final int currentIndex = idx;
    threads[idx] = new Thread() {
      @Override
      public void run() {
        try {
          ByteChannel sock = ExternalFrameUtils.getConnection(connStrings[currentIndex]);
          try {
            ExternalFrameWriterClient writer = new ExternalFrameWriterClient(sock);
            writer.createChunks(frameName, expectedTypes, currentIndex, 1000);
            Timestamp time = new Timestamp(Calendar.getInstance().getTime().getTime());
            // rows 0..996: regular values
            for (int i = 0; i < 997; i++) {
              writer.sendInt(i);
              writer.sendBoolean(true);
              writer.sendString("str_" + i);
              writer.sendTimestamp(time);
            }
            // row 997: null string
            writer.sendInt(0);
            writer.sendBoolean(true);
            writer.sendString(null);
            writer.sendTimestamp(time);
            // row 998: empty string
            writer.sendInt(1);
            writer.sendBoolean(true);
            writer.sendString("");
            writer.sendTimestamp(time);
            // row 999: send NA for all columns
            writer.sendNA();
            writer.sendNA();
            writer.sendNA();
            writer.sendNA();
            writer.waitUntilAllWritten();
          } finally {
            // close the connection even when writing failed
            sock.close();
          }
          rowsPerChunk[currentIndex] = 1000;
        } catch (IOException e) {
          // remember the failure so the test thread can report it after join()
          failures[currentIndex] = e;
        }
      }
    };
    threads[idx].start();
  }
  // wait for all writer threads to finish
  for (Thread t : threads) {
    try {
      t.join();
    } catch (InterruptedException e) {
      // restore the interrupt flag and abort: the writers may not be done yet
      Thread.currentThread().interrupt();
      throw new IOException("Interrupted while waiting for writer threads", e);
    }
  }
  // surface the first writer failure instead of continuing with partial data
  for (int idx = 0; idx < failures.length; idx++) {
    if (failures[idx] != null) {
      throw new IOException("Writer thread " + idx + " failed", failures[idx]);
    }
  }
  ChunkUtils.finalizeFrame(frameName, rowsPerChunk, ExternalFrameUtils.vecTypesFromExpectedTypes(expectedTypes), null);
  Frame frame = null;
  try {
    frame = DKV.getGet(frameName);
    assertEquals(connStrings.length, frame.anyVec().nChunks());
    assertEquals(4, frame._names.length);
    assertEquals(4, frame.numCols());
    assertEquals("NUM", frame._names[0]);
    assertEquals("BOOL", frame._names[1]);
    assertEquals("STR", frame._names[2]);
    assertEquals("TIMESTAMP", frame._names[3]);
    assertEquals(Vec.T_NUM, frame.vec(0).get_type());
    assertEquals(Vec.T_NUM, frame.vec(1).get_type());
    assertEquals(Vec.T_STR, frame.vec(2).get_type());
    assertEquals(Vec.T_TIME, frame.vec(3).get_type());
    assertEquals(1000 * connStrings.length, frame.numRows());
    // first row holds the first value written
    assertEquals(0, frame.vec(0).at8(0));
    BufferedString buff = new BufferedString();
    assertEquals("str_996", frame.vec(2).atStr(buff, 996).toString());
    // a null string comes back as null, an empty string as ""
    assertEquals(null, frame.vec(2).atStr(buff, 997));
    assertEquals("", frame.vec(2).atStr(buff, 998).toString());
    // last row should be NA
    assertTrue(frame.vec(0).isNA(999));
    assertTrue(frame.vec(1).isNA(999));
    assertTrue(frame.vec(2).isNA(999));
    assertTrue(frame.vec(3).isNA(999));
  } finally {
    if (frame != null) {
      frame.remove();
    }
  }
}
Aggregations