use of water.fvec.Chunk in project h2o-3 by h2oai.
the class VecUtils method stringToCategorical.
/**
 * Builds a categorical {@link Vec} out of a string {@link Vec}.
 *
 * <p>The domain is first collected from the input via {@code CollectStringVecDomain}; each
 * non-NA string is then replaced by its position in that domain, and NA rows stay NA.
 *
 * FIXME: implement in more efficient way with Brandon's primitives for BufferedString manipulation
 *
 * @param vec a string {@link Vec}
 * @return a categorical {@link Vec} whose domain is the collected string domain
 */
public static Vec stringToCategorical(Vec vec) {
    final String[] levels = new CollectStringVecDomain().domain(vec);
    MRTask encoder = new MRTask() {
        // Level -> position in the domain; transient, so it is rebuilt in setupLocal().
        private transient java.util.HashMap<String, Integer> levelIndex;

        @Override
        protected void setupLocal() {
            levelIndex = new java.util.HashMap<>(levels.length);
            int position = 0;
            for (String level : levels) {
                // FIXME: boxing
                levelIndex.put(level, position++);
            }
        }

        @Override
        public void map(Chunk c, NewChunk nc) {
            BufferedString scratch = new BufferedString();
            for (int row = 0; row < c.len(); row++) {
                if (c.isNA(row)) {
                    nc.addNA();
                    continue;
                }
                c.atStr(scratch, row);
                final String key = scratch.bytesToString();
                nc.addNum(levelIndex.get(key), 0);
            }
        }
    };
    // Run the task: one input vector, one output vector of categorical type.
    encoder.doAll(new byte[] { Vec.T_CAT }, vec);
    // The single output column carries the collected domain.
    return encoder.outputFrame(null, null, new String[][] { levels }).vec(0);
}
use of water.fvec.Chunk in project h2o-2 by h2oai.
the class Impute method serve.
/**
 * Handles an impute request: rows of {@code column} in {@code source} that are NA (or, for
 * enum columns, whose level label is the literal string "NA") are overwritten via
 * {@code set0} with a replacement value.
 *
 * <p>Without {@code group_by}, one replacement value is computed for the whole column from
 * {@code method} (mean, median or mode). With {@code group_by}, a per-group value is obtained
 * by running a {@code ddply} expression through {@code Exec2} and looked up for every row.
 *
 * @return a redirect to the inspect page of {@code source}, or {@code Response.error(t)} if
 *         any {@link Throwable} is raised while imputing
 */
@Override
protected Response serve() {
// When init() returns true no imputation is done; the request is redirected immediately.
if (init())
return Inspect2.redirect(this, source._key.toString());
// Position of the target column as reported by source.find(column).
final int col_id = source.find(column);
final int[] _cols = group_by;
// Key under which the ddply result is stored; removed again in the finally block.
final Key mykey = Key.make();
try {
if (group_by == null) {
// just use "method" using the input "column"
// Stays 0 if method is none of mean/median/mode.
double _replace_val = 0;
if (method == Method.mean) {
_replace_val = column.mean();
} else if (method == Method.median) {
// The median is taken from the result of a QuantilesPage invocation on the column.
QuantilesPage qp = new QuantilesPage();
qp.source_key = source;
qp.column = column;
qp.invoke();
_replace_val = qp.result;
} else if (method == Method.mode) {
// NOTE(review): dom is dereferenced below without a null check -- confirm that
// column.domain() is never null when method == mode.
String[] dom = column.domain();
long[][] levels = new long[1][];
levels[0] = new Vec.CollectDomain(column).doAll(new Frame(column)).domain();
long[][] counts = new ASTTable.Tabularize(levels).doAll(column)._counts;
// Pick the index with the highest count, skipping a level labelled "NA".
long maxCounts = -1;
int mode = -1;
for (int i = 0; i < counts[0].length; ++i) {
if (counts[0][i] > maxCounts && !dom[i].equals("NA")) {
// check for "NA" in domain -- corner case from R
maxCounts = counts[0][i];
mode = i;
}
}
// If no non-"NA" level was selected, fall back to the index of "NA" in the domain,
// which is -1 when the domain has no such label.
_replace_val = mode != -1 ? (double) mode : // could produce -1 if "NA" not in the domain -- that is we don't have the R corner case
(double) Arrays.asList(dom).indexOf("NA");
// OK to replace, since we're in the elif "mode" block
// -1 means "no usable mode"; NaN makes the task below leave the rows untouched.
if (_replace_val == -1)
_replace_val = Double.NaN;
}
// Final copy so the anonymous task can capture the value.
final double rv = _replace_val;
new MRTask2() {
@Override
public void map(Chunk[] cs) {
Chunk c = cs[col_id];
int rows = c.len();
for (int r = 0; r < rows; ++r) {
// A row qualifies when it is NA, or when the column is an enum and the row's level label is "NA".
if (c.isNA0(r) || (c._vec.isEnum() && c._vec.domain()[(int) c.at0(r)].equals("NA"))) {
// leave as NA if replace value is NA
if (!Double.isNaN(rv))
c.set0(r, rv);
}
}
}
}.doAll(source);
} else {
// collect the groups HashMap and the frame from the ddply.
// create a vec of group IDs (each row is in some group)
// MRTask over the rows
// Define a function named "anonymous" that applies `method` to column (col_id + 1) of its
// argument; the + 1 presumably converts to 1-based indexing of the expression language -- TODO confirm.
water.exec.Exec2.exec(Key.make().toString() + " = anonymous <- function(x) \n{\n " + method + "(x[," + (col_id + 1) + "])\n}").remove_and_unlock();
// Run ddply over the group-by columns with that function; the result is assigned to mykey.
Env env = water.exec.Exec2.exec(mykey.toString() + " = ddply(" + source._key.toString() + ", " + toAryString(_cols) + ", anonymous)");
final Frame grp_replacement = new Frame(env.peekAry());
env.remove_and_unlock();
Log.info("GROUP TASK NUM COLS: " + grp_replacement.numCols());
// NOTE(review): numCols() - 1 is presumably the index of the column holding the per-group
// value (the last one) -- confirm against GroupTask.
final GroupTask grp2val = new GroupTask(grp_replacement.numCols() - 1).doAll(grp_replacement);
new MRTask2() {
@Override
public void map(Chunk[] cs) {
Chunk c = cs[col_id];
int rows = cs[0].len();
for (int r = 0; r < rows; ++r) {
// Same qualifying test as in the ungrouped branch.
if (c.isNA0(r) || (c._vec.isEnum() && c._vec.domain()[(int) c.at0(r)].equals("NA"))) {
// Build the lookup key for this row from the group-by columns.
Group g = new Group(_cols.length);
g.fill(r, cs, _cols);
// Rows whose group has no entry in the lookup are left unchanged.
if (grp2val._grp2val.get(g) == null)
continue;
double rv = grp2val._grp2val.get(g);
c.set0(r, rv);
}
}
}
}.doAll(source);
}
return Inspect2.redirect(this, source._key.toString());
} catch (Throwable t) {
// Every failure, including Errors, is reported back as an error response.
return Response.error(t);
} finally {
// Delete frames
UKV.remove(mykey);
}
}
use of water.fvec.Chunk in project h2o-2 by h2oai.
the class ChunkSummary method map.
/**
 * Tallies, for the chunks handed to this map call, how many chunks of each known type were
 * seen and how many bytes they occupy, both per type and for the local node.
 *
 * <p>A chunk type is recognised when the chunk's simple class name equals an entry of
 * {@code chunkTypes} followed by {@code "Chunk"}; an unrecognised chunk aborts with
 * {@code H2O.unimpl()}.
 *
 * @param cs the chunks to summarise
 */
@Override
public void map(Chunk[] cs) {
    chunk_counts = new long[chunkTypes.length];
    chunk_byte_sizes = new long[chunkTypes.length];
    byte_size_per_node = new long[H2O.CLOUD.size()];
    for (int i = 0; i < cs.length; i++) {
        final Chunk chunk = cs[i];
        final String simpleName = chunk.getClass().getSimpleName();
        boolean recognised = false;
        for (int t = 0; t < chunkTypes.length; t++) {
            if (!simpleName.equals(chunkTypes[t] + "Chunk")) {
                continue;
            }
            recognised = true;
            chunk_counts[t] += 1;
            chunk_byte_sizes[t] += chunk.byteSize();
            byte_size_per_node[H2O.SELF.index()] += chunk.byteSize();
        }
        if (!recognised) {
            throw H2O.unimpl();
        }
    }
}
use of water.fvec.Chunk in project h2o-3 by h2oai.
the class VecDataInputStream method fetchData.
/**
 * Loads the chunk containing {@code position} into the stream's buffer and updates the
 * buffer offset and the in-buffer read position accordingly.
 *
 * @param position position whose enclosing chunk should become the current buffer
 */
private void fetchData(long position) {
    final Chunk enclosing = _v.chunkForRow(position);
    _buffer = enclosing.asBytes();
    final long chunkStart = enclosing.start();
    _offset = chunkStart;
    // Read position relative to the start of the freshly loaded buffer.
    _pos = (int) (position - _offset);
    assert _buffer.length > 0;
}
use of water.fvec.Chunk in project h2o-3 by h2oai.
the class VecDataInputStream method read.
/**
 * Copies up to {@code length} bytes, starting at {@code position}, into {@code buffer}.
 *
 * <p>Bytes come from the currently held buffer when {@code inBuffer} reports the position is
 * covered by it, otherwise from the chunk that contains the position. Copying stops once
 * {@code length} bytes were copied or the end of the underlying vector is reached.
 *
 * @param position position to start reading from
 * @param buffer   destination array
 * @param offset   index in {@code buffer} at which the first byte is stored
 * @param length   maximum number of bytes to copy
 * @return the number of bytes actually copied
 * @throws IOException declared by the overridden signature
 */
@Override
public int read(final long position, byte[] buffer, int offset, int length) throws IOException {
    int copied = 0;
    long cursor = position;
    while (copied < length && cursor < _v.length()) {
        final byte[] src;
        final int srcPos;
        if (!inBuffer(cursor)) {
            // Not covered by the held buffer: read straight from the enclosing chunk.
            Chunk chunk = _v.chunkForRow(cursor);
            src = chunk.asBytes();
            srcPos = (int) (cursor - chunk.start());
        } else {
            src = _buffer;
            srcPos = (int) (cursor - _offset);
        }
        // Limited by what is left in the source bytes and by what the caller still wants.
        final int step = Math.min(src.length - srcPos, length - copied);
        System.arraycopy(src, srcPos, buffer, offset + copied, step);
        copied += step;
        cursor += step;
    }
    return copied;
}
Aggregations