use of water.fvec.Vec.VectorGroup in project h2o-2 by h2oai.
the class MRTask2 method compute2.
/**
 * Called from F/J threads to do local work. The first called task (which is
 * also the last one to complete) also reduces any global work. Called
 * internally by F/J; not expected to be user-called.
 *
 * <p>A range of two or more chunks is split at the mid-point: the left half is
 * forked and the right half is computed on the calling thread. A single-chunk
 * range is processed in place when {@code _run_local} is set or the chunk's key
 * reports that it is homed here: the input chunks are loaded, output chunks are
 * created when {@code _noutputs > 0}, every applicable {@code map} overload is
 * invoked, and finally all chunks are closed.
 *
 * @throws RuntimeException wrapping any {@code Throwable} raised while loading
 *     an input chunk
 */
@Override
public final void compute2() {
  _t0 = System.nanoTime();
  assert _left == null && _rite == null && _res == null;
  _profile._mapstart = System.currentTimeMillis();
  if (_hi - _lo >= 2) {
    // Multi-chunk case: just divide-and-conquer to 1 chunk
    // Mid-point
    final int mid = (_lo + _hi) >>> 1;
    _left = clone();
    _rite = clone();
    _left._profile = new MRProfile(this);
    _rite._profile = new MRProfile(this);
    // Reset mid-point
    _left._hi = mid;
    // Also set self mid-point
    _rite._lo = mid;
    // One fork awaiting completion
    addToPendingCount(1);
    // Runs in another thread/FJ instance
    _left.fork();
    // Runs in THIS F/J thread
    _rite.compute2();
    _profile._mapdone = System.currentTimeMillis();
    // Not complete until the fork completes
    return;
  }
  // Zero or 1 chunks, and further chunk might not be homed here
  if (_hi > _lo) {
    // Single chunk?
    Vec v0 = _fr.anyVec();
    if (_run_local || v0.chunkKey(_lo).home()) {
      // And chunk is homed here?
      // Make decompression chunk headers for these chunks
      Vec[] vecs = _fr.vecs();
      Chunk[] bvs = new Chunk[vecs.length];
      NewChunk[] appendableChunks = null;
      for (int i = 0; i < vecs.length; i++) {
        // Null Vecs are tolerated; their slot in bvs simply stays null.
        if (vecs[i] == null) {
          continue;
        }
        assert _run_local || vecs[i].chunkKey(_lo).home() : "Chunk=" + _lo + " v0=" + v0 + ", k=" + v0.chunkKey(_lo) + " v[" + i + "]=" + vecs[i] + ", k=" + vecs[i].chunkKey(_lo);
        try {
          bvs[i] = vecs[i].chunkForChunkIdx(_lo);
        } catch (Throwable t) {
          System.err.println("missing chunk in MRTask " + getClass().getName());
          t.printStackTrace();
          throw new RuntimeException(t);
        }
      }
      if (_noutputs > 0) {
        // One appendable output Vec (and its chunk for this index) per requested output
        final VectorGroup vg = vecs[0].group();
        _appendables = new AppendableVec[_noutputs];
        appendableChunks = new NewChunk[_noutputs];
        for (int i = 0; i < _appendables.length; ++i) {
          _appendables[i] = new AppendableVec(vg.vecKey(_vid + i), new long[4], 0);
          appendableChunks[i] = (NewChunk) _appendables[i].chunkForChunkIdx(_lo);
        }
      }
      // Call all the various map() calls that apply. These are deliberately
      // independent ifs (not else-ifs): the array form is always invoked in
      // addition to the fixed-arity convenience form.
      _profile._userstart = System.currentTimeMillis();
      if (_fr.vecs().length == 1)
        map(bvs[0]);
      if (_fr.vecs().length == 2)
        map(bvs[0], bvs[1]);
      if (_fr.vecs().length == 3)
        map(bvs[0], bvs[1], bvs[2]);
      map(bvs);
      if (_noutputs == 1) {
        // convenience versions for cases with single output.
        if (_fr.vecs().length == 1)
          map(bvs[0], appendableChunks[0]);
        if (_fr.vecs().length == 2)
          map(bvs[0], bvs[1], appendableChunks[0]);
        if (_fr.vecs().length == 3)
          map(bvs[0], bvs[1], bvs[2], appendableChunks[0]);
        map(bvs, appendableChunks[0]);
      }
      if (_noutputs == 2) {
        // convenience versions for cases with 2 outputs (e.g split).
        if (_fr.vecs().length == 1)
          map(bvs[0], appendableChunks[0], appendableChunks[1]);
        if (_fr.vecs().length == 2)
          map(bvs[0], bvs[1], appendableChunks[0], appendableChunks[1]);
        if (_fr.vecs().length == 3)
          map(bvs[0], bvs[1], bvs[2], appendableChunks[0], appendableChunks[1]);
        map(bvs, appendableChunks[0], appendableChunks[1]);
      }
      map(bvs, appendableChunks);
      // Save results since called map() at least once!
      _res = self();
      // Further D/K/V put any new vec results.
      _profile._closestart = System.currentTimeMillis();
      for (Chunk bv : bvs) {
        // Slots for null Vecs were never filled above; skip them instead of
        // failing with a NullPointerException after map() has already run.
        if (bv != null) {
          bv.close(_lo, _fs);
        }
      }
      if (_noutputs > 0)
        for (NewChunk nch : appendableChunks) nch.close(_lo, _fs);
    }
  }
  _profile._mapdone = System.currentTimeMillis();
  // And this task is complete
  tryComplete();
}
use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.
the class ParseDataset method forkParseSVMLight.
/**
 * Applies the configured parse chunk size to every file-backed input and
 * starts a job that parses all of {@code keys} into {@code dest}.
 *
 * <p>Each key is looked up in the DKV. A {@code FileVec}, or a {@code Frame}
 * whose first Vec is a {@code FileVec}, has its chunk size set from
 * {@code setup._chunk_size} and contributes its chunk count to the amount of
 * work passed to {@code Job.start}. Any other value is ignored. The
 * destination frame is write-locked before this method returns.
 *
 * @param dest key of the frame to produce
 * @param keys keys of the inputs to parse; may be empty
 * @param setup parse configuration supplying the chunk size
 * @return the job started for the parse
 */
public static Job forkParseSVMLight(final Key<Frame> dest, final Key[] keys, final ParseSetup setup) {
  int nchunks = 0;
  // set the parse chunk size for files
  for (int i = 0; i < keys.length; ++i) {
    Iced ice = DKV.getGet(keys[i]);
    if (ice instanceof FileVec) {
      FileVec fileVec = (FileVec) ice;
      fileVec.setChunkSize(setup._chunk_size);
      nchunks += fileVec.nChunks();
      Log.info("Parse chunk size " + setup._chunk_size);
    } else if (ice instanceof Frame && ((Frame) ice).vec(0) instanceof FileVec) {
      Frame frame = (Frame) ice;
      ((FileVec) frame.vec(0)).setChunkSize(frame, setup._chunk_size);
      // Re-read vec(0) after the resize rather than reusing the earlier reference.
      nchunks += frame.vec(0).nChunks();
      Log.info("Parse chunk size " + setup._chunk_size);
    }
  }
  // NOTE: a VectorGroup used to be derived from the first input here but was
  // never used; computing it threw a NullPointerException whenever keys was
  // empty or the first key was not file-backed, so it has been removed.
  final ParseDataset pds = new ParseDataset(dest);
  // Write-Lock BEFORE returning
  new Frame(pds._job._result, new String[0], new Vec[0]).delete_and_lock(pds._job);
  return pds._job.start(new H2OCountedCompleter() {
    @Override
    public void compute2() {
      ParseDataset.parseAllKeys(pds, keys, setup, true);
      tryComplete();
    }
  }, nchunks);
}
use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.
the class ParseDataset method parseAllKeys.
// --------------------------------------------------------------------------
// Top-level parser driver
/**
 * Drives a complete parse of {@code fkeys} into the frame named by the job's
 * result key.
 *
 * <p>Steps, in order: normalize the setup (blank column names and mismatched
 * NA strings are reset to {@code null}); ingest all files with a
 * {@code MultiFileParseTask}; resolve categorical domains (gathered across
 * nodes, or taken from the setup when the parse type provides them); close the
 * appendable Vecs into the result frame; for SVMLight input, run
 * {@code SVFTask} on all nodes; convert collected parse errors into sorted
 * job warnings; log a summary, unlock the frame, and optionally delete the
 * input keys. The method returns early, without completing later steps, when
 * there are no keys or when the job reports a stop request.
 *
 * @param pds parse state holding the job; its {@code _mfpt} field is set here
 * @param fkeys keys of the inputs to parse
 * @param setup parse configuration; several of its fields are updated in place
 * @param deleteOnDone whether to remove the input keys from the DKV at the end
 * @return {@code pds}
 * @throws H2OParseException if a categorical column's domain reaches
 *     {@code Categorical.MAX_CATEGORICAL_COUNT}
 */
private static ParseDataset parseAllKeys(ParseDataset pds, Key[] fkeys, ParseSetup setup, boolean deleteOnDone) {
  final Job<Frame> job = pds._job;
  assert setup._number_columns > 0;
  // FIXME: annoyingly front end sends column names as String[] {""} even if setup returned null
  if (setup._column_names != null && ((setup._column_names.length == 0) || (setup._column_names.length == 1 && setup._column_names[0].isEmpty())))
    setup._column_names = null;
  if (setup._na_strings != null && setup._na_strings.length != setup._number_columns)
    setup._na_strings = null;
  if (fkeys.length == 0) {
    job.stop();
    return pds;
  }
  job.update(0, "Ingesting files.");
  VectorGroup vg = getByteVec(fkeys[0]).group();
  MultiFileParseTask mfpt = pds._mfpt = new MultiFileParseTask(vg, setup, job._key, fkeys, deleteOnDone);
  mfpt.doAll(fkeys);
  Log.trace("Done ingesting files.");
  if (job.stop_requested())
    return pds;
  final AppendableVec[] avs = mfpt.vecs();
  setup._column_names = getColumnNames(avs.length, setup._column_names);
  Frame fr = null;
  // Calculate categorical domain
  // Filter down to columns with some categoricals
  int n = 0;
  int[] ecols2 = new int[avs.length];
  for (int i = 0; i < avs.length; ++i) {
    // Intended type is categorical (even though no domain has been set)?
    if (avs[i].get_type() == Vec.T_CAT)
      ecols2[n++] = i;
  }
  final int[] ecols = Arrays.copyOf(ecols2, n);
  // If we have any, go gather unified categorical domains
  if (n > 0) {
    if (!setup.getParseType().isDomainProvided) {
      // Domains are not provided via setup we need to collect them
      job.update(0, "Collecting categorical domains across nodes.");
      {
        GatherCategoricalDomainsTask gcdt = new GatherCategoricalDomainsTask(mfpt._cKey, ecols).doAllNodes();
        // Test domains for excessive length.
        List<String> offendingColNames = new ArrayList<>();
        for (int i = 0; i < ecols.length; i++) {
          if (gcdt.getDomainLength(i) < Categorical.MAX_CATEGORICAL_COUNT) {
            if (gcdt.getDomainLength(i) == 0)
              // The all-NA column
              avs[ecols[i]].setBad();
            else
              avs[ecols[i]].setDomain(gcdt.getDomain(i));
          } else
            offendingColNames.add(setup._column_names[ecols[i]]);
        }
        if (offendingColNames.size() > 0)
          throw new H2OParseException("Exceeded categorical limit on columns " + offendingColNames + ". Consider reparsing these columns as a string.");
      }
      Log.trace("Done collecting categorical domains across nodes.");
    } else {
      // Domains come from the setup; no length check is applied to them
      for (int i = 0; i < ecols.length; i++) {
        avs[ecols[i]].setDomain(setup._domains[ecols[i]]);
      }
    }
    job.update(0, "Compressing data.");
    fr = new Frame(job._result, setup._column_names, AppendableVec.closeAll(avs));
    fr.update(job);
    Log.trace("Done compressing data.");
    if (!setup.getParseType().isDomainProvided) {
      // Update categoricals to the globally agreed numbering
      Vec[] evecs = new Vec[ecols.length];
      for (int i = 0; i < evecs.length; ++i) evecs[i] = fr.vecs()[ecols[i]];
      job.update(0, "Unifying categorical domains across nodes.");
      {
        // new CreateParse2GlobalCategoricalMaps(mfpt._cKey).doAll(evecs);
        // Using Dtask since it starts and returns faster than an MRTask
        CreateParse2GlobalCategoricalMaps[] fcdt = new CreateParse2GlobalCategoricalMaps[H2O.CLOUD.size()];
        RPC[] rpcs = new RPC[H2O.CLOUD.size()];
        for (int i = 0; i < fcdt.length; i++) {
          H2ONode[] nodes = H2O.CLOUD.members();
          fcdt[i] = new CreateParse2GlobalCategoricalMaps(mfpt._cKey, fr._key, ecols);
          rpcs[i] = new RPC<>(nodes[i], fcdt[i]).call();
        }
        // Wait for every node before rewriting the chunks
        for (RPC rpc : rpcs) rpc.get();
        new UpdateCategoricalChunksTask(mfpt._cKey, mfpt._chunk2ParseNodeMap).doAll(evecs);
        MultiFileParseTask._categoricals.remove(mfpt._cKey);
      }
      Log.trace("Done unifying categoricals across nodes.");
    }
  } else {
    // No categoricals case
    job.update(0, "Compressing data.");
    fr = new Frame(job._result, setup._column_names, AppendableVec.closeAll(avs));
    Log.trace("Done closing all Vecs.");
  }
  // Check for job cancellation
  if (job.stop_requested())
    return pds;
  // SVMLight is sparse format, there may be missing chunks with all 0s, fill them in
  if (setup._parse_type.equals(SVMLight_INFO))
    new SVFTask(fr).doAllNodes();
  // Check for job cancellation
  if (job.stop_requested())
    return pds;
  ParseWriter.ParseErr[] errs = ArrayUtils.append(setup._errs, mfpt._errors);
  if (errs.length > 0) {
    // compute global line numbers for warnings/errs
    HashMap<String, Integer> fileChunkOffsets = new HashMap<>();
    for (int i = 0; i < mfpt._fileChunkOffsets.length; ++i) fileChunkOffsets.put(fkeys[i].toString(), mfpt._fileChunkOffsets[i]);
    long[] espc = fr.anyVec().espc();
    for (int i = 0; i < errs.length; ++i) {
      if (fileChunkOffsets.containsKey(errs[i]._file)) {
        int espcOff = fileChunkOffsets.get(errs[i]._file);
        errs[i]._gLineNum = espc[espcOff + errs[i]._cidx] + errs[i]._lineNum;
        errs[i]._lineNum = errs[i]._gLineNum - espc[espcOff];
      }
    }
    // Order by global line number, then byte offset, then message text.
    // Long.compare is used instead of subtracting and narrowing to int: the
    // narrowing cast was applied before the sign test and could report the
    // wrong order (or break the comparator contract) for large differences.
    SortedSet<ParseWriter.ParseErr> s = new TreeSet<>(new Comparator<ParseWriter.ParseErr>() {
      @Override
      public int compare(ParseWriter.ParseErr o1, ParseWriter.ParseErr o2) {
        int res = Long.compare(o1._gLineNum, o2._gLineNum);
        if (res == 0)
          res = Long.compare(o1._byteOffset, o2._byteOffset);
        if (res == 0)
          res = o1._err.compareTo(o2._err);
        return res;
      }
    });
    Collections.addAll(s, errs);
    String[] warns = new String[s.size()];
    int i = 0;
    for (ParseWriter.ParseErr err : s) Log.warn(warns[i++] = err.toString());
    job.setWarnings(warns);
  }
  job.update(0, "Calculating data summary.");
  logParseResults(fr);
  // Release the frame for overwriting
  fr.update(job);
  Frame fr2 = DKV.getGet(fr._key);
  assert fr2._names.length == fr2.numCols();
  fr.unlock(job);
  // Remove CSV files from H2O memory
  if (deleteOnDone)
    for (Key k : fkeys) {
      DKV.remove(k);
      assert DKV.get(k) == null : "Input key " + k + " not deleted during parse";
    }
  return pds;
}
use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.
the class MakeGLMModelHandler method computeGram.
/**
 * Computes the Gram matrix of the frame referenced by {@code input.X} and
 * stores it in the DKV as a new frame, one Vec per row of the matrix.
 *
 * <p>If {@code input.W} names a column, that column is removed from a copy of
 * the frame and used as the weight vector. The result frame's columns are
 * named after the coefficient names plus {@code "Intercept"}. Its key is the
 * input key (with a trailing {@code .hex} stripped) followed by
 * {@code _gram} and, when weighted, {@code _<weight column>}; if that key is
 * already present, a numeric suffix {@code _0} .. {@code _999} is tried.
 *
 * @param v not used by this method (presumably the API version; confirm
 *     against the caller)
 * @param input request schema; its {@code destination_frame} is filled in
 * @return {@code input}
 * @throws IllegalArgumentException if the frame or the weight column does not
 *     exist, or no unused destination key could be found
 */
public GramV3 computeGram(int v, GramV3 input) {
  if (DKV.get(input.X.key()) == null)
    throw new IllegalArgumentException("Frame " + input.X.key() + " does not exist.");
  Frame fr = input.X.key().get();
  // Work on a shallow copy so removing the weight column leaves the original frame intact
  Frame frcpy = new Frame(fr._names.clone(), fr.vecs().clone());
  String wname = null;
  Vec weight = null;
  if (input.W != null && !input.W.column_name.isEmpty()) {
    wname = input.W.column_name;
    if (fr.find(wname) == -1)
      throw new IllegalArgumentException("Did not find weight vector " + wname);
    weight = frcpy.remove(wname);
  }
  DataInfo dinfo = new DataInfo(frcpy, null, 0, input.use_all_factor_levels, input.standardize ? TransformType.STANDARDIZE : TransformType.NONE, TransformType.NONE, input.skip_missing, false, !input.skip_missing, /* weight */
  false, /* offset */
  false, /* fold */
  false, /* intercept */
  true);
  DKV.put(dinfo);
  double[][] gram;
  try {
    if (weight != null)
      dinfo.setWeights(wname, weight);
    Gram.GramTask gt = new Gram.GramTask(null, dinfo, false, true).doAll(dinfo._adaptedFrame);
    gram = gt._gram.getXX();
  } finally {
    // Undo the DKV.put above even when the Gram computation fails, so a
    // failed request does not leave the temporary DataInfo behind.
    dinfo.remove();
  }
  String[] names = water.util.ArrayUtils.append(dinfo.coefNames(), "Intercept");
  Vec[] vecs = new Vec[gram.length];
  Key[] keys = new VectorGroup().addVecs(vecs.length);
  for (int i = 0; i < vecs.length; ++i) vecs[i] = Vec.makeVec(gram[i], keys[i]);
  input.destination_frame = new KeyV3.FrameKeyV3();
  String keyname = input.X.key().toString();
  if (keyname.endsWith(".hex"))
    keyname = keyname.substring(0, keyname.lastIndexOf("."));
  keyname = keyname + "_gram";
  if (weight != null)
    keyname = keyname + "_" + wname;
  Key k = Key.make(keyname);
  if (DKV.get(k) != null) {
    // Base key is taken: probe keyname_0 .. keyname_999 for a free one
    int cnt = 0;
    while (cnt < 1000 && DKV.get(k = Key.make(keyname + "_" + cnt)) != null) cnt++;
    if (cnt == 1000)
      throw new IllegalArgumentException("unable to make unique key");
  }
  input.destination_frame.fillFromImpl(k);
  DKV.put(new Frame(k, names, vecs));
  return input;
}
use of water.fvec.Vec.VectorGroup in project h2o-3 by h2oai.
the class MRTask method compute2.
/**
 * Called from F/J threads to do local work. The first called task (which is
 * also the last one to complete) also reduces any global work. Called
 * internally by F/J; not expected to be user-called.
 *
 * <p>A range of two or more entries is split at the mid-point: the left half
 * is forked and the right half is computed on the calling thread, each only
 * if this task has not already completed abnormally. Otherwise:
 * <ul>
 *   <li>with no Frame, {@code map(Key)} is invoked for the single key when it
 *       is homed here, or nothing is mapped when {@code _keys} is null
 *       (once-per-node mode); in both cases the task records itself as the
 *       result;</li>
 *   <li>with a Frame and a single chunk that is homed here (or when
 *       {@code _run_local} is set), input chunks are loaded, output chunks are
 *       created when {@code _output_types} is set, the applicable {@code map}
 *       overloads are invoked, and all chunks are closed.</li>
 * </ul>
 * Profiling timestamps are recorded only when {@code _profile} is non-null.
 */
@Override
public final void compute2() {
  assert _left == null && _rite == null && _res == null;
  if (_profile != null)
    _profile._mapstart = System.currentTimeMillis();
  if ((_hi - _lo) >= 2) {
    // Multi-chunk case: just divide-and-conquer to 1 chunk
    // Mid-point
    final int mid = (_lo + _hi) >>> 1;
    _left = copyAndInit();
    _rite = copyAndInit();
    // Reset mid-point
    _left._hi = mid;
    // Also set self mid-point
    _rite._lo = mid;
    // One fork awaiting completion
    addToPendingCount(1);
    // Runs in another thread/FJ instance
    if (!isCompletedAbnormally())
      _left.fork();
    // Runs in THIS F/J thread
    if (!isCompletedAbnormally())
      _rite.compute2();
    if (_profile != null)
      _profile._mapdone = System.currentTimeMillis();
    // Not complete until the fork completes
    return;
  }
  // Zero or 1 chunks, and further chunk might not be homed here
  if (_fr == null) {
    // No Frame, so doing Keys?
    // Either once-per-node mode (no keys), or a single key that is homed here
    if (_keys == null || (_hi > _lo && _keys[_lo].home())) {
      assert (_keys == null || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";
      if (_profile != null)
        _profile._userstart = System.currentTimeMillis();
      if (_keys != null)
        map(_keys[_lo]);
      // Save results since called map() at least once!
      _res = self();
      if (_profile != null)
        _profile._closestart = System.currentTimeMillis();
    }
  } else if (_hi > _lo) {
    // Frame, Single chunk?
    Vec v0 = _fr.anyVec();
    if (_run_local || v0.chunkKey(_lo).home()) {
      // And chunk is homed here?
      assert (_run_local || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";
      // Make decompression chunk headers for these chunks
      Vec[] vecs = _fr.vecs();
      Chunk[] bvs = new Chunk[vecs.length];
      NewChunk[] appendableChunks = null;
      for (int i = 0; i < vecs.length; i++) {
        // Null Vecs are tolerated; their slot in bvs simply stays null.
        if (vecs[i] == null) {
          continue;
        }
        assert _run_local || vecs[i].chunkKey(_lo).home() : "Chunk=" + _lo + " v0=" + v0 + ", k=" + v0.chunkKey(_lo) + " v[" + i + "]=" + vecs[i] + ", k=" + vecs[i].chunkKey(_lo);
        bvs[i] = vecs[i].chunkForChunkIdx(_lo);
      }
      if (_output_types != null) {
        // One appendable output Vec (and its chunk for this index) per output type
        final VectorGroup vg = vecs[0].group();
        _appendables = new AppendableVec[_output_types.length];
        appendableChunks = new NewChunk[_output_types.length];
        for (int i = 0; i < _appendables.length; ++i) {
          _appendables[i] = new AppendableVec(vg.vecKey(_vid + i), _output_types[i]);
          appendableChunks[i] = _appendables[i].chunkForChunkIdx(_lo);
        }
      }
      // Call all the various map() calls that apply
      if (_profile != null)
        _profile._userstart = System.currentTimeMillis();
      int num_fr_vecs = _fr.vecs().length;
      int num_outputs = _output_types == null ? 0 : _output_types.length;
      if (num_outputs == 0) {
        if (num_fr_vecs == 1)
          map(bvs[0]);
        else if (num_fr_vecs == 2)
          map(bvs[0], bvs[1]);
        else if (num_fr_vecs == 3)
          map(bvs[0], bvs[1], bvs[2]);
        map(bvs);
      } else if (num_outputs == 1) {
        // convenience versions for cases with single output.
        assert appendableChunks != null;
        if (num_fr_vecs == 1)
          map(bvs[0], appendableChunks[0]);
        else if (num_fr_vecs == 2)
          map(bvs[0], bvs[1], appendableChunks[0]);
        // else if (fr_vecs_length == 3) map(bvs[0], bvs[1], bvs[2], appendableChunks[0]);
        map(bvs, appendableChunks[0]);
      } else if (num_outputs == 2) {
        // convenience versions for cases with 2 outputs (e.g split).
        assert appendableChunks != null;
        if (num_fr_vecs == 1)
          map(bvs[0], appendableChunks[0], appendableChunks[1]);
        // else if (fr_vecs_length == 2) map(bvs[0], bvs[1], appendableChunks[0], appendableChunks[1]);
        // else if (fr_vecs_length == 3) map(bvs[0], bvs[1], bvs[2], appendableChunks[0], appendableChunks[1]);
        map(bvs, appendableChunks[0], appendableChunks[1]);
      }
      // The fully general form is invoked for every output count
      // (appendableChunks is null when there are no outputs).
      map(bvs, appendableChunks);
      // Save results since called map() at least once!
      _res = self();
      // Further D/K/V put any new vec results.
      if (_profile != null)
        _profile._closestart = System.currentTimeMillis();
      for (Chunk bv : bvs) {
        // Slots for null Vecs were never filled above; skip them instead of
        // failing with a NullPointerException after map() has already run.
        if (bv != null) {
          bv.close(_lo, _fs);
        }
      }
      if (_output_types != null)
        for (NewChunk nch : appendableChunks) nch.close(_lo, _fs);
    }
  }
  if (_profile != null)
    _profile._mapdone = System.currentTimeMillis();
  tryComplete();
}
Aggregations