Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.
Class NeuralNet, method startTrain:
void startTrain() {
  logStart();
  running = true;
  // Vec[] vecs = Utils.append(_train, response);
  // reChunk(vecs);
  // final Vec[] train = new Vec[vecs.length - 1];
  // System.arraycopy(vecs, 0, train, 0, train.length);
  // final Vec trainResp = classification ? vecs[vecs.length - 1].toEnum() : vecs[vecs.length - 1];
  final Vec[] train = _train;
  final Vec trainResp = classification ? response.toEnum() : response;
  final Layer[] ls = new Layer[hidden.length + 2];
  ls[0] = new VecsInput(train, null);
  for (int i = 0; i < hidden.length; i++) {
    switch (activation) {
      case Tanh:
        ls[i + 1] = new Tanh(hidden[i]);
        break;
      case TanhWithDropout:
        ls[i + 1] = new TanhDropout(hidden[i]);
        break;
      case Rectifier:
        ls[i + 1] = new Rectifier(hidden[i]);
        break;
      case RectifierWithDropout:
        ls[i + 1] = new RectifierDropout(hidden[i]);
        break;
      case Maxout:
        ls[i + 1] = new Maxout(hidden[i]);
        break;
      case MaxoutWithDropout:
        ls[i + 1] = new MaxoutDropout(hidden[i]);
        break;
    }
  }
  if (classification)
    ls[ls.length - 1] = new VecSoftmax(trainResp, null);
  else
    ls[ls.length - 1] = new VecLinear(trainResp, null);
  // Copy parameters from NeuralNet, and set previous/input layer links.
  for (int i = 0; i < ls.length; i++)
    ls[i].init(ls, i, this);
  final Key sourceKey = Key.make(input("source"));
  final Frame frame = new Frame(_names, train);
  frame.add(_responseName, trainResp);
  final Errors[] trainErrors0 = new Errors[] { new Errors() };
  final Errors[] validErrors0 = validation == null ? null : new Errors[] { new Errors() };
  NeuralNetModel model = new NeuralNetModel(destination_key, sourceKey, frame, ls, this);
  model.training_errors = trainErrors0;
  model.validation_errors = validErrors0;
  model.delete_and_lock(self());
  final Frame[] adapted = validation == null ? null : model.adapt(validation, false);
  final Trainer trainer;
  final long num_rows = source.numRows();
  if (mode == SingleThread) {
    Log.info("Entering single-threaded execution mode");
    trainer = new Trainer.Direct(ls, epochs, self());
  } else {
    // One node works on the first batch of points serially for improved stability.
    if (warmup_samples > 0) {
      Log.info("Training the first " + warmup_samples + " samples in serial for improved stability.");
      Trainer warmup = new Trainer.Direct(ls, (double) warmup_samples / num_rows, self());
      warmup.start();
      warmup.join();
      // TODO: for MapReduce, send weights from the master VM to all other VMs
    }
    if (mode == SingleNode) {
      Log.info("Entering single-node (multi-threaded Hogwild) execution mode.");
      trainer = new Trainer.Threaded(ls, epochs, self(), -1);
    } else if (mode == MapReduce) {
      if (warmup_samples > 0) {
        Log.info("Multi-threaded warmup with " + warmup_samples + " samples.");
        Trainer warmup = new Trainer.Threaded(ls, (double) warmup_samples / num_rows, self(), -1);
        warmup.start();
        warmup.join();
        // TODO: for MapReduce, send weights from the master VM to all other VMs
      }
      Log.info("Entering multi-node (MapReduce + multi-threaded Hogwild) execution mode.");
      trainer = new Trainer.MapReduce(ls, epochs, self());
    } else
      throw new RuntimeException("invalid execution mode.");
  }
Log.info("Running for " + epochs + " epochs.");
final NeuralNet nn = this;
// Use a separate thread for monitoring (blocked most of the time)
Thread monitor = new Thread() {
Errors[] trainErrors = trainErrors0, validErrors = validErrors0;
@Override
public void run() {
try {
Vec[] valid = null;
Vec validResp = null;
if (validation != null) {
assert adapted != null;
final Vec[] vs = adapted[0].vecs();
valid = Arrays.copyOf(vs, vs.length - 1);
System.arraycopy(adapted[0].vecs(), 0, valid, 0, valid.length);
validResp = vs[vs.length - 1];
}
//score the model every 2 seconds (or less often, if it takes longer to score)
final long num_samples_total = (long) (Math.ceil(num_rows * epochs));
long num = -1, last_eval = runTimeMs();
do {
//time between evaluations
final long interval = (long) (score_interval * 1000);
long time_taken = runTimeMs() - last_eval;
if (num >= 0 && time_taken < interval) {
Thread.sleep(interval - time_taken);
}
last_eval = runTimeMs();
num = eval(valid, validResp);
if (num >= num_samples_total)
break;
if (mode != MapReduce) {
if (!isRunning(self()) || !running)
break;
} else {
//MapReduce calls cancel() early, we are waiting for running = false
if (!running)
break;
}
} while (true);
// remove validation data
if (adapted != null && adapted[1] != null)
adapted[1].delete();
Log.info("Training finished.");
} catch (Exception ex) {
cancel(ex);
}
}
    private long eval(Vec[] valid, Vec validResp) {
      long[][] cm = null;
      if (classification) {
        int classes = ls[ls.length - 1].units;
        cm = new long[classes][classes];
      }
      NeuralNetModel model = new NeuralNetModel(destination_key, sourceKey, frame, ls, nn);
      // Score the model on the training set.
      Errors e = eval(train, trainResp, score_training, valid == null ? cm : null);
      e.score_training = score_training == 0 ? train[0].length() : score_training;
      trainErrors = Utils.append(trainErrors, e);
      model.unstable |= Double.isNaN(e.mean_square) || Double.isNaN(e.cross_entropy);
      model.training_errors = trainErrors;
      // Score the model on the validation set.
      if (valid != null) {
        e = eval(valid, validResp, score_validation, cm);
        e.score_validation = score_validation == 0 ? valid[0].length() : score_validation;
        validErrors = Utils.append(validErrors, e);
        model.unstable |= Double.isNaN(e.mean_square) || Double.isNaN(e.cross_entropy);
      }
      model.validation_errors = validErrors;
      model.confusion_matrix = cm;
      model.update(self());
      // Terminate model building if we detect that the model is unstable.
      if (model.unstable)
        NeuralNet.running = false;
      return e.training_samples;
    }

    private Errors eval(Vec[] vecs, Vec resp, long n, long[][] cm) {
      Errors e = NeuralNet.eval(ls, vecs, resp, n, cm);
      e.training_samples = trainer.processed();
      e.training_time_ms = runTimeMs();
      return e;
    }
  };
  trainer.start();
  monitor.start();
  trainer.join();
  // Gracefully terminate the job submitted via the H2O web API.
  if (mode != MapReduce) {
    // Tell the monitor thread to finish too.
    running = false;
    try {
      monitor.join();
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
  } else {
    // MapReduce will inform us that running = false.
    while (running) {
      try {
        Thread.sleep(1);
      } catch (InterruptedException e) {
        e.printStackTrace();
      }
    }
  }
  // Remove this job -> stop the H2O interface from refreshing.
  H2OCountedCompleter task = _fjtask;
  if (task != null)
    task.tryComplete();
  this.remove();
}
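The method above pairs a blocking trainer with a polling monitor thread, coordinated through the running flag. A minimal sketch of just that coordination in plain Java; the class and the training/scoring stand-ins are illustrative, not part of the H2O API:

public class TrainMonitorSketch {
  static volatile boolean running = true;

  public static void main(String[] args) throws InterruptedException {
    Thread trainer = new Thread(() -> {
      // Stand-in for Trainer: the long-running training work.
      for (int epoch = 0; epoch < 10; epoch++) { /* train one epoch */ }
    });
    Thread monitor = new Thread(() -> {
      while (running) {
        // Stand-in for eval(): score periodically, then sleep until the next interval.
        try { Thread.sleep(2000); } catch (InterruptedException e) { return; }
      }
    });
    trainer.start();
    monitor.start();
    trainer.join();  // block until training completes
    running = false; // tell the monitor thread to finish too
    monitor.join();
  }
}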
Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.
Class RPC, method call:
// Make an initial RPC, or re-send a packet. Always called on the first send;
// also called on a timeout.
public synchronized RPC<V> call() {
  ++_callCnt;
  // Add it to the RPC call.
  if (_dt.getCompleter() != null) {
    CountedCompleter cc = _dt.getCompleter();
    assert cc instanceof H2OCountedCompleter;
    boolean alreadyIn = false;
    if (_fjtasks != null)
      for (H2OCountedCompleter hcc : _fjtasks)
        if (hcc == cc)
          alreadyIn = true;
    if (!alreadyIn)
      addCompleter((H2OCountedCompleter) cc);
    _dt.setCompleter(null);
  }
  // If running on self, just submit to the queues & do it locally.
  if (_target == H2O.SELF) {
    assert _dt.getCompleter() == null;
    _dt.setCompleter(new H2O.H2OCallback<DTask>() {
      @Override
      public void callback(DTask dt) {
        assert dt == _dt;
        synchronized (RPC.this) {
          // F/J guarantees this is called once.
          assert !_done;
          _done = true;
          RPC.this.notifyAll();
        }
        doAllCompletions();
      }

      @Override
      public boolean onExceptionalCompletion(Throwable ex, CountedCompleter dt) {
        assert dt == _dt;
        synchronized (RPC.this) {
          // Filter down to the first exceptional completion.
          if (_done)
            return true;
          _dt.setException(ex);
          // Must be set as the last thing before notify; the waiting thread can wake up at any time!
          _done = true;
          RPC.this.notifyAll();
        }
        doAllCompletions();
        return true;
      }
    });
    H2O.submitTask(_dt);
    return this;
  }
  // Keep a global record, for a while.
  if (_target != null)
    _target.taskPut(_tasknum, this);
  try {
    // We could be racing timeouts-vs-replies. Blow off the timeout if we have an answer.
    if (isDone()) {
      if (_target != null)
        _target.taskRemove(_tasknum);
      return this;
    }
    // Send the basic UDP control packet.
    if (!_sentTcp) {
      // Ship the UDP packet! Retry loop for broken TCP sends.
      while (true) {
        AutoBuffer ab = new AutoBuffer(_target);
        try {
          ab.putTask(UDP.udp.exec, _tasknum).put1(CLIENT_UDP_SEND).put(_dt);
          boolean t = ab.hasTCP();
          assert sz_check(ab) : "Resend of " + _dt.getClass() + " changes size from " + _size + " to " + ab.size() + " for task#" + _tasknum;
          // Then close; send the final byte.
          ab.close();
          // Set after close (and any other possible failure).
          _sentTcp = t;
          // Break out of the retry loop.
          break;
        } catch (AutoBuffer.AutoBufferException e) {
          Log.info_no_DKV(Log.Tag.Sys.WATER, "IOException during RPC call: " + e._ioe.getMessage() + ", AB=" + ab + ", for task#" + _tasknum + ", waiting and retrying...");
          ab.drainClose();
          try {
            Thread.sleep(500);
          } catch (InterruptedException ignore) {
          }
        }
      } // end of while(true)
    } else {
      // Else it was sent via TCP in a prior attempt, and we've timed out.
      // This means the caller's ACK/answer probably got dropped and we need
      // the caller to resend it (or else the caller is still processing our
      // request). Send a UDP reminder - but with the CLIENT_TCP_SEND flag
      // instead of the UDP send, and no DTask (since it previously went via
      // TCP, there is no need to resend it).
      AutoBuffer ab = new AutoBuffer(_target).putTask(UDP.udp.exec, _tasknum);
      ab.put1(CLIENT_TCP_SEND).close();
    }
    // Double the retry interval until we exceed the existing age. This is the
    // time to delay until we try again. Note that we come here immediately on
    // creation, so the first doubling happens before anybody does any waiting.
    // Also note the generous 5-second cap: ping at least every 5 seconds.
    _retry += (_retry < 5000) ? _retry : 5000;
    // Put self on the "TBD" list of tasks awaiting a timeout.
    // So: don't really 'forget', but remember me in a little bit.
    UDPTimeOutThread.PENDING.add(this);
    return this;
  } catch (Error t) {
    throw Log.err(t);
  }
}
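Both callbacks above rely on the same monitor-object handshake: under the RPC lock, set _done last and then notifyAll(), so a thread blocked waiting for the result never observes a half-finished state. A minimal sketch of that handshake; DoneLatch is an illustrative class, not the RPC API:

class DoneLatch {
  private boolean _done;

  // Called by the completion callback; the F/J framework guarantees one call.
  synchronized void complete() {
    assert !_done;
    _done = true;  // must be the last write before waking waiters
    notifyAll();
  }

  // Called by the thread waiting for the result.
  synchronized void awaitDone() throws InterruptedException {
    while (!_done)
      wait();
  }
}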
Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.
Class Job, method cancel:
private void cancel(final String msg, JobState resultingState) {
  if (resultingState == JobState.CANCELLED) {
    Log.info("Job " + self() + "(" + description + ") was cancelled.");
  } else {
    Log.err("Job " + self() + "(" + description + ") failed.");
    Log.err(msg);
  }
  exception = msg;
  state = resultingState;
  // Replace the finished job with a job handle.
  replaceByJobHandle();
  DKV.write_barrier();
  final Job job = this;
  H2O.submitTask(new H2OCountedCompleter() {
    @Override
    public void compute2() {
      job.onCancelled();
    }
  });
}
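The tail of cancel() is fire-and-forget: the onCancelled() hook runs on an F/J worker rather than on the cancelling thread. A generic helper distilling that idiom; the Runnable wrapper is an assumption for illustration, and only H2O.submitTask, compute2, and tryComplete come from the snippets on this page:

// Run a callback on an F/J worker without blocking the caller.
static void submitAsync(final Runnable callback) {
  H2O.submitTask(new H2OCountedCompleter() {
    @Override
    public void compute2() {
      try {
        callback.run();
      } finally {
        tryComplete();  // complete the F/J task even if the callback throws
      }
    }
  });
}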
Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.
Class Job, method fork:
/**
 * Forks the computation of this job.
 *
 * <p>The call does not block.</p>
 * @return always this job
 */
public Job fork() {
  init();
  H2OCountedCompleter task = new H2OCountedCompleter() {
    @Override
    public void compute2() {
      try {
        try {
          // exec() always waits until the end of the computation.
          Job.this.exec();
          Job.this.remove();
        } catch (Throwable t) {
          if (!(t instanceof ExpectedExceptionForDebug))
            Log.err(t);
          Job.this.cancel(t);
        }
      } finally {
        tryComplete();
      }
    }
  };
  start(task);
  H2O.submitTask(task);
  return this;
}
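From the caller's side, fork() is a non-blocking launch. A hypothetical usage; MyJob is an assumed Job subclass overriding exec() and does not appear above:

Job job = new MyJob().fork();  // returns immediately; exec() runs on the F/J pool
// On success the job removes itself; on a throwable it cancels itself,
// so the caller only needs to poll or inspect job state if interested.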
Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.
Class SharedTreeModelBuilder, method buildLayer:
// --------------------------------------------------------------------------
// Build an entire layer of all K trees.
protected DHistogram[][][] buildLayer(final Frame fr, final DTree[] ktrees, final int[] leafs, final DHistogram[][][] hcs, boolean subset, boolean build_tree_one_node) {
  // Build K trees, one per class.
  // Build up the next-generation tree splits from the current histograms.
  // Nearly all leaves will split one more level. This loop nest is
  //   O( #active_splits * #bins * #ncols )
  // but is NOT over all the data.
  H2OCountedCompleter[] sb1ts = new H2OCountedCompleter[_nclass];
  Vec[] vecs = fr.vecs();
  for (int k = 0; k < _nclass; k++) {
    // Tree for class k.
    final DTree tree = ktrees[k];
    if (tree == null)
      continue;
    // Build a frame with just a single tree (plus work & nid) columns, so the
    // nested MRTask2 ScoreBuildHistogram in ScoreBuildOneTree does not try
    // to close other trees' Vecs when run in parallel.
    Frame fr2 = new Frame(Arrays.copyOf(fr._names, _ncols + 1), Arrays.copyOf(vecs, _ncols + 1));
    fr2.add(fr._names[_ncols + 1 + k], vecs[_ncols + 1 + k]);
    fr2.add(fr._names[_ncols + 1 + _nclass + k], vecs[_ncols + 1 + _nclass + k]);
    fr2.add(fr._names[_ncols + 1 + _nclass + _nclass + k], vecs[_ncols + 1 + _nclass + _nclass + k]);
    // Start building one of the K trees in parallel.
    H2O.submitTask(sb1ts[k] = new ScoreBuildOneTree(k, tree, leafs, hcs, fr2, subset, build_tree_one_node));
  }
  // Block for all K trees to complete.
  boolean did_split = false;
  for (int k = 0; k < _nclass; k++) {
    final DTree tree = ktrees[k];
    if (tree == null)
      continue;
    sb1ts[k].join();
    if (((ScoreBuildOneTree) sb1ts[k])._did_split)
      did_split = true;
  }
  // The layer is done.
  return did_split ? hcs : null;
}
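The parallel structure of buildLayer reduces to fork-all-then-join-all over an array of counted completers, one per class. A minimal sketch of that shape; ScoreOneSketch is a hypothetical H2OCountedCompleter subclass, while submitTask and join are the calls used above:

// Hypothetical H2OCountedCompleter subclass standing in for ScoreBuildOneTree.
class ScoreOneSketch extends H2OCountedCompleter {
  final int _k;
  ScoreOneSketch(int k) { _k = k; }
  @Override
  public void compute2() {
    // ... build and score the tree for class _k ...
    tryComplete();
  }
}

// Fork all K tasks, then join them all, as buildLayer does above.
static void forkJoinAll(int nclass) {
  H2OCountedCompleter[] tasks = new H2OCountedCompleter[nclass];
  for (int k = 0; k < nclass; k++)
    H2O.submitTask(tasks[k] = new ScoreOneSketch(k));  // start K tasks in parallel
  for (int k = 0; k < nclass; k++)
    tasks[k].join();                                   // block until each completes
}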