Search in sources :

Example 6 with H2OCountedCompleter

use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

the class NeuralNet method startTrain.

/**
 * Builds the network layers, creates and locks the model, then runs training
 * and periodic scoring until training ends.
 *
 * Outline, as visible in this method:
 *  1. Assemble the layer array: input layer, one hidden layer per entry of
 *     {@code hidden} (type chosen by {@code activation}), and an output layer
 *     (softmax for classification, linear otherwise).
 *  2. Create a NeuralNetModel with one initial Errors entry for training (and
 *     one for validation when a validation frame is set) and lock it.
 *  3. Pick a Trainer according to {@code mode}, optionally preceded by warmup
 *     passes over {@code warmup_samples} rows.
 *  4. Start the trainer and a monitor thread that repeatedly scores the model
 *     and publishes the updated error history.
 *  5. Wait for termination, complete the fork/join task (if any) and remove
 *     this job.
 *
 * This method blocks the calling thread until training has finished.
 */
void startTrain() {
    logStart();
    running = true;
    // Vec[] vecs = Utils.append(_train, response);
    // reChunk(vecs);
    // final Vec[] train = new Vec[vecs.length - 1];
    // System.arraycopy(vecs, 0, train, 0, train.length);
    // final Vec trainResp = classification ? vecs[vecs.length - 1].toEnum() : vecs[vecs.length - 1];
    final Vec[] train = _train;
    final Vec trainResp = classification ? response.toEnum() : response;
    // Slot 0 is the input layer, slots 1..hidden.length are hidden layers, last slot is the output layer.
    final Layer[] ls = new Layer[hidden.length + 2];
    ls[0] = new VecsInput(train, null);
    for (int i = 0; i < hidden.length; i++) {
        // NOTE(review): there is no default branch; an activation value not listed here
        // would leave ls[i + 1] null and fail later in the init loop below.
        switch(activation) {
            case Tanh:
                ls[i + 1] = new Tanh(hidden[i]);
                break;
            case TanhWithDropout:
                ls[i + 1] = new TanhDropout(hidden[i]);
                break;
            case Rectifier:
                ls[i + 1] = new Rectifier(hidden[i]);
                break;
            case RectifierWithDropout:
                ls[i + 1] = new RectifierDropout(hidden[i]);
                break;
            case Maxout:
                ls[i + 1] = new Maxout(hidden[i]);
                break;
            case MaxoutWithDropout:
                ls[i + 1] = new MaxoutDropout(hidden[i]);
                break;
        }
    }
    if (classification)
        ls[ls.length - 1] = new VecSoftmax(trainResp, null);
    else
        ls[ls.length - 1] = new VecLinear(trainResp, null);
    //copy parameters from NeuralNet, and set previous/input layer links
    for (int i = 0; i < ls.length; i++) ls[i].init(ls, i, this);
    final Key sourceKey = Key.make(input("source"));
    // Frame handed to the model: the training columns plus the response column.
    final Frame frame = new Frame(_names, train);
    frame.add(_responseName, trainResp);
    // Error histories start with a single empty entry; the monitor thread appends to them.
    final Errors[] trainErrors0 = new Errors[] { new Errors() };
    final Errors[] validErrors0 = validation == null ? null : new Errors[] { new Errors() };
    NeuralNetModel model = new NeuralNetModel(destination_key, sourceKey, frame, ls, this);
    model.training_errors = trainErrors0;
    model.validation_errors = validErrors0;
    model.delete_and_lock(self());
    // adapted[0] is used for scoring below; adapted[1], when non-null, is deleted once monitoring ends.
    final Frame[] adapted = validation == null ? null : model.adapt(validation, false);
    final Trainer trainer;
    final long num_rows = source.numRows();
    if (mode == SingleThread) {
        Log.info("Entering single-threaded execution mode");
        trainer = new Trainer.Direct(ls, epochs, self());
    } else {
        // one node works on the first batch of points serially for improved stability
        if (warmup_samples > 0) {
            Log.info("Training the first " + warmup_samples + " samples in serial for improved stability.");
            // The warmup length is expressed as a fraction of an epoch.
            Trainer warmup = new Trainer.Direct(ls, (double) warmup_samples / num_rows, self());
            warmup.start();
            warmup.join();
        //TODO: for MapReduce send weights from master VM to all other VMs
        }
        if (mode == SingleNode) {
            Log.info("Entering single-node (multi-threaded Hogwild) execution mode.");
            trainer = new Trainer.Threaded(ls, epochs, self(), -1);
        } else if (mode == MapReduce) {
            // NOTE(review): the "mode == MapReduce" test is already implied by the enclosing branch.
            // With warmup_samples > 0 this runs a second, multi-threaded warmup in addition to the
            // serial one above — confirm that both are intended.
            if (warmup_samples > 0 && mode == MapReduce) {
                Log.info("Multi-threaded warmup with " + warmup_samples + " samples.");
                Trainer warmup = new Trainer.Threaded(ls, (double) warmup_samples / num_rows, self(), -1);
                warmup.start();
                warmup.join();
            //TODO: for MapReduce send weights from master VM to all other VMs
            }
            Log.info("Entering multi-node (MapReduce + multi-threaded Hogwild) execution mode.");
            trainer = new Trainer.MapReduce(ls, epochs, self());
        } else
            throw new RuntimeException("invalid execution mode.");
    }
    Log.info("Running for " + epochs + " epochs.");
    final NeuralNet nn = this;
    // Use a separate thread for monitoring (blocked most of the time)
    Thread monitor = new Thread() {

        // Growing error histories; each evaluation appends one entry.
        Errors[] trainErrors = trainErrors0, validErrors = validErrors0;

        @Override
        public void run() {
            try {
                Vec[] valid = null;
                Vec validResp = null;
                if (validation != null) {
                    assert adapted != null;
                    // All but the last column are inputs; the last column is the response.
                    final Vec[] vs = adapted[0].vecs();
                    valid = Arrays.copyOf(vs, vs.length - 1);
                    // NOTE(review): Arrays.copyOf above already copied these elements, so this
                    // copy looks redundant (assuming vecs() returns the same contents both times).
                    System.arraycopy(adapted[0].vecs(), 0, valid, 0, valid.length);
                    validResp = vs[vs.length - 1];
                }
                //score the model every score_interval seconds (or less often, if it takes longer to score)
                final long num_samples_total = (long) (Math.ceil(num_rows * epochs));
                // num < 0 marks "no evaluation yet", so the first evaluation happens without sleeping.
                long num = -1, last_eval = runTimeMs();
                do {
                    //time between evaluations
                    final long interval = (long) (score_interval * 1000);
                    long time_taken = runTimeMs() - last_eval;
                    if (num >= 0 && time_taken < interval) {
                        Thread.sleep(interval - time_taken);
                    }
                    last_eval = runTimeMs();
                    num = eval(valid, validResp);
                    // Stop once the trainer reports at least the requested number of samples.
                    if (num >= num_samples_total)
                        break;
                    if (mode != MapReduce) {
                        if (!isRunning(self()) || !running)
                            break;
                    } else {
                        //MapReduce calls cancel() early, we are waiting for running = false
                        if (!running)
                            break;
                    }
                } while (true);
                // remove validation data
                if (adapted != null && adapted[1] != null)
                    adapted[1].delete();
                Log.info("Training finished.");
            } catch (Exception ex) {
                // Any failure while monitoring (including an interrupt during sleep) cancels the job.
                cancel(ex);
            }
        }

        // Scores the current layers on the training set (and on the validation set when present),
        // appends the results to the error histories, and publishes a fresh model snapshot.
        // Returns training_samples of the last Errors computed, which the overload below
        // fills from trainer.processed().
        private long eval(Vec[] valid, Vec validResp) {
            long[][] cm = null;
            if (classification) {
                int classes = ls[ls.length - 1].units;
                cm = new long[classes][classes];
            }
            NeuralNetModel model = new NeuralNetModel(destination_key, sourceKey, frame, ls, nn);
            // score model on training set
            // The confusion matrix is filled from training data only when there is no validation set.
            Errors e = eval(train, trainResp, score_training, valid == null ? cm : null);
            // A score_training of 0 is recorded as the full length of the first training column.
            e.score_training = score_training == 0 ? train[0].length() : score_training;
            trainErrors = Utils.append(trainErrors, e);
            model.unstable |= Double.isNaN(e.mean_square) || Double.isNaN(e.cross_entropy);
            model.training_errors = trainErrors;
            // score model on validation set
            if (valid != null) {
                e = eval(valid, validResp, score_validation, cm);
                e.score_validation = score_validation == 0 ? valid[0].length() : score_validation;
                validErrors = Utils.append(validErrors, e);
                model.unstable |= Double.isNaN(e.mean_square) || Double.isNaN(e.cross_entropy);
            }
            model.validation_errors = validErrors;
            model.confusion_matrix = cm;
            model.update(self());
            // terminate model building if we detect that a model is unstable
            if (model.unstable)
                NeuralNet.running = false;
            return e.training_samples;
        }

        // Scores one data set and stamps the result with the trainer's progress and the elapsed run time.
        private Errors eval(Vec[] vecs, Vec resp, long n, long[][] cm) {
            Errors e = NeuralNet.eval(ls, vecs, resp, n, cm);
            e.training_samples = trainer.processed();
            e.training_time_ms = runTimeMs();
            return e;
        }
    };
    trainer.start();
    monitor.start();
    trainer.join();
    // Gracefully terminate the job submitted via H2O web API
    if (mode != MapReduce) {
        //tell the monitor thread to finish too
        running = false;
        try {
            monitor.join();
        } catch (InterruptedException e) {
            // NOTE(review): the interrupt is only printed; the thread's interrupt status is not restored.
            e.printStackTrace();
        }
    } else {
        // Poll until the flag is cleared elsewhere.
        while (running) {
            //MapReduce will inform us that running = false
            try {
                Thread.sleep(1);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
    // remove this job -> stop H2O interface from refreshing
    // Read the field once so the null check and the call see the same reference.
    H2OCountedCompleter task = _fjtask;
    if (task != null)
        task.tryComplete();
    this.remove();
}
Also used : H2OCountedCompleter(water.H2O.H2OCountedCompleter) Layer(hex.Layer)

Example 7 with H2OCountedCompleter

use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

the class RPC method call.

// Make an initial RPC, or re-send a packet.  Always called on 1st send; also
// called on a timeout.
//
// Summary of what this method does, in order:
//  - counts the call and moves any completer attached to the task onto this RPC;
//  - if the target is this node, installs a completion callback and submits
//    the task locally;
//  - otherwise registers the RPC with the target, sends the request (or a
//    reminder if it was already sent via TCP), grows the retry delay and
//    queues this RPC on the timeout thread's pending list.
// Returns this RPC in every non-exceptional path.
public synchronized RPC<V> call() {
    ++_callCnt;
    // If the task carries its own completer, detach it from the task and
    // add it to the RPC call.
    if (_dt.getCompleter() != null) {
        CountedCompleter cc = _dt.getCompleter();
        assert cc instanceof H2OCountedCompleter;
        // Identity scan so the same completer is not registered twice on a re-send.
        boolean alreadyIn = false;
        if (_fjtasks != null)
            for (H2OCountedCompleter hcc : _fjtasks) if (hcc == cc)
                alreadyIn = true;
        if (!alreadyIn)
            addCompleter((H2OCountedCompleter) cc);
        _dt.setCompleter(null);
    }
    // If running on self, just submit to queues & do locally
    if (_target == H2O.SELF) {
        assert _dt.getCompleter() == null;
        _dt.setCompleter(new H2O.H2OCallback<DTask>() {

            // Normal completion: mark done, wake waiters, then run the registered completions.
            @Override
            public void callback(DTask dt) {
                assert dt == _dt;
                synchronized (RPC.this) {
                    // F/J guarantees called once
                    assert !_done;
                    _done = true;
                    RPC.this.notifyAll();
                }
                // Called after leaving the synchronized block.
                doAllCompletions();
            }

            // Exceptional completion: record the exception on the task (first one only),
            // mark done, wake waiters, then run the registered completions.
            @Override
            public boolean onExceptionalCompletion(Throwable ex, CountedCompleter dt) {
                assert dt == _dt;
                synchronized (RPC.this) {
                    // Filter down to 1st exceptional completion
                    if (_done)
                        return true;
                    _dt.setException(ex);
                    // must be set as the last thing before notify, the waiting thread can wake up at any time!
                    _done = true;
                    RPC.this.notifyAll();
                }
                doAllCompletions();
                return true;
            }
        });
        H2O.submitTask(_dt);
        return this;
    }
    // Keep a global record, for awhile
    if (_target != null)
        _target.taskPut(_tasknum, this);
    try {
        // We could be racing timeouts-vs-replies.  Blow off timeout if we have an answer.
        if (isDone()) {
            if (_target != null)
                _target.taskRemove(_tasknum);
            return this;
        }
        // send the basic UDP control packet.
        if (!_sentTcp) {
            // Ship the UDP packet!
            while (true) {
                // Retry loop for broken TCP sends
                AutoBuffer ab = new AutoBuffer(_target);
                try {
                    ab.putTask(UDP.udp.exec, _tasknum).put1(CLIENT_UDP_SEND).put(_dt);
                    // Captured before close; stored in _sentTcp only if the close succeeds.
                    boolean t = ab.hasTCP();
                    assert sz_check(ab) : "Resend of " + _dt.getClass() + " changes size from " + _size + " to " + ab.size() + " for task#" + _tasknum;
                    // Then close; send final byte
                    ab.close();
                    // Set after close (and any other possible fail)
                    _sentTcp = t;
                    // Break out of retry loop
                    break;
                } catch (AutoBuffer.AutoBufferException e) {
                    Log.info_no_DKV(Log.Tag.Sys.WATER, "IOException during RPC call: " + e._ioe.getMessage() + ",  AB=" + ab + ", for task#" + _tasknum + ", waiting and retrying...");
                    ab.drainClose();
                    // Pause before the next attempt.
                    // NOTE(review): an interrupt here is swallowed and the loop retries immediately;
                    // the interrupt status is not restored — confirm this is intended.
                    try {
                        Thread.sleep(500);
                    } catch (InterruptedException ignore) {
                    }
                }
            }
        // end of while(true)
        } else {
            // Else it was sent via TCP in a prior attempt, and we've timed out.
            // This means the caller's ACK/answer probably got dropped and we need
            // him to resend it (or else the caller is still processing our
            // request).  Send a UDP reminder - but with the CLIENT_TCP_SEND flag
            // instead of the UDP send, and no DTask (since it previously went via
            // TCP, no need to resend it).
            AutoBuffer ab = new AutoBuffer(_target).putTask(UDP.udp.exec, _tasknum);
            ab.put1(CLIENT_TCP_SEND).close();
        }
        // Double retry until we exceed existing age.  This is the time to delay
        // until we try again.  Note that we come here immediately on creation,
        // so the first doubling happens before anybody does any waiting.  Also
        // note the generous 5sec cap: ping at least every 5 sec.
        // NOTE(review): as written, the value doubles while it is below 5000 and then
        // grows by 5000 per call, i.e. the increment is capped rather than the total —
        // confirm how _retry is consumed (presumably milliseconds).
        _retry += (_retry < 5000) ? _retry : 5000;
        // Put self on the "TBD" list of tasks awaiting Timeout.
        // So: dont really 'forget' but remember me in a little bit.
        UDPTimeOutThread.PENDING.add(this);
        return this;
    } catch (Error t) {
        // Errors are passed through Log.err and whatever it returns is thrown.
        throw Log.err(t);
    }
}
Also used : H2OCountedCompleter(water.H2O.H2OCountedCompleter) H2OCountedCompleter(water.H2O.H2OCountedCompleter) CountedCompleter(jsr166y.CountedCompleter)

Example 8 with H2OCountedCompleter

use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

the class Job method cancel.

/**
 * Records the end of this job as either cancelled or failed, then schedules
 * the {@code onCancelled()} hook as a separate task.
 *
 * <p>A {@code CANCELLED} outcome is logged at info level; any other outcome
 * is logged as an error together with {@code msg}.</p>
 *
 * @param msg            text stored in {@code exception}; also logged for non-cancel outcomes
 * @param resultingState state to store in {@code state}
 */
private void cancel(final String msg, JobState resultingState) {
    final boolean failed = resultingState != JobState.CANCELLED;
    if (failed) {
        Log.err("Job " + self() + "(" + description + ") failed.");
        Log.err(msg);
    } else {
        Log.info("Job " + self() + "(" + description + ") was cancelled.");
    }
    exception = msg;
    state = resultingState;
    // Swap the finished job for a job handle before issuing the write barrier.
    replaceByJobHandle();
    DKV.write_barrier();
    // Run the cancellation hook as its own submitted task rather than inline.
    H2O.submitTask(new H2OCountedCompleter() {

        @Override
        public void compute2() {
            Job.this.onCancelled();
        }
    });
}
Also used : H2OCountedCompleter(water.H2O.H2OCountedCompleter)

Example 9 with H2OCountedCompleter

use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

the class Job method fork.

/**
 * Launches this job's computation as a submitted task and returns at once.
 *
 * <p>The submitted task runs {@code exec()} followed by {@code remove()}.
 * If either throws, the throwable is logged (unless it is an
 * {@code ExpectedExceptionForDebug}) and passed to {@code cancel}. The task
 * always signals completion through {@code tryComplete()}.</p>
 *
 * @return this job, always
 */
public Job fork() {
    init();
    final H2OCountedCompleter runner = new H2OCountedCompleter() {

        @Override
        public void compute2() {
            try {
                // exec() returns only once the computation has ended.
                Job.this.exec();
                Job.this.remove();
            } catch (Throwable failure) {
                final boolean expected = failure instanceof ExpectedExceptionForDebug;
                if (!expected) {
                    Log.err(failure);
                }
                Job.this.cancel(failure);
            } finally {
                // Completion is signalled on success and on failure alike.
                tryComplete();
            }
        }
    };
    start(runner);
    H2O.submitTask(runner);
    return this;
}
Also used : ExpectedExceptionForDebug(water.util.Utils.ExpectedExceptionForDebug) H2OCountedCompleter(water.H2O.H2OCountedCompleter)

Example 10 with H2OCountedCompleter

use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

the class SharedTreeModelBuilder method buildLayer.

// --------------------------------------------------------------------------
// Grow every one of the K per-class trees by one layer.
//
// One ScoreBuildOneTree task is submitted per non-null tree, all tasks are
// then joined, and the histograms are returned only if at least one task
// reported a split (null otherwise).  The work is proportional to
//           #active_splits * #bins * #ncols
// and does not scan the full data set.
protected DHistogram[][][] buildLayer(final Frame fr, final DTree[] ktrees, final int[] leafs, final DHistogram[][][] hcs, boolean subset, boolean build_tree_one_node) {
    final H2OCountedCompleter[] sb1ts = new H2OCountedCompleter[_nclass];
    final Vec[] vecs = fr.vecs();
    // Number of leading columns that every per-tree frame shares.
    final int shared = _ncols + 1;
    for (int k = 0; k < _nclass; k++) {
        final DTree tree = ktrees[k];
        if (tree != null) {
            // Give each tree a private frame holding the shared columns plus only
            // its own three per-class columns, so the nested MRTask2 run by
            // ScoreBuildOneTree does not try to close another tree's Vecs when
            // the trees are built in parallel.
            final Frame fr2 = new Frame(Arrays.copyOf(fr._names, shared), Arrays.copyOf(vecs, shared));
            for (int group = 0; group < 3; group++) {
                final int col = shared + group * _nclass + k;
                fr2.add(fr._names[col], vecs[col]);
            }
            // Kick off this tree's build without waiting for it.
            sb1ts[k] = new ScoreBuildOneTree(k, tree, leafs, hcs, fr2, subset, build_tree_one_node);
            H2O.submitTask(sb1ts[k]);
        }
    }
    // Wait for every submitted build and note whether any of them split.
    boolean anySplit = false;
    for (int k = 0; k < _nclass; k++) {
        if (ktrees[k] == null) {
            continue;
        }
        sb1ts[k].join();
        anySplit |= ((ScoreBuildOneTree) sb1ts[k])._did_split;
    }
    // The layer is done; a null result tells the caller nothing split.
    if (!anySplit) {
        return null;
    }
    return hcs;
}
Also used : Frame(water.fvec.Frame) Vec(water.fvec.Vec) H2OCountedCompleter(water.H2O.H2OCountedCompleter)

Aggregations

H2OCountedCompleter (water.H2O.H2OCountedCompleter): 14, CountedCompleter (jsr166y.CountedCompleter): 4, GLMIterationTask (hex.glm.GLMTask.GLMIterationTask): 2, H2OCallback (water.H2O.H2OCallback): 2, H2OEmptyCompleter (water.H2O.H2OEmptyCompleter): 2, Layer (hex.Layer): 1, GLMParameters (hex.glm.GLMModel.GLMParameters): 1, GLMWeightsFun (hex.glm.GLMModel.GLMWeightsFun): 1, Submodel (hex.glm.GLMModel.Submodel): 1, YMUTask (hex.glm.GLMTask.YMUTask): 1, Frame (water.fvec.Frame): 1, Vec (water.fvec.Vec): 1, VectorGroup (water.fvec.Vec.VectorGroup): 1, BufferedString (water.parser.BufferedString): 1, ExpectedExceptionForDebug (water.util.Utils.ExpectedExceptionForDebug): 1