
Example 1 with H2OCountedCompleter

Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

From class GLM2, method checkKKTAndComplete.

protected void checkKKTAndComplete(final CountedCompleter cc, final GLMIterationTask glmt, final double[] newBeta, final boolean failedLineSearch) {
    H2OCountedCompleter cmp = (H2OCountedCompleter) cc;
    final double[] fullBeta = newBeta == null ? MemoryManager.malloc8d(_srcDinfo.fullN() + _intercept - _noffsets) : expandVec(newBeta, _activeCols);
    // now we need full gradient (on all columns) using this beta
    new GLMIterationTask(_noffsets, GLM2.this.self(), _srcDinfo, _glm, false, true, true, fullBeta, _ymu, 1.0 / _nobs, thresholds, new H2OCallback<GLMIterationTask>(cmp) {

        @Override
        public String toString() {
            return "checkKKTAndComplete.Callback, completer = " + getCompleter() == null ? "null" : getCompleter().toString();
        }

        @Override
        public void callback(final GLMIterationTask glmt2) {
            // first check KKT conditions!
            final double[] grad = glmt2.gradient(alpha[0], _currentLambda);
            if (Utils.hasNaNsOrInfs(grad)) {
                _failedLineSearch = true;
            // TODO: add warning and break the lambda search? Or throw Exception?
            }
            glmt._val = glmt2._val;
            _lastResult = makeIterationInfo(_iter, glmt2, null, glmt2.gradient(alpha[0], 0));
            // check the KKT conditions and filter data for next lambda_value
            // check the gradient
            double[] subgrad = grad.clone();
            ADMMSolver.subgrad(alpha[0], _currentLambda, fullBeta, subgrad);
            double grad_eps = GLM_GRAD_EPS;
            if (!failedLineSearch && _activeCols != null) {
                for (int c = 0; c < _activeCols.length - _noffsets; ++c) {
                    if (subgrad[_activeCols[c]] > grad_eps)
                        grad_eps = subgrad[_activeCols[c]];
                    else if (subgrad[_activeCols[c]] < -grad_eps)
                        grad_eps = -subgrad[_activeCols[c]];
                }
                int[] failedCols = new int[64];
                int fcnt = 0;
                for (int i = 0; i < grad.length - 1; ++i) {
                    if (Arrays.binarySearch(_activeCols, i) >= 0)
                        continue;
                    if (subgrad[i] > grad_eps || -subgrad[i] > grad_eps) {
                        if (fcnt == failedCols.length)
                            failedCols = Arrays.copyOf(failedCols, failedCols.length << 1);
                        failedCols[fcnt++] = i;
                    }
                }
                if (fcnt > 0) {
                    final int n = _activeCols.length;
                    final int[] oldActiveCols = _activeCols;
                    _activeCols = Arrays.copyOf(_activeCols, _activeCols.length + fcnt);
                    for (int i = 0; i < fcnt; ++i) _activeCols[n + i] = failedCols[i];
                    Arrays.sort(_activeCols);
                    LogInfo(fcnt + " variables failed KKT conditions check! Adding them to the model and continuing computation. (grad_eps = " + grad_eps + ", activeCols = " + (_activeCols.length > 100 ? "lost" : Arrays.toString(_activeCols)) + ")");
                    _activeData = _srcDinfo.filterExpandedColumns(_activeCols);
                    // NOTE: tricky completer game here.
                    // We expect a pending count of 0 in this method, since it is the end point.
                    // (Actually this is racy: the count can briefly be 1 with a pending decrement
                    // from the original Iteration callback, but the end result is still 0.)
                    // Iteration, however, expects a pending count of 1, so we must increase it here.
                    // Iteration itself adds 1, but 1 is also subtracted when we leave this method,
                    // because we are inside a callback invoked by onCompletion (unlike at the start
                    // of a nextLambda call, where we are not inside onCompletion).
                    getCompleter().addToPendingCount(1);
                    new GLMIterationTask(_noffsets, GLM2.this.self(), _activeData, _glm, true, true, true, resizeVec(newBeta, _activeCols, oldActiveCols), _ymu, glmt._reg, thresholds, new Iteration(getCompleter())).asyncExec(_activeData._adaptedFrame);
                    return;
                }
            }
            int diff = MAX_ITERATIONS_PER_LAMBDA - _iter + _iter1;
            if (diff > 0)
                // update progress
                new GLM2_ProgressUpdate(diff).fork(_progressKey);
            GLM2.this.setSubmodel(newBeta, glmt2._val, (H2OCountedCompleter) getCompleter().getCompleter());
            _done = true;
            LogInfo("computation of current lambda done in " + (System.currentTimeMillis() - GLM2.this.start_time) + "ms");
            assert _lastResult._fullGrad != null;
        }
    }).asyncExec(_srcDinfo._adaptedFrame);
}
Also used: H2OCallback (water.H2O.H2OCallback), GLMIterationTask (hex.glm.GLMTask.GLMIterationTask), H2OCountedCompleter (water.H2O.H2OCountedCompleter)
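
The pattern worth noting here is the pending-count bookkeeping: the callback bumps the completer's pending count with addToPendingCount(1) before forking the follow-up GLMIterationTask, so the parent cannot complete until the child finishes. Below is a minimal standalone sketch of that move, using only the H2O completer APIs already visible in this example; ChainSketch, forkFollowUp, and the Runnable argument are illustrative names, not part of h2o-2.

import water.H2O;
import water.H2O.H2OCountedCompleter;

class ChainSketch {

    // Fork a follow-up task from inside a callback without letting the parent
    // completer fire early: bump the pending count first, and let the child's
    // tryComplete() pay it back.
    static void forkFollowUp(final H2OCountedCompleter parent, final Runnable work) {
        // one more child the parent must wait for
        parent.addToPendingCount(1);
        H2O.submitTask(new H2OCountedCompleter(parent) {

            @Override
            public void compute2() {
                // the follow-up work (e.g. the next GLM iteration)
                work.run();
                // decrements the parent's pending count
                tryComplete();
            }
        });
    }
}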

Example 2 with H2OCountedCompleter

Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

From class GLM2, method run.

public void run(boolean doLog, H2OCountedCompleter cmp) {
    if (doLog)
        logStart();
    // just fork off the nfolds+1 tasks and wait for the results
    assert alpha.length == 1;
    start_time = System.currentTimeMillis();
    if (nlambdas == -1)
        nlambdas = 100;
    if (lambda_search && nlambdas <= 1)
        throw new IllegalArgumentException(LogInfo("GLM2: nlambdas must be > 1 when running with lambda search."));
    Futures fs = new Futures();
    Key dst = dest();
    new YMUTask(GLM2.this.self(), _srcDinfo, n_folds, new H2OCallback<YMUTask>(cmp) {

        @Override
        public String toString() {
            return "YMUTask callback. completer = " + getCompleter() != null ? "null" : getCompleter().toString();
        }

        @Override
        public void callback(final YMUTask ymut) {
            if (ymut._ymin == ymut._ymax)
                throw new IllegalArgumentException(LogInfo("GLM2: attempted to run with constant response. Response == " + ymut._ymin + " for all rows in the training set."));
            if (ymut.nobs() == 0)
                throw new IllegalArgumentException(LogInfo("GLM2: got no active rows in the dataset after discarding rows with NAs"));
            _ymu = ymut.ymu();
            _nobs = ymut.nobs();
            if (_glm.family == Family.binomial && prior != -1 && prior != _ymu && !Double.isNaN(prior)) {
                _iceptAdjust = -Math.log(_ymu * (1 - prior) / (prior * (1 - _ymu)));
            } else
                prior = _ymu;
            H2OCountedCompleter cmp = (H2OCountedCompleter) getCompleter();
            cmp.addToPendingCount(1);
            // public GLMIterationTask(int noff, Key jobKey, DataInfo dinfo, GLMParams glm, boolean computeGram, boolean validate, boolean computeGradient, double [] beta, double ymu, double reg, float [] thresholds, H2OCountedCompleter cmp) {
            new GLMIterationTask(_noffsets, GLM2.this.self(), _srcDinfo, _glm, false, true, true, nullModelBeta(_srcDinfo, _ymu), _ymu, 1.0 / _nobs, thresholds, new H2OCallback<GLMIterationTask>(cmp) {

                @Override
                public String toString() {
                    return "LMAXTask callback. completer = " + (getCompleter() != null ? "NULL" : getCompleter().toString());
                }

                @Override
                public void callback(final GLMIterationTask glmt) {
                    double[] beta = glmt._beta;
                    if (beta_start == null) {
                        beta_start = beta;
                    }
                    _nullDeviance = glmt._val.residualDeviance();
                    _currentLambda = lambda_max = Math.max(Utils.maxValue(glmt._grad), -Utils.minValue(glmt._grad)) / Math.max(1e-3, alpha[0]);
                    _lastResult = makeIterationInfo(0, glmt, null, glmt.gradient(0, 0));
                    GLMModel model = new GLMModel(GLM2.this, dest(), _srcDinfo, _glm, glmt._val, beta_epsilon, alpha[0], lambda_max, _ymu, prior);
                    model.start_training(start_time);
                    if (lambda_search) {
                        assert !Double.isNaN(lambda_max) : LogInfo("running lambda_value search, but lambda_value max is unknown!");
                        model = addLmaxSubmodel(model, glmt._val, beta);
                        if (nlambdas == -1) {
                            lambda = null;
                        } else {
                            if (lambda_min_ratio == -1)
                                lambda_min_ratio = _nobs > 25 * _srcDinfo.fullN() ? 1e-4 : 1e-2;
                            if (nlambdas <= 1)
                                throw new IllegalArgumentException("Number of lambdas must be > 1 when running lambda search.");
                            final double d = Math.pow(lambda_min_ratio, 1.0 / (nlambdas - 1));
                            lambda = new double[nlambdas];
                            lambda[0] = lambda_max;
                            for (int i = 1; i < lambda.length; ++i) lambda[i] = lambda[i - 1] * d;
                            lambda_min = lambda[lambda.length - 1];
                            max_iter = MAX_ITERATIONS_PER_LAMBDA * nlambdas;
                        }
                        _runAllLambdas = false;
                    } else {
                        if (lambda == null || lambda.length == 0)
                            lambda = new double[] { DEFAULT_LAMBDA };
                        int i = 0;
                        while (i < lambda.length && lambda[i] > lambda_max) ++i;
                        if (i == lambda.length)
                            throw new IllegalArgumentException("Given lambda(s) are all > lambda_max = " + lambda_max + ", have nothing to run with. lambda = " + Arrays.toString(lambda));
                        if (i > 0) {
                            model.addWarning("Removed " + i + " lambdas greater than lambda_max.");
                            lambda = Utils.append(new double[] { lambda_max }, Arrays.copyOfRange(lambda, i, lambda.length));
                            addLmaxSubmodel(model, glmt._val, beta);
                        }
                    }
                    model.delete_and_lock(self());
                    lambda_min = lambda[lambda.length - 1];
                    if (n_folds > 1) {
                        final H2OCountedCompleter futures = new H2OEmptyCompleter();
                        final GLM2[] xvals = new GLM2[n_folds + 1];
                        futures.addToPendingCount(xvals.length - 2);
                        for (int i = 0; i < xvals.length; ++i) {
                            xvals[i] = (GLM2) GLM2.this.clone();
                            xvals[i].n_folds = 0;
                            xvals[i].standardize = standardize;
                            xvals[i].family = family;
                            xvals[i].link = link;
                            xvals[i].beta_epsilon = beta_epsilon;
                            xvals[i].max_iter = max_iter;
                            xvals[i].variable_importances = variable_importances;
                            if (i != 0) {
                                xvals[i]._srcDinfo = _srcDinfo.getFold(i - 1, n_folds);
                                xvals[i].destination_key = Key.make(dest().toString() + "_xval_" + i, (byte) 1, Key.HIDDEN_USER_KEY, H2O.SELF);
                                xvals[i]._nobs = ymut.nobs(i - 1);
                                xvals[i]._ymu = ymut.ymu(i - 1);
                                final int fi = i;
                                final double ymu = ymut.ymu(fi - 1);
                                // new GLMIterationTask(offset_cols.length,GLM2.this.self(), _srcDinfo, _glm, false, true, true,nullModelBeta(),_ymu,1.0/_nobs, thresholds, new H2OCallback<GLMIterationTask>(cmp){
                                new GLMIterationTask(_noffsets, self(), xvals[i]._srcDinfo, _glm, false, true, true, nullModelBeta(xvals[fi]._srcDinfo, ymu), ymu, 1.0 / ymut.nobs(fi - 1), thresholds, new H2OCallback<GLMIterationTask>(futures) {

                                    @Override
                                    public String toString() {
                                        return "Xval LMAXTask callback., completer = " + getCompleter() == null ? "null" : getCompleter().toString();
                                    }

                                    @Override
                                    public void callback(GLMIterationTask t) {
                                        xvals[fi].beta_start = t._beta;
                                        xvals[fi]._currentLambda = xvals[fi].lambda_max = Math.max(Utils.maxValue(t._grad), -Utils.minValue(t._grad)) / Math.max(1e-3, alpha[0]);
                                        assert xvals[fi].lambda_max > 0;
                                        xvals[fi]._lastResult = makeIterationInfo(0, t, null, t.gradient(alpha[0], 0));
                                        //.delete_and_lock(self());
                                        GLMModel m = new GLMModel(GLM2.this, xvals[fi].destination_key, xvals[fi]._srcDinfo, _glm, t._val, beta_epsilon, alpha[0], xvals[fi].lambda_max, xvals[fi]._ymu, prior);
                                        m.submodels = new Submodel[] { new Submodel(xvals[fi].lambda_max, t._beta, t._beta, 0, 0, t._beta.length >= sparseCoefThreshold) };
                                        m.submodels[0].validation = t._val;
                                        assert t._val != null;
                                        m.setSubmodelIdx(0);
                                        m.delete_and_lock(self());
                                        if (xvals[fi].lambda_max > lambda_max) {
                                            futures.addToPendingCount(1);
                                            new ParallelGLMs(GLM2.this, new GLM2[] { xvals[fi] }, lambda_max, 1, futures).fork();
                                        }
                                    }
                                }).asyncExec(xvals[i]._srcDinfo._adaptedFrame);
                            }
                        }
                        _xvals = xvals;
                        futures.join();
                    }
                    getCompleter().addToPendingCount(1);
                    nextLambda(nextLambdaValue(), new LambdaIteration(getCompleter()));
                }
            }).asyncExec(_srcDinfo._adaptedFrame);
        }
    }).asyncExec(_srcDinfo._adaptedFrame);
}
Also used: Submodel (hex.glm.GLMModel.Submodel), YMUTask (hex.glm.GLMTask.YMUTask), GLMIterationTask (hex.glm.GLMTask.GLMIterationTask), H2OCallback (water.H2O.H2OCallback), H2OCountedCompleter (water.H2O.H2OCountedCompleter), H2OEmptyCompleter (water.H2O.H2OEmptyCompleter)
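
The cross-validation branch above is a fan-out/join: an H2OEmptyCompleter serves as a barrier, its pending count is primed to one less than the number of forked children, each child's tryComplete() decrements it, and futures.join() blocks until the count drains. A hedged standalone sketch of that barrier, using only the completer APIs shown above (FanOutJoin and the work array are illustrative):

import water.H2O;
import water.H2O.H2OCountedCompleter;
import water.H2O.H2OEmptyCompleter;

class FanOutJoin {

    // Run every item of `work` as its own task and block until all finish.
    static void runAll(final Runnable[] work) {
        if (work.length == 0)
            return;
        final H2OCountedCompleter barrier = new H2OEmptyCompleter();
        // k children need a pending count of k-1: the k-th completion is the
        // one that propagates into the barrier and completes it.
        barrier.addToPendingCount(work.length - 1);
        for (final Runnable w : work) {
            H2O.submitTask(new H2OCountedCompleter(barrier) {

                @Override
                public void compute2() {
                    w.run();
                    // one child done; the barrier fires when the count drains
                    tryComplete();
                }
            });
        }
        // blocks until every child has completed
        barrier.join();
    }
}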

Example 3 with H2OCountedCompleter

Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

From class FrameSplitter, method compute2.

@Override
public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);
    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length + 1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s = 0; s < nsplits; s++) {
        Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
        split.delete_and_lock(jobKey);
        splits[s] = split;
    }
    setPendingCount(1);
    H2O.submitTask(new H2OCountedCompleter(FrameSplitter.this) {

        @Override
        public void compute2() {
            setPendingCount(nsplits);
            for (int s = 0; s < nsplits; s++) {
                new FrameSplitTask(new H2OCountedCompleter(this) { // completer for this sub-task

                    @Override
                    public void compute2() {
                    }

                    @Override
                    public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
                        synchronized (FrameSplitter.this) {
                            // synchronized on this since can be accessed from different workers
                            workersExceptions = workersExceptions != null ? Arrays.copyOf(workersExceptions, workersExceptions.length + 1) : new Throwable[1];
                            workersExceptions[workersExceptions.length - 1] = ex;
                        }
                        // we handled the exception, so perform normal completion
                        tryComplete();
                        return false;
                    }
                }, datasetVecs, ratios, s).asyncExec(splits[s]);
            }
            // complete the computation of nsplits-tasks
            tryComplete();
        }
    });
    // complete the computation of thrown tasks
    tryComplete();
}
Also used: H2OCountedCompleter (water.H2O.H2OCountedCompleter), CountedCompleter (jsr166y.CountedCompleter)
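
The inner completer shows the standard way to survive worker failures: record the Throwable in onExceptionalCompletion, call tryComplete(), and return false so the framework treats the exception as handled and completion still propagates. A standalone sketch of the same idea, assuming a caller-supplied error list in place of FrameSplitter's Throwable[] field (CollectingCompleter and its members are illustrative):

import java.util.List;
import jsr166y.CountedCompleter;
import water.H2O.H2OCountedCompleter;

class CollectingCompleter extends H2OCountedCompleter {

    // shared across workers; access is synchronized on the list itself
    private final List<Throwable> errors;

    CollectingCompleter(H2OCountedCompleter parent, List<Throwable> errors) {
        super(parent);
        this.errors = errors;
    }

    @Override
    public void compute2() {
        // no work of its own; exists only to observe child completion
        tryComplete();
    }

    @Override
    public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
        synchronized (errors) { // different workers can fail concurrently
            errors.add(ex);
        }
        tryComplete(); // proceed with normal completion anyway
        return false;  // exception handled; do not propagate it upward
    }
}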

Example 4 with H2OCountedCompleter

Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

From class NeuralNet, method startTrain.

void startTrain() {
    logStart();
    running = true;
    // Vec[] vecs = Utils.append(_train, response);
    // reChunk(vecs);
    // final Vec[] train = new Vec[vecs.length - 1];
    // System.arraycopy(vecs, 0, train, 0, train.length);
    // final Vec trainResp = classification ? vecs[vecs.length - 1].toEnum() : vecs[vecs.length - 1];
    final Vec[] train = _train;
    final Vec trainResp = classification ? response.toEnum() : response;
    final Layer[] ls = new Layer[hidden.length + 2];
    ls[0] = new VecsInput(train, null);
    for (int i = 0; i < hidden.length; i++) {
        switch(activation) {
            case Tanh:
                ls[i + 1] = new Tanh(hidden[i]);
                break;
            case TanhWithDropout:
                ls[i + 1] = new TanhDropout(hidden[i]);
                break;
            case Rectifier:
                ls[i + 1] = new Rectifier(hidden[i]);
                break;
            case RectifierWithDropout:
                ls[i + 1] = new RectifierDropout(hidden[i]);
                break;
            case Maxout:
                ls[i + 1] = new Maxout(hidden[i]);
                break;
            case MaxoutWithDropout:
                ls[i + 1] = new MaxoutDropout(hidden[i]);
                break;
        }
    }
    if (classification)
        ls[ls.length - 1] = new VecSoftmax(trainResp, null);
    else
        ls[ls.length - 1] = new VecLinear(trainResp, null);
    //copy parameters from NeuralNet, and set previous/input layer links
    for (int i = 0; i < ls.length; i++) ls[i].init(ls, i, this);
    final Key sourceKey = Key.make(input("source"));
    final Frame frame = new Frame(_names, train);
    frame.add(_responseName, trainResp);
    final Errors[] trainErrors0 = new Errors[] { new Errors() };
    final Errors[] validErrors0 = validation == null ? null : new Errors[] { new Errors() };
    NeuralNetModel model = new NeuralNetModel(destination_key, sourceKey, frame, ls, this);
    model.training_errors = trainErrors0;
    model.validation_errors = validErrors0;
    model.delete_and_lock(self());
    final Frame[] adapted = validation == null ? null : model.adapt(validation, false);
    final Trainer trainer;
    final long num_rows = source.numRows();
    if (mode == SingleThread) {
        Log.info("Entering single-threaded execution mode");
        trainer = new Trainer.Direct(ls, epochs, self());
    } else {
        // one node works on the first batch of points serially for improved stability
        if (warmup_samples > 0) {
            Log.info("Training the first " + warmup_samples + " samples in serial for improved stability.");
            Trainer warmup = new Trainer.Direct(ls, (double) warmup_samples / num_rows, self());
            warmup.start();
            warmup.join();
        //TODO: for MapReduce send weights from master VM to all other VMs
        }
        if (mode == SingleNode) {
            Log.info("Entering single-node (multi-threaded Hogwild) execution mode.");
            trainer = new Trainer.Threaded(ls, epochs, self(), -1);
        } else if (mode == MapReduce) {
            if (warmup_samples > 0 && mode == MapReduce) {
                Log.info("Multi-threaded warmup with " + warmup_samples + " samples.");
                Trainer warmup = new Trainer.Threaded(ls, (double) warmup_samples / num_rows, self(), -1);
                warmup.start();
                warmup.join();
            //TODO: for MapReduce send weights from master VM to all other VMs
            }
            Log.info("Entering multi-node (MapReduce + multi-threaded Hogwild) execution mode.");
            trainer = new Trainer.MapReduce(ls, epochs, self());
        } else
            throw new RuntimeException("invalid execution mode.");
    }
    Log.info("Running for " + epochs + " epochs.");
    final NeuralNet nn = this;
    // Use a separate thread for monitoring (blocked most of the time)
    Thread monitor = new Thread() {

        Errors[] trainErrors = trainErrors0, validErrors = validErrors0;

        @Override
        public void run() {
            try {
                Vec[] valid = null;
                Vec validResp = null;
                if (validation != null) {
                    assert adapted != null;
                    final Vec[] vs = adapted[0].vecs();
                    valid = Arrays.copyOf(vs, vs.length - 1);
                    System.arraycopy(adapted[0].vecs(), 0, valid, 0, valid.length);
                    validResp = vs[vs.length - 1];
                }
                //score the model every 2 seconds (or less often, if it takes longer to score)
                final long num_samples_total = (long) (Math.ceil(num_rows * epochs));
                long num = -1, last_eval = runTimeMs();
                do {
                    //time between evaluations
                    final long interval = (long) (score_interval * 1000);
                    long time_taken = runTimeMs() - last_eval;
                    if (num >= 0 && time_taken < interval) {
                        Thread.sleep(interval - time_taken);
                    }
                    last_eval = runTimeMs();
                    num = eval(valid, validResp);
                    if (num >= num_samples_total)
                        break;
                    if (mode != MapReduce) {
                        if (!isRunning(self()) || !running)
                            break;
                    } else {
                        //MapReduce calls cancel() early, we are waiting for running = false
                        if (!running)
                            break;
                    }
                } while (true);
                // remove validation data
                if (adapted != null && adapted[1] != null)
                    adapted[1].delete();
                Log.info("Training finished.");
            } catch (Exception ex) {
                cancel(ex);
            }
        }

        private long eval(Vec[] valid, Vec validResp) {
            long[][] cm = null;
            if (classification) {
                int classes = ls[ls.length - 1].units;
                cm = new long[classes][classes];
            }
            NeuralNetModel model = new NeuralNetModel(destination_key, sourceKey, frame, ls, nn);
            // score model on training set
            Errors e = eval(train, trainResp, score_training, valid == null ? cm : null);
            e.score_training = score_training == 0 ? train[0].length() : score_training;
            trainErrors = Utils.append(trainErrors, e);
            model.unstable |= Double.isNaN(e.mean_square) || Double.isNaN(e.cross_entropy);
            model.training_errors = trainErrors;
            // score model on validation set
            if (valid != null) {
                e = eval(valid, validResp, score_validation, cm);
                e.score_validation = score_validation == 0 ? valid[0].length() : score_validation;
                validErrors = Utils.append(validErrors, e);
                model.unstable |= Double.isNaN(e.mean_square) || Double.isNaN(e.cross_entropy);
            }
            model.validation_errors = validErrors;
            model.confusion_matrix = cm;
            model.update(self());
            // terminate model building if we detect that a model is unstable
            if (model.unstable)
                NeuralNet.running = false;
            return e.training_samples;
        }

        private Errors eval(Vec[] vecs, Vec resp, long n, long[][] cm) {
            Errors e = NeuralNet.eval(ls, vecs, resp, n, cm);
            e.training_samples = trainer.processed();
            e.training_time_ms = runTimeMs();
            return e;
        }
    };
    trainer.start();
    monitor.start();
    trainer.join();
    // Gracefully terminate the job submitted via H2O web API
    if (mode != MapReduce) {
        //tell the monitor thread to finish too
        running = false;
        try {
            monitor.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    } else {
        while (running) {
            //MapReduce will inform us that running = false
            try {
                Thread.sleep(1);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
    // remove this job -> stop H2O interface from refreshing
    H2OCountedCompleter task = _fjtask;
    if (task != null)
        task.tryComplete();
    this.remove();
}
Also used: H2OCountedCompleter (water.H2O.H2OCountedCompleter), Layer (hex.Layer)
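
The monitor thread's scheduling rule is worth isolating: it scores the model roughly every score_interval seconds, sleeps only for the remainder of the interval when the previous evaluation was fast, and skips sleeping entirely when scoring took longer than the interval. A small self-contained sketch of that loop; MonitorLoop, running, and score are illustrative stand-ins for the fields above:

import java.util.concurrent.atomic.AtomicBoolean;

class MonitorLoop {

    // Invoke `score` roughly every intervalMs, sleeping only the remainder of
    // the interval; if scoring itself took longer, re-score immediately.
    static void run(long intervalMs, AtomicBoolean running, Runnable score)
            throws InterruptedException {
        long lastEval = System.currentTimeMillis();
        while (running.get()) {
            long elapsed = System.currentTimeMillis() - lastEval;
            if (elapsed < intervalMs)
                Thread.sleep(intervalMs - elapsed);
            lastEval = System.currentTimeMillis();
            score.run(); // may take longer than the interval; that is fine
        }
    }
}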

Example 5 with H2OCountedCompleter

Use of water.H2O.H2OCountedCompleter in project h2o-2 by h2oai.

From class RPC, method call.

// Make an initial RPC, or re-send a packet.  Always called on 1st send; also
// called on a timeout.
public synchronized RPC<V> call() {
    ++_callCnt;
    // add it to the RPC call.
    if (_dt.getCompleter() != null) {
        CountedCompleter cc = _dt.getCompleter();
        assert cc instanceof H2OCountedCompleter;
        boolean alreadyIn = false;
        if (_fjtasks != null)
            for (H2OCountedCompleter hcc : _fjtasks) if (hcc == cc)
                alreadyIn = true;
        if (!alreadyIn)
            addCompleter((H2OCountedCompleter) cc);
        _dt.setCompleter(null);
    }
    // If running on self, just submit to queues & do locally
    if (_target == H2O.SELF) {
        assert _dt.getCompleter() == null;
        _dt.setCompleter(new H2O.H2OCallback<DTask>() {

            @Override
            public void callback(DTask dt) {
                assert dt == _dt;
                synchronized (RPC.this) {
                    // F/J guarantees this is called once
                    assert !_done;
                    _done = true;
                    RPC.this.notifyAll();
                }
                doAllCompletions();
            }

            @Override
            public boolean onExceptionalCompletion(Throwable ex, CountedCompleter dt) {
                assert dt == _dt;
                synchronized (RPC.this) {
                    // Filter down to 1st exceptional completion
                    if (_done)
                        return true;
                    _dt.setException(ex);
                    // must be set as the last thing before notify; the waiting thread can wake up at any time!
                    _done = true;
                    RPC.this.notifyAll();
                }
                doAllCompletions();
                return true;
            }
        });
        H2O.submitTask(_dt);
        return this;
    }
    // Keep a global record, for awhile
    if (_target != null)
        _target.taskPut(_tasknum, this);
    try {
        // We could be racing timeouts-vs-replies.  Blow off timeout if we have an answer.
        if (isDone()) {
            if (_target != null)
                _target.taskRemove(_tasknum);
            return this;
        }
        // send the basic UDP control packet.
        if (!_sentTcp) {
            // Ship the UDP packet!
            while (true) {
                // Retry loop for broken TCP sends
                AutoBuffer ab = new AutoBuffer(_target);
                try {
                    ab.putTask(UDP.udp.exec, _tasknum).put1(CLIENT_UDP_SEND).put(_dt);
                    boolean t = ab.hasTCP();
                    assert sz_check(ab) : "Resend of " + _dt.getClass() + " changes size from " + _size + " to " + ab.size() + " for task#" + _tasknum;
                    // Then close; send final byte
                    ab.close();
                    // Set after close (and any other possible fail)
                    _sentTcp = t;
                    // Break out of retry loop
                    break;
                } catch (AutoBuffer.AutoBufferException e) {
                    Log.info_no_DKV(Log.Tag.Sys.WATER, "IOException during RPC call: " + e._ioe.getMessage() + ",  AB=" + ab + ", for task#" + _tasknum + ", waiting and retrying...");
                    ab.drainClose();
                    try {
                        Thread.sleep(500);
                    } catch (InterruptedException ignore) {
                    }
                }
            }
        // end of while(true)
        } else {
            // Else it was sent via TCP in a prior attempt, and we've timed out.
            // This means the caller's ACK/answer probably got dropped and we need
            // him to resend it (or else the caller is still processing our
            // request).  Send a UDP reminder - but with the CLIENT_TCP_SEND flag
            // instead of the UDP send, and no DTask (since it previously went via
            // TCP, no need to resend it).
            AutoBuffer ab = new AutoBuffer(_target).putTask(UDP.udp.exec, _tasknum);
            ab.put1(CLIENT_TCP_SEND).close();
        }
        // Double retry until we exceed existing age.  This is the time to delay
        // until we try again.  Note that we come here immediately on creation,
        // so the first doubling happens before anybody does any waiting.  Also
        // note the generous 5sec cap: ping at least every 5 sec.
        _retry += (_retry < 5000) ? _retry : 5000;
        // Put self on the "TBD" list of tasks awaiting Timeout.
        // So: don't really 'forget', but remember me in a little bit.
        UDPTimeOutThread.PENDING.add(this);
        return this;
    } catch (Error t) {
        throw Log.err(t);
    }
}
Also used: H2OCountedCompleter (water.H2O.H2OCountedCompleter), CountedCompleter (jsr166y.CountedCompleter)
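
The retry delay in call() follows the one-liner _retry += (_retry < 5000) ? _retry : 5000: the delay doubles while it is under five seconds, then grows by a flat five seconds per attempt, so retries back off quickly at first but keep happening. A runnable illustration of just that rule (RetryBackoff and its names are ours, not h2o's):

class RetryBackoff {

    // Same rule as RPC.call(): double the delay while under 5s, then grow it
    // by a flat 5s per attempt.
    static long next(long retryMs) {
        return retryMs + (retryMs < 5000 ? retryMs : 5000);
    }

    public static void main(String[] args) {
        long r = 100; // hypothetical initial delay in ms
        for (int i = 0; i < 10; i++) {
            System.out.println("attempt " + i + ": wait " + r + " ms");
            r = next(r); // 100, 200, 400, 800, 1600, 3200, 6400, 11400, ...
        }
    }
}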

Aggregations

H2OCountedCompleter (water.H2O.H2OCountedCompleter): 14
CountedCompleter (jsr166y.CountedCompleter): 4
GLMIterationTask (hex.glm.GLMTask.GLMIterationTask): 2
H2OCallback (water.H2O.H2OCallback): 2
H2OEmptyCompleter (water.H2O.H2OEmptyCompleter): 2
Layer (hex.Layer): 1
GLMParameters (hex.glm.GLMModel.GLMParameters): 1
GLMWeightsFun (hex.glm.GLMModel.GLMWeightsFun): 1
Submodel (hex.glm.GLMModel.Submodel): 1
YMUTask (hex.glm.GLMTask.YMUTask): 1
Frame (water.fvec.Frame): 1
Vec (water.fvec.Vec): 1
VectorGroup (water.fvec.Vec.VectorGroup): 1
BufferedString (water.parser.BufferedString): 1
ExpectedExceptionForDebug (water.util.Utils.ExpectedExceptionForDebug): 1