Search in sources :

Example 61 with NodeState

use of com.yahoo.vdslib.state.NodeState in project vespa by vespa-engine.

the class RpcServer method handleRpcRequests.

/**
 * Ensures the RPC server is connected and resolves queued RPC requests.
 *
 * <p>If the server is not connected, one connect attempt is made first; a failure is logged at
 * warning level unless the same message was already logged within the last 60 seconds. After
 * that, at most 10 requests are taken from {@code rpcRequests} (while holding {@code monitor})
 * and answered. {@code getMaster} is always answered; any other method is refused with an
 * {@link IllegalStateException} when {@code masterHandler.isMaster()} is false. Any exception
 * thrown while resolving a request is reported back on that request as
 * {@code ErrorCode.METHOD_FAILED}.
 *
 * @param cluster        cluster whose node info is read and, for {@code setNodeState}, updated
 * @param systemState    cluster state used to answer {@code getSystemState} and {@code getNodeState}
 * @param changeListener notified through {@code handleNewWantedNodeState} when a wanted state changes
 * @param addedListener  not used by this method
 * @return true if at least one request was taken off the queue
 */
public boolean handleRpcRequests(ContentCluster cluster, ClusterState systemState, NodeStateOrHostInfoChangeHandler changeListener, NodeAddedOrRemovedListener addedListener) {
    boolean handledAnyRequests = false;
    if (!isConnected()) {
        long time = timer.getCurrentTimeInMillis();
        try {
            connect();
        } catch (ListenFailedException e) {
            if (shouldLogConnectError(e.getMessage(), time)) {
                log.log(LogLevel.WARNING, "Failed to bind RPC server to port " + port + ": " + e.getMessage());
            }
        } catch (Exception e) {
            if (shouldLogConnectError(e.getMessage(), time)) {
                log.log(LogLevel.WARNING, "Failed to initailize RPC server socket: " + e.getMessage());
            }
        }
    }
    for (int j = 0; j < 10; ++j) {
        // Perform at most 10 RPC requests per cycle.
        Request req;
        synchronized (monitor) {
            if (rpcRequests.isEmpty())
                break;
            Iterator<Request> it = rpcRequests.iterator();
            req = it.next();
            it.remove();
            handledAnyRequests = true;
        }
        try {
            if (req.methodName().equals("getMaster")) {
                log.log(LogLevel.DEBUG, "Resolving RPC getMaster request");
                Integer master = masterHandler.getMaster();
                String masterReason = masterHandler.getMasterReason();
                req.returnValues().add(new Int32Value(master == null ? -1 : master));
                req.returnValues().add(new StringValue(masterReason == null ? "No reason given" : masterReason));
                req.returnRequest();
                continue;
            }
            // Everything except getMaster is only answered by the master.
            if (!masterHandler.isMaster()) {
                throw new IllegalStateException("Refusing to answer RPC calls as we are not the master fleetcontroller.");
            }
            if (req.methodName().equals("getNodeList")) {
                log.log(LogLevel.DEBUG, "Resolving RPC getNodeList request");
                List<String> slobrok = new ArrayList<>();
                List<String> rpc = new ArrayList<>();
                for (NodeInfo node : cluster.getNodeInfo()) {
                    String s1 = node.getSlobrokAddress();
                    String s2 = node.getRpcAddress();
                    assert (s1 != null);
                    slobrok.add(s1);
                    // A missing RPC address is returned as an empty string.
                    rpc.add(s2 == null ? "" : s2);
                }
                req.returnValues().add(new StringArray(slobrok.toArray(new String[slobrok.size()])));
                req.returnValues().add(new StringArray(rpc.toArray(new String[rpc.size()])));
                req.returnRequest();
            } else if (req.methodName().equals("getSystemState")) {
                log.log(LogLevel.DEBUG, "Resolving RPC getSystemState request");
                req.returnValues().add(new StringValue(""));
                req.returnValues().add(new StringValue(systemState.toString(true)));
                req.returnRequest();
            } else if (req.methodName().equals("getNodeState")) {
                log.log(LogLevel.DEBUG, "Resolving RPC getNodeState request");
                NodeType nodeType = NodeType.get(req.parameters().get(0).asString());
                int nodeIndex = req.parameters().get(1).asInt32();
                Node node = new Node(nodeType, nodeIndex);
                // First return value is current state in system state
                NodeState ns = systemState.getNodeState(node);
                req.returnValues().add(new StringValue(ns.serialize()));
                // Second return value is state node is reporting
                NodeInfo nodeInfo = cluster.getNodeInfo(node);
                if (nodeInfo == null)
                    throw new RuntimeException("No node " + node + " exists in cluster " + cluster.getName());
                NodeState fromNode = nodeInfo.getReportedState();
                req.returnValues().add(new StringValue(fromNode == null ? "unknown" : fromNode.serialize()));
                // Third return value is state node has been requested to be in
                req.returnValues().add(new StringValue(nodeInfo.getWantedState().serialize()));
                // Fourth return value is RPC address of node
                req.returnValues().add(new StringValue(nodeInfo.getRpcAddress() == null ? "" : nodeInfo.getRpcAddress()));
                req.returnRequest();
            } else if (req.methodName().equals("setNodeState")) {
                // The node type and index are the last two '/'-separated parts of the slobrok address.
                String slobrokAddress = req.parameters().get(0).asString();
                int lastSlash = slobrokAddress.lastIndexOf('/');
                int nextButLastSlash = slobrokAddress.lastIndexOf('/', lastSlash - 1);
                if (lastSlash == -1 || nextButLastSlash == -1) {
                    throw new IllegalStateException("Invalid slobrok address '" + slobrokAddress + "'.");
                }
                NodeType nodeType = NodeType.get(slobrokAddress.substring(nextButLastSlash + 1, lastSlash));
                Integer nodeIndex = Integer.valueOf(slobrokAddress.substring(lastSlash + 1));
                NodeInfo node = cluster.getNodeInfo(new Node(nodeType, nodeIndex));
                if (node == null)
                    throw new IllegalStateException("Cannot set wanted state of node " + new Node(nodeType, nodeIndex) + ". Index does not correspond to a configured node.");
                NodeState nodeState = NodeState.deserialize(nodeType, req.parameters().get(1).asString());
                if (nodeState.getDescription().equals("") && !nodeState.getState().equals(State.UP) && !nodeState.getState().equals(State.RETIRED)) {
                    nodeState.setDescription("Set by remote RPC client");
                }
                NodeState oldState = node.getUserWantedState();
                String message = (nodeState.getState().equals(State.UP) ? "Clearing wanted nodeState for node " + node : "New wantedstate '" + nodeState.toString() + "' stored for node " + node);
                if (!oldState.equals(nodeState) || !oldState.getDescription().equals(nodeState.getDescription())) {
                    if (!nodeState.getState().validWantedNodeState(nodeType)) {
                        throw new IllegalStateException("State " + nodeState.getState() + " can not be used as wanted state for node of type " + nodeType);
                    }
                    node.setWantedState(nodeState);
                    changeListener.handleNewWantedNodeState(node, nodeState);
                } else {
                    message = "Node " + node + " already had wanted state " + nodeState.toString();
                    log.log(LogLevel.DEBUG, message);
                }
                req.returnValues().add(new StringValue(message));
                req.returnRequest();
                if (nodeState.getState() == State.UP && node.getPrematureCrashCount() > 0) {
                    log.log(LogLevel.INFO, "Clearing premature crash count of " + node.getPrematureCrashCount() + " as wanted state was set to up");
                    node.setPrematureCrashCount(0);
                }
            }
        } catch (Exception e) {
            if (log.isLoggable(LogLevel.DEBUG)) {
                StringWriter sw = new StringWriter();
                e.printStackTrace(new PrintWriter(sw));
                log.log(LogLevel.DEBUG, "Failed RPC Request: " + sw);
            }
            // Fall back to the exception's string form when it carries no message.
            String errorMsg = e.getMessage();
            if (errorMsg == null) {
                errorMsg = e.toString();
            }
            req.setError(ErrorCode.METHOD_FAILED, errorMsg);
            req.returnRequest();
        }
    }
    return handledAnyRequests;
}

/**
 * Decides whether a connect failure should be logged, and records it when it should.
 *
 * <p>A failure is logged when its message differs from the last recorded one or when more than
 * 60 seconds have passed since the last recorded failure. The comparison is null-safe because
 * an exception message may be null.
 *
 * @param message the failure message, possibly null
 * @param time    the current time in milliseconds
 * @return true if the caller should log this failure
 */
private boolean shouldLogConnectError(String message, long time) {
    if (!java.util.Objects.equals(message, lastConnectError) || time - lastConnectErrorTime > 60 * 1000) {
        lastConnectError = message;
        lastConnectErrorTime = time;
        return true;
    }
    return false;
}
Also used : NodeState(com.yahoo.vdslib.state.NodeState) Node(com.yahoo.vdslib.state.Node) Request(com.yahoo.jrt.Request) ArrayList(java.util.ArrayList) ListenFailedException(com.yahoo.jrt.ListenFailedException) UnknownHostException(java.net.UnknownHostException) ListenFailedException(com.yahoo.jrt.ListenFailedException) StringArray(com.yahoo.jrt.StringArray) StringWriter(java.io.StringWriter) NodeInfo(com.yahoo.vespa.clustercontroller.core.NodeInfo) NodeType(com.yahoo.vdslib.state.NodeType) Int32Value(com.yahoo.jrt.Int32Value) StringValue(com.yahoo.jrt.StringValue) PrintWriter(java.io.PrintWriter)

Example 62 with NodeState

use of com.yahoo.vdslib.state.NodeState in project vespa by vespa-engine.

the class NodeStateGatherer method sendMessages.

/**
 * Sends a state request to every node that has no request pending and is due for
 * another attempt.
 *
 * <p>A node without an RPC address, or whose RPC address is outdated, is not queried.
 * Instead a copy of its reported state is set to DOWN and handed to the listener when
 * the node was stopping, or when it was not already down and its RPC address has been
 * outdated for longer than the grace period. The copy is then stored back on the node
 * together with the current time.
 *
 * @return true if at least one state request was sent
 */
public boolean sendMessages(ContentCluster cluster, Communicator communicator, NodeStateOrHostInfoChangeHandler listener) {
    final long now = timer.getCurrentTimeInMillis();
    boolean sentAnyMessages = false;
    for (NodeInfo node : cluster.getNodeInfo()) {
        Long lastRequestTime = node.getLatestNodeStateRequestTime();
        boolean requestPending = lastRequestTime != null && (now - lastRequestTime < nodeStateRequestTimeoutMS);
        // Skip nodes with a pending request, and nodes for which it is too early to retry.
        if (requestPending || node.getTimeForNextStateRequestAttempt() > now) {
            continue;
        }
        boolean hasUsableRpcAddress = node.getRpcAddress() != null && !node.isRpcAddressOutdated();
        if (hasUsableRpcAddress) {
            communicator.getNodeState(node, waiter);
            sentAnyMessages = true;
            continue;
        }
        // Cannot query state of node without RPC address
        log.log(LogLevel.DEBUG, "Not sending getNodeState request to node " + node.getNode() + ": Not in slobrok");
        NodeState updated = node.getReportedState().clone();
        boolean stopping = updated.getState().equals(State.STOPPING);
        boolean graceExceeded = !updated.getState().equals(State.DOWN)
                && now - node.getRpcAddressOutdatedTimestamp() > maxSlobrokDisconnectGracePeriod;
        // Don't wait for grace period if we expect node to be stopping
        if (graceExceeded || stopping) {
            String why = stopping
                    ? "as node completed stopping."
                    : "as node has been out of slobrok longer than " + maxSlobrokDisconnectGracePeriod + ".";
            log.log(LogLevel.DEBUG, "Setting reported state to DOWN " + why);
            if (updated.getState().oneOf("iur") || !updated.hasDescription()) {
                updated.setDescription("Set node down as it has been out of slobrok for "
                        + (now - node.getRpcAddressOutdatedTimestamp())
                        + " ms which is more than the max limit of "
                        + maxSlobrokDisconnectGracePeriod
                        + " ms.");
            }
            updated.setState(State.DOWN);
            listener.handleNewNodeState(node, updated.clone());
        }
        // Store the (possibly modified) copy back on the node along with the current time.
        node.setReportedState(updated, now);
    }
    return sentAnyMessages;
}
Also used : NodeState(com.yahoo.vdslib.state.NodeState)

Example 63 with NodeState

use of com.yahoo.vdslib.state.NodeState in project vespa by vespa-engine.

the class NodeStateGatherer method handleError.

/**
 * Builds the node state to use after a failed get node state request.
 *
 * <p>The returned state starts out as DOWN and gets a description (and, for return code
 * 75004, the STOPPING state) chosen from the reply's return code. Along the way the
 * failure is either added to the event log or written to the debug log, depending on the
 * node's currently reported state.
 *
 * @param req         the failed request; its reply supplies return code and message
 * @param info        the node the request was sent to
 * @param currentTime timestamp used for events added to the event log
 * @return the new node state, or null when the return code is {@code Communicator.TRANSIENT_ERROR}
 */
private NodeState handleError(GetNodeStateRequest req, NodeInfo info, long currentTime) {
    final String prefix = "Failed get node state request: ";
    NodeState newState = new NodeState(info.getNode().getType(), State.DOWN);
    if (req.getReply().getReturnCode() == ErrorCode.TIMEOUT) {
        String msg = "RPC timeout";
        noteFailureAsInfo(req, info, currentTime, prefix + "RPC timeout talking to node.", msg);
        newState.setDescription(msg);
    } else if (req.getReply().getReturnCode() == ErrorCode.CONNECTION) {
        Target target = info.lastRequestInfoConnection;
        Exception reason = (target == null) ? null : target.getConnectionLostReason();
        if (reason == null) {
            String msg = "Connection error: Unexpected error with no reason set. Assuming it is a network issue: " + req.getReply().getReturnCode() + ": " + req.getReply().getReturnMessage();
            noteFailureAsWarning(req, info, currentTime, prefix + msg, msg);
            newState.setDescription(msg);
        } else {
            String msg = reason.getMessage();
            if (msg == null) {
                msg = "(null)";
            }
            // The raw reason text is the description unless a branch below replaces it.
            newState.setDescription(msg);
            if (msg.equals("Connection refused")) {
                msg = "Connection error: Connection refused";
                noteFailureAsInfo(req, info, currentTime, prefix + msg, msg);
                newState.setState(State.DOWN);
            } else if (msg.equals("jrt: Connection closed by peer") || msg.equals("Connection reset by peer")) {
                msg = "Connection error: Closed at other end. (Node or switch likely shut down)";
                if (info.isRpcAddressOutdated()) {
                    msg += " Node is no longer in slobrok.";
                }
                noteFailureAsInfo(req, info, currentTime, prefix + msg, msg);
                newState.setState(State.DOWN).setDescription(msg);
            } else if (msg.equals("Connection timed out")) {
                // Unlike the other cases, the debug log here is not suppressed by the description.
                if (info.getReportedState().getState().oneOf("ui")) {
                    msg = "Connection error: Timeout";
                    eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(info, prefix + msg, NodeEvent.Type.REPORTED, currentTime), LogLevel.INFO);
                } else {
                    logFailedToTalkToNode(req, info, msg);
                }
            } else {
                msg = "Connection error: " + reason;
                noteFailureAsWarning(req, info, currentTime, prefix + msg, msg);
                newState.setDescription(msg);
            }
        }
    } else if (req.getReply().getReturnCode() == Communicator.TRANSIENT_ERROR) {
        return null;
    } else if (req.getReply().getReturnCode() == ErrorCode.NO_SUCH_METHOD) {
        String msg = "no such RPC method error";
        noteFailureAsWarning(req, info, currentTime, prefix + msg, msg);
        newState.setState(State.DOWN).setDescription(msg + ": get node state");
    } else if (req.getReply().getReturnCode() == 75004) {
        String msg = "Node refused to answer RPC request and is likely stopping: " + req.getReply().getReturnMessage();
        // The node is shutting down and is not accepting requests from anyone
        if (info.getReportedState().getState().equals(State.STOPPING)) {
            log.log(LogLevel.DEBUG, "Failed to get node state from " + info + " because it is still shutting down.");
        } else {
            noteFailureAsInfo(req, info, currentTime, prefix + msg, msg);
        }
        newState.setState(State.STOPPING).setDescription(msg);
    } else {
        String msg = "Got unexpected error, assumed to be node issue " + req.getReply().getReturnCode() + ": " + req.getReply().getReturnMessage();
        noteFailureAsWarning(req, info, currentTime, prefix + msg, msg);
        newState.setState(State.DOWN).setDescription(msg);
    }
    return newState;
}

/**
 * Adds an INFO event with the given text when the node's reported state is one of "ui";
 * otherwise writes a debug log line unless the reported state's description already equals msg.
 */
private void noteFailureAsInfo(GetNodeStateRequest req, NodeInfo info, long currentTime, String eventText, String msg) {
    if (info.getReportedState().getState().oneOf("ui")) {
        eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(info, eventText, NodeEvent.Type.REPORTED, currentTime), LogLevel.INFO);
    } else if (!info.getReportedState().hasDescription() || !info.getReportedState().getDescription().equals(msg)) {
        logFailedToTalkToNode(req, info, msg);
    }
}

/**
 * Adds a WARNING event with the given text when the node's reported state is one of "ui";
 * otherwise writes a debug log line unless the reported state's description already equals msg.
 */
private void noteFailureAsWarning(GetNodeStateRequest req, NodeInfo info, long currentTime, String eventText, String msg) {
    if (info.getReportedState().getState().oneOf("ui")) {
        eventLog.addNodeOnlyEvent(NodeEvent.forBaseline(info, eventText, NodeEvent.Type.REPORTED, currentTime), LogLevel.WARNING);
    } else if (!info.getReportedState().hasDescription() || !info.getReportedState().getDescription().equals(msg)) {
        logFailedToTalkToNode(req, info, msg);
    }
}

/** Writes the debug log line describing a failed request to the given node. */
private void logFailedToTalkToNode(GetNodeStateRequest req, NodeInfo info, String msg) {
    log.log(LogLevel.DEBUG, "Failed to talk to node " + info + ": " + req.getReply().getReturnCode() + " " + req.getReply().getReturnMessage() + ": " + msg);
}
Also used : Target(com.yahoo.jrt.Target) NodeState(com.yahoo.vdslib.state.NodeState)

Example 64 with NodeState

use of com.yahoo.vdslib.state.NodeState in project vespa by vespa-engine.

the class PartitionStateRequest method calculateResult.

/**
 * Builds the response for a single partition.
 *
 * <p>When statistics are among the requested verbose reports, the metrics from the node's
 * host info are filled into the response. The partition's disk state, taken from the
 * current consolidated state, is always added under the key "generated".
 *
 * @param context supplies the cluster and the current consolidated state
 * @return the populated partition response
 */
@Override
public Response.PartitionResponse calculateResult(RemoteClusterControllerTask.Context context) throws StateRestApiException {
    final Response.PartitionResponse response = new Response.PartitionResponse();
    if (verboseReports.contains(VerboseReport.STATISTICS)) {
        fillInMetrics(context.cluster.getNodeInfo(id.getNode()).getHostInfo().getMetrics(), response);
    }
    final NodeState consolidatedNodeState = context.currentConsolidatedState.getNodeState(id.getNode());
    final DiskState partitionState = consolidatedNodeState.getDiskState(id.getPartitionIndex());
    response.addState("generated", new Response.UnitStateImpl(partitionState));
    return response;
}
Also used : Response(com.yahoo.vespa.clustercontroller.core.restapiv2.Response) NodeState(com.yahoo.vdslib.state.NodeState) DiskState(com.yahoo.vdslib.state.DiskState)

Example 65 with NodeState

use of com.yahoo.vdslib.state.NodeState in project vespa by vespa-engine.

the class SetNodeStateRequest method setDistributorWantedState.

/**
 * Sets the distributor's wanted state to match a storage node wanted state of
 * newStorageWantedState at the same index.
 *
 * <p>MAINTENANCE on the storage node maps to DOWN on the distributor and RETIRED maps to UP.
 * Any other state is used as is, provided it is a valid wanted state for a distributor. The
 * description is copied from newStorageWantedState. Nothing is changed when the distributor's
 * user wanted state already has the same state and description.
 *
 * @throws IllegalStateException if there is no distributor at the index, or the state cannot
 *                               be used as a distributor wanted state
 */
private static void setDistributorWantedState(ContentCluster cluster, int index, NodeState newStorageWantedState, NodeStateOrHostInfoChangeHandler stateListener) {
    final Node distributor = new Node(NodeType.DISTRIBUTOR, index);
    final NodeInfo distributorInfo = cluster.getNodeInfo(distributor);
    if (distributorInfo == null) {
        throw new IllegalStateException("Missing distributor at index " + distributor.getIndex());
    }

    final State storageState = newStorageWantedState.getState();
    final State distributorState;
    if (storageState == State.MAINTENANCE) {
        distributorState = State.DOWN;
    } else if (storageState == State.RETIRED) {
        distributorState = State.UP;
    } else {
        if (!storageState.validWantedNodeState(distributor.getType())) {
            throw new IllegalStateException("Distributor cannot be set to wanted state " + storageState);
        }
        distributorState = storageState;
    }

    final NodeState wanted = new NodeState(distributor.getType(), distributorState);
    wanted.setDescription(newStorageWantedState.getDescription());

    final NodeState current = distributorInfo.getUserWantedState();
    boolean unchanged = wanted.getState() == current.getState()
            && Objects.equals(wanted.getDescription(), current.getDescription());
    if (unchanged) {
        return;
    }
    setNewWantedState(distributorInfo, wanted, stateListener);
}
Also used : NodeState(com.yahoo.vdslib.state.NodeState) NodeInfo(com.yahoo.vespa.clustercontroller.core.NodeInfo) UnitState(com.yahoo.vespa.clustercontroller.utils.staterestapi.response.UnitState) ClusterState(com.yahoo.vdslib.state.ClusterState) NodeState(com.yahoo.vdslib.state.NodeState) State(com.yahoo.vdslib.state.State) Node(com.yahoo.vdslib.state.Node)

Aggregations

NodeState (com.yahoo.vdslib.state.NodeState)68 Node (com.yahoo.vdslib.state.Node)31 Test (org.junit.Test)30 ConfiguredNode (com.yahoo.vdslib.distribution.ConfiguredNode)21 ClusterState (com.yahoo.vdslib.state.ClusterState)11 NodeInfo (com.yahoo.vespa.clustercontroller.core.NodeInfo)6 Request (com.yahoo.jrt.Request)5 Target (com.yahoo.jrt.Target)5 State (com.yahoo.vdslib.state.State)5 ClusterFixture.storageNode (com.yahoo.vespa.clustercontroller.core.ClusterFixture.storageNode)5 HasStateReasonForNode.hasStateReasonForNode (com.yahoo.vespa.clustercontroller.core.matchers.HasStateReasonForNode.hasStateReasonForNode)5 Spec (com.yahoo.jrt.Spec)4 StringValue (com.yahoo.jrt.StringValue)4 Supervisor (com.yahoo.jrt.Supervisor)4 Transport (com.yahoo.jrt.Transport)4 DiskState (com.yahoo.vdslib.state.DiskState)4 ArrayList (java.util.ArrayList)4 PrintWriter (java.io.PrintWriter)3 StringWriter (java.io.StringWriter)3 HashSet (java.util.HashSet)3