Search in sources :

Example 6 with KillJobsRequest

use of io.crate.execution.jobs.kill.KillJobsRequest in project crate by crate.

The method broadcastKillToParticipatingNodes of the class NodeDisconnectJobMonitorService.

/**
 * Broadcasts a kill request for all jobs known to this node that involved the given disconnected node.
 * <p>
 * The kill is broadcast if *this* node is the coordinator and a participating node died.
 * The information which nodes are participating is only available on the coordinator, so other nodes
 * can not kill the jobs on their own.
 *
 * <pre>
 *              n1                      n2                  n3
 *               |                      |                   |
 *           startJob 1 (n1,n2,n3)      |                   |
 *               |                      |                   |
 *               |                    *dies*                |
 *               |                                          |
 *           onNodeDisc(n2)                            onNodeDisc(n2)
 *            broadcast kill job1                   does not know which jobs involve n2
 *                  |
 *      kill job1 <-+---------------------------------->  kill job1
 *
 * </pre>
 *
 * @param deadNode the node that disconnected; its id is passed as the excluded node of the broadcast
 */
private void broadcastKillToParticipatingNodes(DiscoveryNode deadNode) {
    List<UUID> affectedJobs = tasksService.getJobIdsByParticipatingNodes(deadNode.getId()).collect(Collectors.toList());
    if (affectedJobs.isEmpty()) {
        // No job tracked here involved the dead node, so there is nothing to kill.
        return;
    }
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("Broadcasting kill for {} jobs because they involved disconnected node={}", affectedJobs.size(), deadNode.getId());
    }
    // The dead node itself is excluded from the broadcast.
    List<String> excludedNodeIds = Collections.singletonList(deadNode.getId());
    KillJobsRequest killRequest = new KillJobsRequest(affectedJobs, User.CRATE_USER.name(), "Participating node=" + deadNode.getName() + " disconnected.");
    killJobsNodeAction.broadcast(killRequest, new ActionListener<>() {

        @Override
        public void onResponse(Long numKilled) {
            // Intentionally empty: the number of killed jobs is not used here.
        }

        @Override
        public void onFailure(Exception e) {
            // Pass the exception along so the cause of the failed broadcast is not lost.
            LOGGER.warn("failed to send kill request to nodes", e);
        }
    }, excludedNodeIds);
}
Also used : KillJobsRequest(io.crate.execution.jobs.kill.KillJobsRequest) UUID(java.util.UUID)

Example 7 with KillJobsRequest

use of io.crate.execution.jobs.kill.KillJobsRequest in project crate by crate.

The method testOnParticipatingNodeDisconnectedKillsJob of the class NodeDisconnectJobMonitorServiceTest.

/**
 * Verifies that a disconnect of a participating node triggers exactly one kill broadcast
 * for the jobs registered in the local {@code TasksService}.
 */
@Test
public void testOnParticipatingNodeDisconnectedKillsJob() throws Exception {
    TasksService tasksService = tasksInstance();
    DiscoveryNode coordinator = newNode("coordinator");
    DiscoveryNode dataNode = newNode("dataNode");
    // first job: coordinated by "coordinator", with "dataNode" as a participating node
    RootTask.Builder builder = tasksService.newBuilder(UUID.randomUUID(), "dummy-user", coordinator.getId(), Arrays.asList(coordinator.getId(), dataNode.getId()));
    builder.addTask(new DummyTask());
    tasksService.createTask(builder);
    // add a second job that is coordinated by the other node to make sure the broadcast logic is run
    // even though there are jobs coordinated by the disconnected node
    builder = tasksService.newBuilder(UUID.randomUUID(), "dummy-user", dataNode.getId(), Collections.emptySet());
    builder.addTask(new DummyTask());
    tasksService.createTask(builder);
    AtomicInteger broadcasts = new AtomicInteger(0);
    // Override broadcast so the test only counts invocations instead of sending anything.
    TransportKillJobsNodeAction killAction = new TransportKillJobsNodeAction(tasksService, clusterService, mock(TransportService.class)) {

        @Override
        public void broadcast(KillJobsRequest request, ActionListener<Long> listener, Collection<String> excludedNodeIds) {
            broadcasts.incrementAndGet();
        }
    };
    NodeDisconnectJobMonitorService monitorService = new NodeDisconnectJobMonitorService(tasksService, new NodeLimits(new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)), mock(TransportService.class), killAction);
    try {
        monitorService.onNodeDisconnected(dataNode, mock(Transport.Connection.class));
        assertThat(broadcasts.get(), is(1));
    } finally {
        // Close the service even when the call or the assertion above fails.
        monitorService.close();
    }
}
Also used : TransportKillJobsNodeAction(io.crate.execution.jobs.kill.TransportKillJobsNodeAction) DiscoveryNode(org.elasticsearch.cluster.node.DiscoveryNode) DummyTask(io.crate.execution.jobs.DummyTask) ClusterSettings(org.elasticsearch.common.settings.ClusterSettings) TasksService(io.crate.execution.jobs.TasksService) RootTask(io.crate.execution.jobs.RootTask) ActionListener(org.elasticsearch.action.ActionListener) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TransportService(org.elasticsearch.transport.TransportService) KillJobsRequest(io.crate.execution.jobs.kill.KillJobsRequest) NodeLimits(io.crate.execution.jobs.NodeLimits) Collection(java.util.Collection) CrateDummyClusterServiceUnitTest(io.crate.test.integration.CrateDummyClusterServiceUnitTest) Test(org.junit.Test)

Aggregations

KillJobsRequest (io.crate.execution.jobs.kill.KillJobsRequest)7 TasksService (io.crate.execution.jobs.TasksService)3 TransportKillJobsNodeAction (io.crate.execution.jobs.kill.TransportKillJobsNodeAction)3 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)3 JobsLogs (io.crate.execution.engine.collect.stats.JobsLogs)2 CrateDummyClusterServiceUnitTest (io.crate.test.integration.CrateDummyClusterServiceUnitTest)2 Collection (java.util.Collection)2 UUID (java.util.UUID)2 ActionListener (org.elasticsearch.action.ActionListener)2 TransportService (org.elasticsearch.transport.TransportService)2 Test (org.junit.Test)2 TimeValue (io.crate.common.unit.TimeValue)1 JobKilledException (io.crate.exceptions.JobKilledException)1 TaskMissing (io.crate.exceptions.TaskMissing)1 RoutedCollectPhase (io.crate.execution.dsl.phases.RoutedCollectPhase)1 DummyTask (io.crate.execution.jobs.DummyTask)1 NodeLimits (io.crate.execution.jobs.NodeLimits)1 RootTask (io.crate.execution.jobs.RootTask)1 TransportJobAction (io.crate.execution.jobs.transport.TransportJobAction)1 Transports (io.crate.execution.support.Transports)1