Search in sources:

Example 21 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class ClusterFormationFailureHelperTests method testDescriptionAfterDetachCluster.

/**
 * Checks the description produced when the cluster state is built with the node ids of
 * {@code VotingConfiguration.MUST_JOIN_ELECTED_MASTER}, for four inputs: nothing found, one
 * resolved address, and (twice, with different nodes) one discovered peer.
 */
public void testDescriptionAfterDetachCluster() {
    final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), Version.CURRENT);
    final String[] mustJoinNodeIds = VotingConfiguration.MUST_JOIN_ELECTED_MASTER.getNodeIds().toArray(new String[0]);
    final ClusterState clusterState = state(localNode, mustJoinNodeIds);

    // Case 1: no resolved addresses, no discovered peers.
    final String nothingFoundDescription = new ClusterFormationState(
        Settings.EMPTY,
        clusterState,
        emptyList(),
        emptyList(),
        0L,
        electionStrategy,
        new StatusInfo(HEALTHY, "healthy-info")
    ).getDescription();
    final String nothingFoundExpected = "master not discovered yet and this node was detached from its previous cluster, "
        + "have discovered []; "
        + "discovery will continue using [] from hosts providers and ["
        + localNode
        + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0";
    assertThat(nothingFoundDescription, is(nothingFoundExpected));

    // Case 2: a single resolved address, still no discovered peers.
    final TransportAddress otherAddress = buildNewFakeTransportAddress();
    final String resolvedAddressDescription = new ClusterFormationState(
        Settings.EMPTY,
        clusterState,
        singletonList(otherAddress),
        emptyList(),
        0L,
        electionStrategy,
        new StatusInfo(HEALTHY, "healthy-info")
    ).getDescription();
    final String resolvedAddressExpected = "master not discovered yet and this node was detached from its previous cluster, "
        + "have discovered []; "
        + "discovery will continue using ["
        + otherAddress
        + "] from hosts providers and ["
        + localNode
        + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0";
    assertThat(resolvedAddressDescription, is(resolvedAddressExpected));

    // Case 3: a single discovered peer, no resolved addresses.
    final DiscoveryNode otherNode = new DiscoveryNode("otherNode", buildNewFakeTransportAddress(), Version.CURRENT);
    final String otherNodeDescription = new ClusterFormationState(
        Settings.EMPTY,
        clusterState,
        emptyList(),
        singletonList(otherNode),
        0L,
        electionStrategy,
        new StatusInfo(HEALTHY, "healthy-info")
    ).getDescription();
    final String otherNodeExpected = "master not discovered yet and this node was detached from its previous cluster, "
        + "have discovered ["
        + otherNode
        + "]; "
        + "discovery will continue using [] from hosts providers and ["
        + localNode
        + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0";
    assertThat(otherNodeDescription, is(otherNodeExpected));

    // Case 4: same shape as case 3, with a different discovered peer.
    final DiscoveryNode yetAnotherNode = new DiscoveryNode("yetAnotherNode", buildNewFakeTransportAddress(), Version.CURRENT);
    final String yetAnotherNodeDescription = new ClusterFormationState(
        Settings.EMPTY,
        clusterState,
        emptyList(),
        singletonList(yetAnotherNode),
        0L,
        electionStrategy,
        new StatusInfo(HEALTHY, "healthy-info")
    ).getDescription();
    final String yetAnotherNodeExpected = "master not discovered yet and this node was detached from its previous cluster, "
        + "have discovered ["
        + yetAnotherNode
        + "]; "
        + "discovery will continue using [] from hosts providers and ["
        + localNode
        + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0";
    assertThat(yetAnotherNodeDescription, is(yetAnotherNodeExpected));
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) StatusInfo(org.opensearch.monitor.StatusInfo) TransportAddress(org.opensearch.common.transport.TransportAddress) ClusterFormationState(org.opensearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState)

Example 22 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class ClusterFormationFailureHelperTests method testDescriptionForBWCState.

/**
 * Checks the description for a cluster state whose coordination metadata carries
 * {@code Coordinator.ZEN1_BWC_TERM}: the expected text reports the metadata version (42) as the
 * last-accepted version.
 */
public void testDescriptionForBWCState() {
    final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);

    final CoordinationMetadata bwcCoordinationMetadata = CoordinationMetadata.builder().term(Coordinator.ZEN1_BWC_TERM).build();
    // check that we use metadata version in case of BWC term 0
    final Metadata bwcMetadata = Metadata.builder().version(42L).coordinationMetadata(bwcCoordinationMetadata).build();

    final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT)
        .metadata(bwcMetadata)
        .nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId()))
        .build();

    final String actualDescription = new ClusterFormationState(
        Settings.EMPTY,
        clusterState,
        emptyList(),
        emptyList(),
        15L,
        electionStrategy,
        new StatusInfo(HEALTHY, "healthy-info")
    ).getDescription();
    final String expectedDescription = "master not discovered yet: have discovered []; discovery will continue using [] from hosts providers "
        + "and [] from last-known cluster state; node term 15, last-accepted version 42 in term 0";
    assertThat(actualDescription, is(expectedDescription));
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) StatusInfo(org.opensearch.monitor.StatusInfo) ClusterFormationState(org.opensearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState)

Example 23 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class ClusterFormationFailureHelperTests method testScheduling.

/**
 * Drives a {@link ClusterFormationFailureHelper} on a deterministic task queue and checks that
 * warnings are emitted once per configured delay, that nothing runs before {@code start()} or
 * after {@code stop()}, and that a start/stop/start sequence yields the same cadence.
 */
public void testScheduling() {
    final Settings.Builder settingsBuilder = Settings.builder();
    final long expectedDelayMillis;
    if (randomBoolean()) {
        // Leave the setting unset and expect its default.
        expectedDelayMillis = ClusterFormationFailureHelper.DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
    } else {
        // Configure an explicit random delay.
        expectedDelayMillis = randomLongBetween(100, 100000);
        settingsBuilder.put(
            ClusterFormationFailureHelper.DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING.getKey(),
            expectedDelayMillis + "ms"
        );
    }

    final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
    final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT)
        .nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId()))
        .build();

    final Settings queueSettings = Settings.builder().put(NODE_NAME_SETTING.getKey(), "node").build();
    final DeterministicTaskQueue taskQueue = new DeterministicTaskQueue(queueSettings, random());

    final AtomicLong warningCount = new AtomicLong();
    final AtomicLong joinFailureLogCount = new AtomicLong();

    final ClusterFormationFailureHelper helper = new ClusterFormationFailureHelper(settingsBuilder.build(), () -> {
        warningCount.incrementAndGet();
        return new ClusterFormationState(
            Settings.EMPTY,
            clusterState,
            emptyList(),
            emptyList(),
            0L,
            electionStrategy,
            new StatusInfo(HEALTHY, "healthy-info")
        );
    }, taskQueue.getThreadPool(), joinFailureLogCount::incrementAndGet);

    // One step of the simulation: the helper must still be running, then either execute a
    // runnable task or, if there is none, move the clock forward.
    final Runnable runOneStep = () -> {
        assertTrue(helper.isRunning());
        if (taskQueue.hasRunnableTasks()) {
            taskQueue.runRandomTask();
        } else {
            taskQueue.advanceTime();
        }
    };

    // Before start() nothing must have been scheduled.
    taskQueue.runAllTasks();
    assertThat("should not schedule anything yet", warningCount.get(), is(0L));

    final long startTimeMillis = taskQueue.getCurrentTimeMillis();
    helper.start();

    // First warning arrives exactly one delay after start.
    while (warningCount.get() == 0) {
        runOneStep.run();
    }
    assertThat(warningCount.get(), is(1L));
    assertThat(taskQueue.getCurrentTimeMillis() - startTimeMillis, is(expectedDelayMillis));

    // Fifth warning arrives exactly five delays after start.
    while (warningCount.get() < 5) {
        runOneStep.run();
    }
    assertThat(taskQueue.getCurrentTimeMillis() - startTimeMillis, equalTo(5 * expectedDelayMillis));

    // After stop() draining the queue must not produce further warnings.
    helper.stop();
    assertFalse(helper.isRunning());
    taskQueue.runAllTasksInTimeOrder();
    assertThat(warningCount.get(), is(5L));
    assertThat(joinFailureLogCount.get(), is(5L));

    // Second round: a start/stop/start sequence should behave like a single start.
    warningCount.set(0);
    joinFailureLogCount.set(0);
    helper.start();
    helper.stop();
    helper.start();
    final long secondStartTimeMillis = taskQueue.getCurrentTimeMillis();
    while (warningCount.get() < 5) {
        runOneStep.run();
    }
    assertThat(taskQueue.getCurrentTimeMillis() - secondStartTimeMillis, equalTo(5 * expectedDelayMillis));

    helper.stop();
    assertFalse(helper.isRunning());
    taskQueue.runAllTasksInTimeOrder();
    assertThat(warningCount.get(), is(5L));
    assertThat(joinFailureLogCount.get(), is(5L));
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) AtomicLong(java.util.concurrent.atomic.AtomicLong) StatusInfo(org.opensearch.monitor.StatusInfo) ClusterFormationState(org.opensearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState) Settings(org.opensearch.common.settings.Settings)

Example 24 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class FsHealthService method getHealth.

/**
 * Builds a {@link StatusInfo} describing the current file-system health.
 * <p>
 * The conditions are evaluated in this order and the first one that applies determines the result:
 * health check disabled (healthy), broken node lock (unhealthy), a check in progress for longer
 * than the healthy timeout threshold (unhealthy), no unhealthy paths recorded (healthy), otherwise
 * unhealthy with the comma-separated list of failing paths.
 *
 * @return the status and a short explanatory message
 */
@Override
public StatusInfo getHealth() {
    // Read the field once so the null check and the message below use the same set.
    final Set<Path> failedPaths = this.unhealthyPaths;

    if (enabled == false) {
        return new StatusInfo(HEALTHY, "health check disabled");
    }
    if (brokenLock) {
        return new StatusInfo(UNHEALTHY, "health check failed due to broken node lock");
    }
    // The elapsed time is only computed while a check is actually in progress.
    if (checkInProgress.get()
        && currentTimeMillisSupplier.getAsLong() - lastRunStartTimeMillis.get() > healthyTimeoutThreshold.millis()) {
        return new StatusInfo(UNHEALTHY, "healthy threshold breached");
    }
    if (failedPaths == null) {
        return new StatusInfo(HEALTHY, "health check passed");
    }

    final String joinedPaths = failedPaths.stream().map(Path::toString).collect(Collectors.joining(","));
    return new StatusInfo(UNHEALTHY, "health check failed on [" + joinedPaths + "]");
}
Also used : Path(java.nio.file.Path) StatusInfo(org.opensearch.monitor.StatusInfo)

Example 25 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class FsHealthServiceTests method testFailsHealthOnHungIOBeyondHealthyTimeout.

/**
 * Checks that {@link FsHealthService} reports UNHEALTHY with "healthy threshold breached" once an
 * IO delay is injected into the mocked file system, and returns to HEALTHY with
 * "health check passed" after the delay is removed.
 */
public void testFailsHealthOnHungIOBeyondHealthyTimeout() throws Exception {
    // Randomized timings, all in milliseconds (they are passed to the settings with an "ms" suffix).
    long healthyTimeoutThreshold = randomLongBetween(500, 1000);
    long refreshInterval = randomLongBetween(500, 1000);
    long slowLogThreshold = randomLongBetween(100, 200);
    long delayBetweenChecks = 100;
    final Settings settings = Settings.builder().put(FsHealthService.HEALTHY_TIMEOUT_SETTING.getKey(), healthyTimeoutThreshold + "ms").put(FsHealthService.REFRESH_INTERVAL_SETTING.getKey(), refreshInterval + "ms").put(FsHealthService.SLOW_PATH_LOGGING_THRESHOLD_SETTING.getKey(), slowLogThreshold + "ms").put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), // we need to verify exact time
    0).build();
    // Replace the default file system with one wrapped by the disrupting provider.
    // NOTE(review): presumably the provider stalls fsync calls while injectIODelay is set; its implementation is not visible here.
    FileSystem fileSystem = PathUtils.getDefaultFileSystem();
    TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
    FileSystemFsyncHungProvider disruptFileSystemProvider = new FileSystemFsyncHungProvider(fileSystem, testThreadPool);
    fileSystem = disruptFileSystemProvider.getFileSystem(null);
    PathUtilsForTesting.installMock(fileSystem);
    final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
    try (NodeEnvironment env = newNodeEnvironment()) {
        FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
        // Phase 1: before any monitor run the service must already report healthy.
        logger.info("--> Initial health status prior to the first monitor run");
        StatusInfo fsHealth = fsHealthService.getHealth();
        assertEquals(HEALTHY, fsHealth.getStatus());
        assertEquals("health check passed", fsHealth.getInfo());
        // Phase 2: run the monitor once, directly on this thread, with no delay injected yet.
        logger.info("--> First monitor run");
        fsHealthService.new FsHealthMonitor().run();
        fsHealth = fsHealthService.getHealth();
        assertEquals(HEALTHY, fsHealth.getStatus());
        assertEquals("health check passed", fsHealth.getInfo());
        // Phase 3: enable the injected delay and start a second service instance via doStart().
        logger.info("--> Disrupt file system");
        disruptFileSystemProvider.injectIODelay.set(true);
        final FsHealthService fsHealthSrvc = new FsHealthService(settings, clusterSettings, testThreadPool, env);
        fsHealthSrvc.doStart();
        // Wait up to the healthy timeout plus two refresh intervals for the status to flip.
        // The result of waitUntil is not checked; the assertions below verify the outcome.
        waitUntil(() -> fsHealthSrvc.getHealth().getStatus() == UNHEALTHY, healthyTimeoutThreshold + (2 * refreshInterval), TimeUnit.MILLISECONDS);
        fsHealth = fsHealthSrvc.getHealth();
        assertEquals(UNHEALTHY, fsHealth.getStatus());
        assertEquals("healthy threshold breached", fsHealth.getInfo());
        // Exactly one path is expected to have been hit by the injected delay.
        int disruptedPathCount = disruptFileSystemProvider.getInjectedPathCount();
        assertThat(disruptedPathCount, equalTo(1));
        // Phase 4: remove the delay and wait up to delayBetweenChecks plus four refresh intervals for recovery.
        logger.info("--> Fix file system disruption");
        disruptFileSystemProvider.injectIODelay.set(false);
        waitUntil(() -> fsHealthSrvc.getHealth().getStatus() == HEALTHY, delayBetweenChecks + (4 * refreshInterval), TimeUnit.MILLISECONDS);
        fsHealth = fsHealthSrvc.getHealth();
        assertEquals(HEALTHY, fsHealth.getStatus());
        assertEquals("health check passed", fsHealth.getInfo());
        // The injected-path count must not have grown after the delay was switched off.
        assertEquals(disruptedPathCount, disruptFileSystemProvider.getInjectedPathCount());
        fsHealthSrvc.doStop();
    } finally {
        // Always undo the mock file system and shut the pool down, even when an assertion failed.
        PathUtilsForTesting.teardown();
        ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
    }
}
Also used: ClusterSettings(org.opensearch.common.settings.ClusterSettings) NodeEnvironment(org.opensearch.env.NodeEnvironment) StatusInfo(org.opensearch.monitor.StatusInfo) FileSystem(java.nio.file.FileSystem) TestThreadPool(org.opensearch.threadpool.TestThreadPool) Settings(org.opensearch.common.settings.Settings)

Aggregations

StatusInfo (org.opensearch.monitor.StatusInfo): 45; DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode): 33; Settings (org.opensearch.common.settings.Settings): 18; TransportService (org.opensearch.transport.TransportService): 14; ClusterState (org.opensearch.cluster.ClusterState): 11; AtomicReference (java.util.concurrent.atomic.AtomicReference): 10; TransportRequest (org.opensearch.transport.TransportRequest): 10; AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 9; MockTransport (org.opensearch.test.transport.MockTransport): 9; ClusterFormationState (org.opensearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState): 8; DiscoveryNodes (org.opensearch.cluster.node.DiscoveryNodes): 7; ConnectTransportException (org.opensearch.transport.ConnectTransportException): 6; TransportException (org.opensearch.transport.TransportException): 6; TransportResponse (org.opensearch.transport.TransportResponse): 6; Empty (org.opensearch.transport.TransportResponse.Empty): 6; HashSet (java.util.HashSet): 5; VotingConfiguration (org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration): 5; TransportAddress (org.opensearch.common.transport.TransportAddress): 5; CapturingTransport (org.opensearch.test.transport.CapturingTransport): 5; Set (java.util.Set): 4