use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class ClusterFormationFailureHelperTests method testDescriptionAfterDetachCluster.
public void testDescriptionAfterDetachCluster() {
final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), Version.CURRENT);
final ClusterState clusterState = state(localNode, VotingConfiguration.MUST_JOIN_ELECTED_MASTER.getNodeIds().toArray(new String[0]));
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet and this node was detached from its previous cluster, " + "have discovered []; " + "discovery will continue using [] from hosts providers and [" + localNode + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
final TransportAddress otherAddress = buildNewFakeTransportAddress();
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet and this node was detached from its previous cluster, " + "have discovered []; " + "discovery will continue using [" + otherAddress + "] from hosts providers and [" + localNode + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
final DiscoveryNode otherNode = new DiscoveryNode("otherNode", buildNewFakeTransportAddress(), Version.CURRENT);
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet and this node was detached from its previous cluster, " + "have discovered [" + otherNode + "]; " + "discovery will continue using [] from hosts providers and [" + localNode + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
final DiscoveryNode yetAnotherNode = new DiscoveryNode("yetAnotherNode", buildNewFakeTransportAddress(), Version.CURRENT);
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(yetAnotherNode), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet and this node was detached from its previous cluster, " + "have discovered [" + yetAnotherNode + "]; " + "discovery will continue using [] from hosts providers and [" + localNode + "] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class ClusterFormationFailureHelperTests method testDescriptionForBWCState.
public void testDescriptionForBWCState() {
final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(Metadata.builder().version(// check that we use metadata version in case of BWC term 0
42L).coordinationMetadata(CoordinationMetadata.builder().term(Coordinator.ZEN1_BWC_TERM).build()).build()).nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet: have discovered []; discovery will continue using [] from hosts providers " + "and [] from last-known cluster state; node term 15, last-accepted version 42 in term 0"));
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class ClusterFormationFailureHelperTests method testScheduling.
public void testScheduling() {
final long expectedDelayMillis;
final Settings.Builder settingsBuilder = Settings.builder();
if (randomBoolean()) {
expectedDelayMillis = ClusterFormationFailureHelper.DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
} else {
expectedDelayMillis = randomLongBetween(100, 100000);
settingsBuilder.put(ClusterFormationFailureHelper.DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING.getKey(), expectedDelayMillis + "ms");
}
final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(Settings.builder().put(NODE_NAME_SETTING.getKey(), "node").build(), random());
final AtomicLong warningCount = new AtomicLong();
final AtomicLong logLastFailedJoinAttemptWarningCount = new AtomicLong();
final ClusterFormationFailureHelper clusterFormationFailureHelper = new ClusterFormationFailureHelper(settingsBuilder.build(), () -> {
warningCount.incrementAndGet();
return new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info"));
}, deterministicTaskQueue.getThreadPool(), logLastFailedJoinAttemptWarningCount::incrementAndGet);
deterministicTaskQueue.runAllTasks();
assertThat("should not schedule anything yet", warningCount.get(), is(0L));
final long startTimeMillis = deterministicTaskQueue.getCurrentTimeMillis();
clusterFormationFailureHelper.start();
while (warningCount.get() == 0) {
assertTrue(clusterFormationFailureHelper.isRunning());
if (deterministicTaskQueue.hasRunnableTasks()) {
deterministicTaskQueue.runRandomTask();
} else {
deterministicTaskQueue.advanceTime();
}
}
assertThat(warningCount.get(), is(1L));
assertThat(deterministicTaskQueue.getCurrentTimeMillis() - startTimeMillis, is(expectedDelayMillis));
while (warningCount.get() < 5) {
assertTrue(clusterFormationFailureHelper.isRunning());
if (deterministicTaskQueue.hasRunnableTasks()) {
deterministicTaskQueue.runRandomTask();
} else {
deterministicTaskQueue.advanceTime();
}
}
assertThat(deterministicTaskQueue.getCurrentTimeMillis() - startTimeMillis, equalTo(5 * expectedDelayMillis));
clusterFormationFailureHelper.stop();
assertFalse(clusterFormationFailureHelper.isRunning());
deterministicTaskQueue.runAllTasksInTimeOrder();
assertThat(warningCount.get(), is(5L));
assertThat(logLastFailedJoinAttemptWarningCount.get(), is(5L));
warningCount.set(0);
logLastFailedJoinAttemptWarningCount.set(0);
clusterFormationFailureHelper.start();
clusterFormationFailureHelper.stop();
clusterFormationFailureHelper.start();
final long secondStartTimeMillis = deterministicTaskQueue.getCurrentTimeMillis();
while (warningCount.get() < 5) {
assertTrue(clusterFormationFailureHelper.isRunning());
if (deterministicTaskQueue.hasRunnableTasks()) {
deterministicTaskQueue.runRandomTask();
} else {
deterministicTaskQueue.advanceTime();
}
}
assertThat(deterministicTaskQueue.getCurrentTimeMillis() - secondStartTimeMillis, equalTo(5 * expectedDelayMillis));
clusterFormationFailureHelper.stop();
assertFalse(clusterFormationFailureHelper.isRunning());
deterministicTaskQueue.runAllTasksInTimeOrder();
assertThat(warningCount.get(), is(5L));
assertThat(logLastFailedJoinAttemptWarningCount.get(), is(5L));
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class FsHealthService method getHealth.
@Override
public StatusInfo getHealth() {
StatusInfo statusInfo;
Set<Path> unhealthyPaths = this.unhealthyPaths;
if (enabled == false) {
statusInfo = new StatusInfo(HEALTHY, "health check disabled");
} else if (brokenLock) {
statusInfo = new StatusInfo(UNHEALTHY, "health check failed due to broken node lock");
} else if (checkInProgress.get() && currentTimeMillisSupplier.getAsLong() - lastRunStartTimeMillis.get() > healthyTimeoutThreshold.millis()) {
statusInfo = new StatusInfo(UNHEALTHY, "healthy threshold breached");
} else if (unhealthyPaths == null) {
statusInfo = new StatusInfo(HEALTHY, "health check passed");
} else {
String info = "health check failed on [" + unhealthyPaths.stream().map(k -> k.toString()).collect(Collectors.joining(",")) + "]";
statusInfo = new StatusInfo(UNHEALTHY, info);
}
return statusInfo;
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class FsHealthServiceTests method testFailsHealthOnHungIOBeyondHealthyTimeout.
public void testFailsHealthOnHungIOBeyondHealthyTimeout() throws Exception {
long healthyTimeoutThreshold = randomLongBetween(500, 1000);
long refreshInterval = randomLongBetween(500, 1000);
long slowLogThreshold = randomLongBetween(100, 200);
long delayBetweenChecks = 100;
final Settings settings = Settings.builder().put(FsHealthService.HEALTHY_TIMEOUT_SETTING.getKey(), healthyTimeoutThreshold + "ms").put(FsHealthService.REFRESH_INTERVAL_SETTING.getKey(), refreshInterval + "ms").put(FsHealthService.SLOW_PATH_LOGGING_THRESHOLD_SETTING.getKey(), slowLogThreshold + "ms").put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), // we need to verify exact time
0).build();
FileSystem fileSystem = PathUtils.getDefaultFileSystem();
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
FileSystemFsyncHungProvider disruptFileSystemProvider = new FileSystemFsyncHungProvider(fileSystem, testThreadPool);
fileSystem = disruptFileSystemProvider.getFileSystem(null);
PathUtilsForTesting.installMock(fileSystem);
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
try (NodeEnvironment env = newNodeEnvironment()) {
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
logger.info("--> Initial health status prior to the first monitor run");
StatusInfo fsHealth = fsHealthService.getHealth();
assertEquals(HEALTHY, fsHealth.getStatus());
assertEquals("health check passed", fsHealth.getInfo());
logger.info("--> First monitor run");
fsHealthService.new FsHealthMonitor().run();
fsHealth = fsHealthService.getHealth();
assertEquals(HEALTHY, fsHealth.getStatus());
assertEquals("health check passed", fsHealth.getInfo());
logger.info("--> Disrupt file system");
disruptFileSystemProvider.injectIODelay.set(true);
final FsHealthService fsHealthSrvc = new FsHealthService(settings, clusterSettings, testThreadPool, env);
fsHealthSrvc.doStart();
waitUntil(() -> fsHealthSrvc.getHealth().getStatus() == UNHEALTHY, healthyTimeoutThreshold + (2 * refreshInterval), TimeUnit.MILLISECONDS);
fsHealth = fsHealthSrvc.getHealth();
assertEquals(UNHEALTHY, fsHealth.getStatus());
assertEquals("healthy threshold breached", fsHealth.getInfo());
int disruptedPathCount = disruptFileSystemProvider.getInjectedPathCount();
assertThat(disruptedPathCount, equalTo(1));
logger.info("--> Fix file system disruption");
disruptFileSystemProvider.injectIODelay.set(false);
waitUntil(() -> fsHealthSrvc.getHealth().getStatus() == HEALTHY, delayBetweenChecks + (4 * refreshInterval), TimeUnit.MILLISECONDS);
fsHealth = fsHealthSrvc.getHealth();
assertEquals(HEALTHY, fsHealth.getStatus());
assertEquals("health check passed", fsHealth.getInfo());
assertEquals(disruptedPathCount, disruptFileSystemProvider.getInjectedPathCount());
fsHealthSrvc.doStop();
} finally {
PathUtilsForTesting.teardown();
ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
}
}
Aggregations