Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.
From the class DiskThresholdMonitorTests, method testDiskMonitorLogging.
/**
 * Feeds a single-node {@link DiskThresholdMonitor} a sequence of disk-usage snapshots (free bytes out of 100)
 * and checks, via the {@code assert*Message(s)} helpers, which log message pattern is expected for each one.
 * <p>
 * The monitor's clock is a fake: while {@code advanceTime} is {@code true}, every read of the clock jumps
 * forward by the default reroute interval plus one millisecond; while it is {@code false}, the clock stands still.
 * {@code sizeOfRelocatingShards} is overridden to return whatever {@code relocatingShardSizeRef} currently holds.
 */
@TestLogging(value = "org.opensearch.cluster.routing.allocation.DiskThresholdMonitor:INFO", reason = "testing INFO/WARN logging")
public void testDiskMonitorLogging() throws IllegalAccessException {
    final ClusterState singleNodeState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
        .nodes(DiscoveryNodes.builder().add(newNode("node1")))
        .build();
    final AtomicReference<ClusterState> clusterStateRef = new AtomicReference<>(singleNodeState);
    final AtomicBoolean advanceTime = new AtomicBoolean(randomBoolean());

    // Fake clock: only moves when advanceTime is set, and then by (reroute interval + 1ms) per read.
    final AtomicLong fakeClock = new AtomicLong();
    final LongSupplier timeSupplier = () -> {
        if (advanceTime.get()) {
            fakeClock.addAndGet(
                DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(Settings.EMPTY).getMillis() + 1
            );
        }
        final long now = fakeClock.get();
        logger.info("time: [{}]", now);
        return now;
    };

    final AtomicLong relocatingShardSizeRef = new AtomicLong();
    final DiskThresholdMonitor monitor = new DiskThresholdMonitor(
        Settings.EMPTY,
        clusterStateRef::get,
        new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS),
        null,
        timeSupplier,
        (reason, priority, listener) -> listener.onResponse(clusterStateRef.get())
    ) {
        @Override
        protected void updateIndicesReadOnly(Set<String> indicesToMarkReadOnly, ActionListener<Void> listener, boolean readOnly) {
            // No index updates are performed in this test; just complete the listener.
            listener.onResponse(null);
        }

        @Override
        long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, ClusterInfo info, ClusterState reroutedClusterState) {
            return relocatingShardSizeRef.get();
        }
    };

    // Disk-usage snapshots for node1, in the same order (and with the same random draws) as before.
    final ImmutableOpenMap.Builder<String, DiskUsage> okUsages = ImmutableOpenMap.builder();
    okUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(15, 100)));
    final ImmutableOpenMap<String, DiskUsage> allDisksOk = okUsages.build();

    final ImmutableOpenMap.Builder<String, DiskUsage> lowUsages = ImmutableOpenMap.builder();
    lowUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 14)));
    final ImmutableOpenMap<String, DiskUsage> aboveLowWatermark = lowUsages.build();

    final ImmutableOpenMap.Builder<String, DiskUsage> highUsages = ImmutableOpenMap.builder();
    highUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9)));
    final ImmutableOpenMap<String, DiskUsage> aboveHighWatermark = highUsages.build();

    final ImmutableOpenMap.Builder<String, DiskUsage> floodUsages = ImmutableOpenMap.builder();
    floodUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
    final ImmutableOpenMap<String, DiskUsage> aboveFloodStageWatermark = floodUsages.build();

    // Expected message patterns, named once and reused below.
    final String lowExceeded = "low disk watermark [85%] exceeded on * replicas will not be assigned to this node";
    final String highExceededPrefix = "high disk watermark [90%] exceeded on * shards will be relocated away from this node* ";
    final String highStillExceeded = highExceededPrefix
        + "the node is expected to continue to exceed the high disk watermark when these relocations are complete";
    final String highWillRecover = highExceededPrefix
        + "the node is expected to be below the high disk watermark when these relocations are complete";
    final String floodExceeded = "flood stage disk watermark [95%] exceeded on * all indices on this node will be marked read-only";
    final String highCleared = "high disk watermark [90%] no longer exceeded on * but low disk watermark [85%] is still exceeded";
    final String lowCleared = "low disk watermark [85%] no longer exceeded on *";

    assertNoLogging(monitor, allDisksOk);
    assertSingleInfoMessage(monitor, aboveLowWatermark, lowExceeded);

    // With the clock frozen: one reroute and its warning, then further reroutes/messages are delayed.
    advanceTime.set(false);
    assertSingleWarningMessage(monitor, aboveHighWatermark, highStillExceeded);

    advanceTime.set(true);
    assertRepeatedWarningMessages(monitor, aboveHighWatermark, highStillExceeded);

    advanceTime.set(randomBoolean());
    assertRepeatedWarningMessages(monitor, aboveFloodStageWatermark, floodExceeded);

    // A negative relocating-shard size switches the expected message to the "expected to be below" variant at INFO.
    relocatingShardSizeRef.set(-5L);
    advanceTime.set(true);
    assertSingleInfoMessage(monitor, aboveHighWatermark, highWillRecover);

    relocatingShardSizeRef.set(0L);
    // advanceTime is still true here, so this read moves the clock far enough for another reroute.
    timeSupplier.getAsLong();
    // Frozen clock again: one reroute and its warning, subsequent ones delayed.
    advanceTime.set(false);
    assertSingleWarningMessage(monitor, aboveHighWatermark, highStillExceeded);

    advanceTime.set(true);
    assertRepeatedWarningMessages(monitor, aboveHighWatermark, highStillExceeded);

    advanceTime.set(randomBoolean());
    assertSingleInfoMessage(monitor, aboveLowWatermark, highCleared);

    // Dropping below the low disk watermark is only logged on a reroute, so let the clock advance.
    advanceTime.set(true);
    assertSingleInfoMessage(monitor, allDisksOk, lowCleared);

    advanceTime.set(randomBoolean());
    assertRepeatedWarningMessages(monitor, aboveFloodStageWatermark, floodExceeded);
    assertSingleInfoMessage(monitor, allDisksOk, lowCleared);

    advanceTime.set(true);
    assertRepeatedWarningMessages(monitor, aboveHighWatermark, highStillExceeded);
    assertSingleInfoMessage(monitor, allDisksOk, lowCleared);
    assertRepeatedWarningMessages(monitor, aboveFloodStageWatermark, floodExceeded);
    assertSingleInfoMessage(monitor, aboveLowWatermark, highCleared);
}
Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.
From the class DiskThresholdMonitorTests, method testAutoReleaseIndices.
/**
 * Drives {@code DiskThresholdMonitor.onNewInfo} with a series of disk-usage snapshots for a two-node cluster
 * holding indices {@code test_1} and {@code test_2}, and checks which index names are passed to the overridden
 * {@code updateIndicesReadOnly} for blocking ({@code readOnly == true}) or releasing ({@code readOnly == false}).
 * <p>
 * The override records each call in one of two capture slots and asserts that a slot is written at most once
 * between resets. The second half of the test repeats the exercise with a cluster state in which
 * {@code test_2} already carries the read-only-allow-delete block.
 */
public void testAutoReleaseIndices() {
    final AtomicReference<Set<String>> indicesToMarkReadOnly = new AtomicReference<>();
    final AtomicReference<Set<String>> indicesToRelease = new AtomicReference<>();
    // Empties both capture slots; run before every scenario.
    final Runnable resetCaptures = () -> {
        indicesToMarkReadOnly.set(null);
        indicesToRelease.set(null);
    };

    final AllocationService allocation = createAllocationService(
        Settings.builder().put("cluster.routing.allocation.node_concurrent_recoveries", 10).build()
    );
    final Metadata metadata = Metadata.builder()
        .put(IndexMetadata.builder("test_1").settings(settings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(1))
        .put(IndexMetadata.builder("test_2").settings(settings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(1))
        .build();
    final RoutingTable routingTable = RoutingTable.builder()
        .addAsNew(metadata.index("test_1"))
        .addAsNew(metadata.index("test_2"))
        .build();
    final ClusterState clusterState = applyStartedShardsUntilNoChange(
        ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
            .metadata(metadata)
            .routingTable(routingTable)
            .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")))
            .build(),
        allocation
    );
    // 2 indices x 2 shards x (1 primary + 1 replica)
    assertThat(clusterState.getRoutingTable().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(8));

    // A random 0-10 bytes of reserved space on each node's path.
    final ImmutableOpenMap.Builder<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpacesBuilder = ImmutableOpenMap.builder();
    reservedSpacesBuilder.put(
        new ClusterInfo.NodeAndPath("node1", "/foo/bar"),
        new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), between(0, 10)).build()
    );
    reservedSpacesBuilder.put(
        new ClusterInfo.NodeAndPath("node2", "/foo/bar"),
        new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), between(0, 10)).build()
    );
    final ImmutableOpenMap<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpaces = reservedSpacesBuilder.build();

    DiskThresholdMonitor monitor = new DiskThresholdMonitor(
        Settings.EMPTY,
        () -> clusterState,
        new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS),
        null,
        () -> 0L,
        (reason, priority, listener) -> {
            assertNotNull(listener);
            assertThat(priority, equalTo(Priority.HIGH));
            listener.onResponse(clusterState);
        }
    ) {
        @Override
        protected void updateIndicesReadOnly(Set<String> indicesToUpdate, ActionListener<Void> listener, boolean readOnly) {
            // Record the request in the matching slot; the slot must have been empty.
            final AtomicReference<Set<String>> slot = readOnly ? indicesToMarkReadOnly : indicesToRelease;
            assertTrue(slot.compareAndSet(null, indicesToUpdate));
            listener.onResponse(null);
        }
    };

    // Both nodes have only 0-4 of 100 bytes free: both indices are expected to be marked read-only
    resetCaptures.run();
    ImmutableOpenMap.Builder<String, DiskUsage> diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
    diskUsages.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4)));
    monitor.onNewInfo(clusterInfo(diskUsages.build(), reservedSpaces));
    assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indicesToMarkReadOnly.get());
    assertNull(indicesToRelease.get());

    // Reserved space is ignored when applying block
    resetCaptures.run();
    diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 90)));
    diskUsages.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 90)));
    monitor.onNewInfo(clusterInfo(diskUsages.build(), reservedSpaces));
    assertNull(indicesToMarkReadOnly.get());
    assertNull(indicesToRelease.get());

    // Change cluster state so that "test_2" index is blocked (read only)
    final IndexMetadata unblockedTest2 = clusterState.metadata().index("test_2");
    final IndexMetadata indexMetadata = IndexMetadata.builder(unblockedTest2)
        .settings(
            Settings.builder()
                .put(unblockedTest2.getSettings())
                .put(IndexMetadata.INDEX_BLOCKS_READ_ONLY_ALLOW_DELETE_SETTING.getKey(), true)
        )
        .build();
    final ClusterState clusterStateWithBlocks = ClusterState.builder(clusterState)
        .metadata(Metadata.builder(clusterState.metadata()).put(indexMetadata, true).build())
        .blocks(ClusterBlocks.builder().addBlocks(indexMetadata).build())
        .build();
    assertTrue(clusterStateWithBlocks.blocks().indexBlocked(ClusterBlockLevel.WRITE, "test_2"));

    // Same monitor wiring as above, but observing the cluster state that carries the block.
    monitor = new DiskThresholdMonitor(
        Settings.EMPTY,
        () -> clusterStateWithBlocks,
        new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS),
        null,
        () -> 0L,
        (reason, priority, listener) -> {
            assertNotNull(listener);
            assertThat(priority, equalTo(Priority.HIGH));
            listener.onResponse(clusterStateWithBlocks);
        }
    ) {
        @Override
        protected void updateIndicesReadOnly(Set<String> indicesToUpdate, ActionListener<Void> listener, boolean readOnly) {
            final AtomicReference<Set<String>> slot = readOnly ? indicesToMarkReadOnly : indicesToRelease;
            assertTrue(slot.compareAndSet(null, indicesToUpdate));
            listener.onResponse(null);
        }
    };

    // When free disk on any of node1 or node2 goes below 5% flood watermark, then apply index block on indices not having the block
    resetCaptures.run();
    diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 100)));
    diskUsages.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4)));
    monitor.onNewInfo(clusterInfo(diskUsages.build(), reservedSpaces));
    assertThat(indicesToMarkReadOnly.get(), contains("test_1"));
    assertNull(indicesToRelease.get());

    // When free disk on node1 and node2 goes above 10% high watermark then release index block, ignoring reserved space
    resetCaptures.run();
    diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 100)));
    diskUsages.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(10, 100)));
    monitor.onNewInfo(clusterInfo(diskUsages.build(), reservedSpaces));
    assertNull(indicesToMarkReadOnly.get());
    assertThat(indicesToRelease.get(), contains("test_2"));

    // When no usage information is present for node2, we don't release the block
    resetCaptures.run();
    diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
    monitor.onNewInfo(clusterInfo(diskUsages.build()));
    assertThat(indicesToMarkReadOnly.get(), contains("test_1"));
    assertNull(indicesToRelease.get());

    // When disk usage on one node is between the high and flood-stage watermarks, nothing changes
    resetCaptures.run();
    diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9)));
    diskUsages.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 100)));
    if (randomBoolean()) {
        // Optionally include usage for a node that is not part of the cluster state.
        diskUsages.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100)));
    }
    monitor.onNewInfo(clusterInfo(diskUsages.build()));
    assertNull(indicesToMarkReadOnly.get());
    assertNull(indicesToRelease.get());

    // When disk usage on one node is missing and the other is below the high watermark, nothing changes
    resetCaptures.run();
    diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 100)));
    if (randomBoolean()) {
        diskUsages.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100)));
    }
    monitor.onNewInfo(clusterInfo(diskUsages.build()));
    assertNull(indicesToMarkReadOnly.get());
    assertNull(indicesToRelease.get());

    // When disk usage on one node is missing and the other is above the flood-stage watermark, affected indices are blocked
    resetCaptures.run();
    diskUsages = ImmutableOpenMap.builder();
    diskUsages.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4)));
    if (randomBoolean()) {
        diskUsages.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100)));
    }
    monitor.onNewInfo(clusterInfo(diskUsages.build()));
    assertThat(indicesToMarkReadOnly.get(), contains("test_1"));
    assertNull(indicesToRelease.get());
}
Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.
From the class ExpectedShardSizeAllocationTests, method testInitializingHasExpectedSize.
/**
 * Checks that a shard in the INITIALIZING state reports, as its expected shard size, the value supplied by the
 * {@code ClusterInfo} handed to the allocation service.
 * <p>
 * The stubbed {@code ClusterInfo} returns a random size for shard 0 of index {@code test} and {@code null} for
 * anything else. The assertion is made twice: once after the first node is added and again after a second
 * node is added and a further shard starts initializing there.
 */
public void testInitializingHasExpectedSize() {
    final long byteSize = randomIntBetween(0, Integer.MAX_VALUE);
    final AllocationService strategy = createAllocationService(Settings.EMPTY, () -> new ClusterInfo() {
        @Override
        public Long getShardSize(ShardRouting shardRouting) {
            final boolean isShardZeroOfTest = shardRouting.getIndexName().equals("test") && shardRouting.shardId().getId() == 0;
            if (isShardZeroOfTest == false) {
                return null;
            }
            return byteSize;
        }
    });

    logger.info("Building initial routing table");
    final Metadata metadata = Metadata.builder()
        .put(
            IndexMetadata.builder("test")
                .settings(
                    settings(Version.CURRENT).put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
                        .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
                )
        )
        .build();
    final RoutingTable routingTable = RoutingTable.builder().addAsNew(metadata.index("test")).build();
    final ClusterState withoutNodes = ClusterState.builder(
        org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)
    ).metadata(metadata).routingTable(routingTable).build();

    logger.info("Adding one node and performing rerouting");
    final ClusterState withNode1 = ClusterState.builder(withoutNodes).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build();
    final ClusterState firstReroute = strategy.reroute(withNode1, "reroute");
    assertEquals(1, firstReroute.getRoutingNodes().node("node1").numberOfShardsWithState(ShardRoutingState.INITIALIZING));
    assertEquals(
        byteSize,
        firstReroute.getRoutingTable().shardsWithState(ShardRoutingState.INITIALIZING).get(0).getExpectedShardSize()
    );

    logger.info("Start the primary shard");
    final ClusterState primaryStarted = startInitializingShardsAndReroute(strategy, firstReroute);
    assertEquals(1, primaryStarted.getRoutingNodes().node("node1").numberOfShardsWithState(ShardRoutingState.STARTED));
    // With a single node, one shard copy is still unassigned.
    assertEquals(1, primaryStarted.getRoutingNodes().unassigned().size());

    logger.info("Add another one node and reroute");
    final ClusterState withNode2 = ClusterState.builder(primaryStarted)
        .nodes(DiscoveryNodes.builder(primaryStarted.nodes()).add(newNode("node2")))
        .build();
    final ClusterState secondReroute = strategy.reroute(withNode2, "reroute");
    assertEquals(1, secondReroute.getRoutingNodes().node("node2").numberOfShardsWithState(ShardRoutingState.INITIALIZING));
    assertEquals(
        byteSize,
        secondReroute.getRoutingTable().shardsWithState(ShardRoutingState.INITIALIZING).get(0).getExpectedShardSize()
    );
}
Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.
From the class DiskThresholdDeciderTests, method testDiskThresholdWithAbsoluteSizes.
/**
 * Exercises disk-threshold based allocation when the low/high/flood-stage watermarks are configured as
 * absolute byte sizes ("30b", "9b", "5b" initially) rather than percentages.
 * <p>
 * The test allocates one index ("test", 1 primary + 2 replicas, each copy sized 10 bytes) over up to five nodes
 * whose disk usage is faked via {@code DevNullClusterInfo}. It then, step by step, frees space on one node,
 * adds nodes, and raises the watermark values, asserting after each reroute how many shards are
 * INITIALIZING / STARTED / RELOCATING and how many shards each node holds.
 * <p>
 * NOTE(review): the byte watermarks are presumably "minimum free bytes required" - confirm against
 * DiskThresholdSettings.
 */
public void testDiskThresholdWithAbsoluteSizes() {
// Initial watermarks as absolute sizes: low=30b, high=9b, flood stage=5b
Settings diskSettings = Settings.builder().put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true).put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "30b").put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "9b").put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "5b").build();
// Fake disk usages: every node has 100 bytes total; the last argument is the free byte count
ImmutableOpenMap.Builder<String, DiskUsage> usagesBuilder = ImmutableOpenMap.builder();
// 90% used
usagesBuilder.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 10));
// 90% used
usagesBuilder.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 10));
// 40% used
usagesBuilder.put("node3", new DiskUsage("node3", "n3", "/dev/null", 100, 60));
// 20% used
usagesBuilder.put("node4", new DiskUsage("node4", "n4", "/dev/null", 100, 80));
// 15% used
usagesBuilder.put("node5", new DiskUsage("node5", "n5", "/dev/null", 100, 85));
ImmutableOpenMap<String, DiskUsage> usages = usagesBuilder.build();
ImmutableOpenMap.Builder<String, Long> shardSizesBuilder = ImmutableOpenMap.builder();
// 10 bytes each for the primary and replica entries of shard 0
shardSizesBuilder.put("[test][0][p]", 10L);
shardSizesBuilder.put("[test][0][r]", 10L);
ImmutableOpenMap<String, Long> shardSizes = shardSizesBuilder.build();
// The same usage map is passed for both usage arguments of DevNullClusterInfo
final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes);
ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
AllocationDeciders deciders = new AllocationDeciders(new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings))));
// ClusterInfoService stub that logs each call and returns the fixed cluster info
ClusterInfoService cis = () -> {
logger.info("--> calling fake getClusterInfo");
return clusterInfo;
};
AllocationService strategy = new AllocationService(deciders, new TestGatewayAllocator(), new BalancedShardsAllocator(Settings.EMPTY), cis, EmptySnapshotsInfoService.INSTANCE);
Metadata metadata = Metadata.builder().put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(2)).build();
RoutingTable initialRoutingTable = RoutingTable.builder().addAsNew(metadata.index("test")).build();
ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build();
logger.info("--> adding node1 and node2 node");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))).build();
clusterState = strategy.reroute(clusterState, "reroute");
logShardStates(clusterState);
// Primary should initialize, even though both nodes are over the limit
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));
// Work out which of the two nodes received the single initializing shard
String nodeWithPrimary, nodeWithoutPrimary;
if (clusterState.getRoutingNodes().node("node1").size() == 1) {
nodeWithPrimary = "node1";
nodeWithoutPrimary = "node2";
} else {
nodeWithPrimary = "node2";
nodeWithoutPrimary = "node1";
}
logger.info("--> nodeWithPrimary: {}", nodeWithPrimary);
logger.info("--> nodeWithoutPrimary: {}", nodeWithoutPrimary);
// Make node without the primary now habitable to replicas
usagesBuilder = ImmutableOpenMap.builder(usages);
// 65% used
usagesBuilder.put(nodeWithoutPrimary, new DiskUsage(nodeWithoutPrimary, "", "/dev/null", 100, 35));
usages = usagesBuilder.build();
final ClusterInfo clusterInfo2 = new DevNullClusterInfo(usages, usages, shardSizes);
// From here on the stub serves the updated usages
cis = () -> {
logger.info("--> calling fake getClusterInfo");
return clusterInfo2;
};
strategy = new AllocationService(deciders, new TestGatewayAllocator(), new BalancedShardsAllocator(Settings.EMPTY), cis, EmptySnapshotsInfoService.INSTANCE);
clusterState = strategy.reroute(clusterState, "reroute");
logShardStates(clusterState);
// Now the replica should be able to initialize
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(2));
logger.info("--> start the shards (primaries)");
clusterState = startInitializingShardsAndReroute(strategy, clusterState);
logShardStates(clusterState);
// Assert that we're able to start the primary and replica, since they were both initializing
assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2));
// Assert that node1 holds a single shard
assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
// Assert that node2 holds a single shard
assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));
// Assert that one replica is still unassigned
// assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(1));
logger.info("--> adding node3");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build();
clusterState = strategy.reroute(clusterState, "reroute");
logShardStates(clusterState);
// Assert that the replica is initialized now that node3 is available with enough space
assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2));
assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
logger.info("--> start the shards (replicas)");
clusterState = startInitializingShardsAndReroute(strategy, clusterState);
logShardStates(clusterState);
// Assert that all replicas could be started
assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(3));
assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
logger.info("--> changing decider settings");
// Raise the absolute watermarks: low 30b -> 40b, high 9b -> 30b, flood stage 5b -> 20b
// The assertions below expect every shard to stay where it is after the reroute
diskSettings = Settings.builder().put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true).put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "40b").put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "30b").put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "20b").build();
// Rebuild the deciders and the allocation service so the new settings take effect
deciders = new AllocationDeciders(new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings))));
strategy = new AllocationService(deciders, new TestGatewayAllocator(), new BalancedShardsAllocator(Settings.EMPTY), cis, EmptySnapshotsInfoService.INSTANCE);
clusterState = strategy.reroute(clusterState, "reroute");
logShardStates(clusterState);
// Shards remain started
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(3));
assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
logger.info("--> changing settings again");
// Raise the absolute watermarks again: low 40b -> 50b, high 30b -> 40b, flood stage 20b -> 30b
// No shard is expected to move yet; relocation is only asserted once node4 is added below
diskSettings = Settings.builder().put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true).put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "50b").put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "40b").put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "30b").build();
deciders = new AllocationDeciders(new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings))));
strategy = new AllocationService(deciders, new TestGatewayAllocator(), new BalancedShardsAllocator(Settings.EMPTY), cis, EmptySnapshotsInfoService.INSTANCE);
clusterState = strategy.reroute(clusterState, "reroute");
logShardStates(clusterState);
// Shards remain started
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(3));
assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
// Shard hasn't been moved off of node2 yet because there's nowhere for it to go
assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
logger.info("--> adding node4");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4"))).build();
clusterState = strategy.reroute(clusterState, "reroute");
logShardStates(clusterState);
// Two shards remain started
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(2));
// One shard is relocating (the assertions after the next step show it leaves nodeWithPrimary)
assertThat(clusterState.getRoutingNodes().shardsWithState(RELOCATING).size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));
logger.info("--> apply INITIALIZING shards");
clusterState = startInitializingShardsAndReroute(strategy, clusterState);
logShardStates(clusterState);
// primary shard already has been relocated away
assertThat(clusterState.getRoutingNodes().node(nodeWithPrimary).size(), equalTo(0));
// node with increased space still has its shard
assertThat(clusterState.getRoutingNodes().node(nodeWithoutPrimary).size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node4").size(), equalTo(1));
logger.info("--> adding node5");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node5"))).build();
clusterState = strategy.reroute(clusterState, "reroute");
logShardStates(clusterState);
// Two shards remain started
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(2));
// One more shard is relocating now that node5 is available
assertThat(clusterState.getRoutingNodes().shardsWithState(RELOCATING).size(), equalTo(1));
// One shard is initializing as the relocation target
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));
logger.info("--> apply INITIALIZING shards");
clusterState = startInitializingShardsAndReroute(strategy, clusterState);
logger.info("--> final cluster state:");
logShardStates(clusterState);
// Final layout: node1 and node2 hold no shards; node3, node4 and node5 hold one shard each
assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(0));
assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(0));
assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node4").size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().node("node5").size(), equalTo(1));
}
Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.
From the class DiskThresholdDeciderTests, method testCanRemainWithShardRelocatingAway.
public void testCanRemainWithShardRelocatingAway() {
Settings diskSettings = Settings.builder().put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true).put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%").put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%").build();
// We have an index with 2 primary shards each taking 40 bytes. Each node has 100 bytes available
ImmutableOpenMap.Builder<String, DiskUsage> usagesBuilder = ImmutableOpenMap.builder();
// 80% used
usagesBuilder.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 20));
// 0% used
usagesBuilder.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 100));
ImmutableOpenMap<String, DiskUsage> usages = usagesBuilder.build();
ImmutableOpenMap.Builder<String, Long> shardSizesBuilder = ImmutableOpenMap.builder();
shardSizesBuilder.put("[test][0][p]", 40L);
shardSizesBuilder.put("[test][1][p]", 40L);
shardSizesBuilder.put("[foo][0][p]", 10L);
ImmutableOpenMap<String, Long> shardSizes = shardSizesBuilder.build();
final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes);
DiskThresholdDecider diskThresholdDecider = makeDecider(diskSettings);
Metadata metadata = Metadata.builder().put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(0)).put(IndexMetadata.builder("foo").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0)).build();
RoutingTable initialRoutingTable = RoutingTable.builder().addAsNew(metadata.index("test")).addAsNew(metadata.index("foo")).build();
DiscoveryNode discoveryNode1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), CLUSTER_MANAGER_DATA_ROLES, Version.CURRENT);
DiscoveryNode discoveryNode2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), emptyMap(), CLUSTER_MANAGER_DATA_ROLES, Version.CURRENT);
DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().add(discoveryNode1).add(discoveryNode2).build();
ClusterState baseClusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).nodes(discoveryNodes).build();
// Two shards consuming each 80% of disk space while 70% is allowed, so shard 0 isn't allowed here
ShardRouting firstRouting = TestShardRouting.newShardRouting("test", 0, "node1", null, true, ShardRoutingState.STARTED);
ShardRouting secondRouting = TestShardRouting.newShardRouting("test", 1, "node1", null, true, ShardRoutingState.STARTED);
RoutingNode firstRoutingNode = new RoutingNode("node1", discoveryNode1, firstRouting, secondRouting);
RoutingTable.Builder builder = RoutingTable.builder().add(IndexRoutingTable.builder(firstRouting.index()).addIndexShard(new IndexShardRoutingTable.Builder(firstRouting.shardId()).addShard(firstRouting).build()).addIndexShard(new IndexShardRoutingTable.Builder(secondRouting.shardId()).addShard(secondRouting).build()));
ClusterState clusterState = ClusterState.builder(baseClusterState).routingTable(builder.build()).build();
RoutingAllocation routingAllocation = new RoutingAllocation(null, new RoutingNodes(clusterState), clusterState, clusterInfo, null, System.nanoTime());
routingAllocation.debugDecision(true);
Decision decision = diskThresholdDecider.canRemain(firstRouting, firstRoutingNode, routingAllocation);
assertThat(decision.type(), equalTo(Decision.Type.NO));
assertThat(((Decision.Single) decision).getExplanation(), containsString("the shard cannot remain on this node because it is above the high watermark cluster setting " + "[cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30.0%] free disk on node, " + "actual free: [20.0%]"));
// Two shards each consuming 80% of disk space while 70% is allowed, but one is relocating, so shard 0 can stay
firstRouting = TestShardRouting.newShardRouting("test", 0, "node1", null, true, ShardRoutingState.STARTED);
secondRouting = TestShardRouting.newShardRouting("test", 1, "node1", "node2", true, ShardRoutingState.RELOCATING);
ShardRouting fooRouting = TestShardRouting.newShardRouting("foo", 0, null, true, ShardRoutingState.UNASSIGNED);
firstRoutingNode = new RoutingNode("node1", discoveryNode1, firstRouting, secondRouting);
builder = RoutingTable.builder().add(IndexRoutingTable.builder(firstRouting.index()).addIndexShard(new IndexShardRoutingTable.Builder(firstRouting.shardId()).addShard(firstRouting).build()).addIndexShard(new IndexShardRoutingTable.Builder(secondRouting.shardId()).addShard(secondRouting).build()));
clusterState = ClusterState.builder(baseClusterState).routingTable(builder.build()).build();
routingAllocation = new RoutingAllocation(null, new RoutingNodes(clusterState), clusterState, clusterInfo, null, System.nanoTime());
routingAllocation.debugDecision(true);
decision = diskThresholdDecider.canRemain(firstRouting, firstRoutingNode, routingAllocation);
assertThat(decision.type(), equalTo(Decision.Type.YES));
assertEquals("there is enough disk on this node for the shard to remain, free: [60b]", ((Decision.Single) decision).getExplanation());
decision = diskThresholdDecider.canAllocate(fooRouting, firstRoutingNode, routingAllocation);
assertThat(decision.type(), equalTo(Decision.Type.NO));
if (fooRouting.recoverySource().getType() == RecoverySource.Type.EMPTY_STORE) {
assertThat(((Decision.Single) decision).getExplanation(), containsString("the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=70%], using " + "more disk space than the maximum allowed [70.0%], actual free: [20.0%]"));
} else {
assertThat(((Decision.Single) decision).getExplanation(), containsString("the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=60%], using more " + "disk space than the maximum allowed [60.0%], actual free: [20.0%]"));
}
// Creating AllocationService instance and the services it depends on...
ClusterInfoService cis = () -> {
logger.info("--> calling fake getClusterInfo");
return clusterInfo;
};
AllocationDeciders deciders = new AllocationDeciders(new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)), diskThresholdDecider)));
AllocationService strategy = new AllocationService(deciders, new TestGatewayAllocator(), new BalancedShardsAllocator(Settings.EMPTY), cis, EmptySnapshotsInfoService.INSTANCE);
// Ensure that the reroute call doesn't alter the routing table, since one primary (shard 1) is relocating away
// to node2 and therefore we will have sufficient disk space on node1.
ClusterState result = strategy.reroute(clusterState, "reroute");
assertThat(result, equalTo(clusterState));
assertThat(result.routingTable().index("test").getShards().get(0).primaryShard().state(), equalTo(STARTED));
assertThat(result.routingTable().index("test").getShards().get(0).primaryShard().currentNodeId(), equalTo("node1"));
assertThat(result.routingTable().index("test").getShards().get(0).primaryShard().relocatingNodeId(), nullValue());
assertThat(result.routingTable().index("test").getShards().get(1).primaryShard().state(), equalTo(RELOCATING));
assertThat(result.routingTable().index("test").getShards().get(1).primaryShard().currentNodeId(), equalTo("node1"));
assertThat(result.routingTable().index("test").getShards().get(1).primaryShard().relocatingNodeId(), equalTo("node2"));
}
Aggregations