Example 31 with ClusterInfo

Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.

Source: class DiskThresholdMonitorTests, method testMarkFloodStageIndicesReadOnly.

public void testMarkFloodStageIndicesReadOnly() {
    AllocationService allocation = createAllocationService(Settings.builder().put("cluster.routing.allocation.node_concurrent_recoveries", 10).build());
    Metadata metadata = Metadata.builder().put(IndexMetadata.builder("test").settings(settings(Version.CURRENT).put("index.routing.allocation.require._id", "node2")).numberOfShards(1).numberOfReplicas(0)).put(IndexMetadata.builder("test_1").settings(settings(Version.CURRENT).put("index.routing.allocation.require._id", "node1")).numberOfShards(1).numberOfReplicas(0)).put(IndexMetadata.builder("test_2").settings(settings(Version.CURRENT).put("index.routing.allocation.require._id", "node1")).numberOfShards(1).numberOfReplicas(0)).build();
    RoutingTable routingTable = RoutingTable.builder().addAsNew(metadata.index("test")).addAsNew(metadata.index("test_1")).addAsNew(metadata.index("test_2")).build();
    final ClusterState clusterState = applyStartedShardsUntilNoChange(ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata).routingTable(routingTable).nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))).build(), allocation);
    AtomicBoolean reroute = new AtomicBoolean(false);
    AtomicReference<Set<String>> indices = new AtomicReference<>();
    AtomicLong currentTime = new AtomicLong();
    DiskThresholdMonitor monitor = new DiskThresholdMonitor(Settings.EMPTY, () -> clusterState, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), null, currentTime::get, (reason, priority, listener) -> {
        assertTrue(reroute.compareAndSet(false, true));
        assertThat(priority, equalTo(Priority.HIGH));
        listener.onResponse(null);
    }) {

        @Override
        protected void updateIndicesReadOnly(Set<String> indicesToMarkReadOnly, ActionListener<Void> listener, boolean readOnly) {
            assertTrue(indices.compareAndSet(null, indicesToMarkReadOnly));
            assertTrue(readOnly);
            listener.onResponse(null);
        }
    };
    ImmutableOpenMap.Builder<String, DiskUsage> builder = ImmutableOpenMap.builder();
    builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
    builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 30));
    monitor.onNewInfo(clusterInfo(builder.build()));
    assertFalse(reroute.get());
    assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indices.get());
    indices.set(null);
    builder = ImmutableOpenMap.builder();
    builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
    builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 5));
    currentTime.addAndGet(randomLongBetween(60001, 120000));
    monitor.onNewInfo(clusterInfo(builder.build()));
    assertTrue(reroute.get());
    assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indices.get());
    IndexMetadata indexMetadata = IndexMetadata.builder(clusterState.metadata().index("test_2")).settings(Settings.builder().put(clusterState.metadata().index("test_2").getSettings()).put(IndexMetadata.INDEX_BLOCKS_READ_ONLY_ALLOW_DELETE_SETTING.getKey(), true)).build();
    // now we mark one index as read-only and assert that we don't mark it as such again
    final ClusterState anotherFinalClusterState = ClusterState.builder(clusterState).metadata(Metadata.builder(clusterState.metadata()).put(clusterState.metadata().index("test"), false).put(clusterState.metadata().index("test_1"), false).put(indexMetadata, true).build()).blocks(ClusterBlocks.builder().addBlocks(indexMetadata).build()).build();
    assertTrue(anotherFinalClusterState.blocks().indexBlocked(ClusterBlockLevel.WRITE, "test_2"));
    monitor = new DiskThresholdMonitor(Settings.EMPTY, () -> anotherFinalClusterState, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), null, currentTime::get, (reason, priority, listener) -> {
        assertTrue(reroute.compareAndSet(false, true));
        assertThat(priority, equalTo(Priority.HIGH));
        listener.onResponse(null);
    }) {

        @Override
        protected void updateIndicesReadOnly(Set<String> indicesToMarkReadOnly, ActionListener<Void> listener, boolean readOnly) {
            assertTrue(indices.compareAndSet(null, indicesToMarkReadOnly));
            assertTrue(readOnly);
            listener.onResponse(null);
        }
    };
    indices.set(null);
    reroute.set(false);
    builder = ImmutableOpenMap.builder();
    builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4));
    builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 5));
    monitor.onNewInfo(clusterInfo(builder.build()));
    assertTrue(reroute.get());
    assertEquals(Collections.singleton("test_1"), indices.get());
}
Also used : DiscoveryNodes(org.opensearch.cluster.node.DiscoveryNodes) ImmutableOpenMap(org.opensearch.common.collect.ImmutableOpenMap) MockLogAppender(org.opensearch.test.MockLogAppender) Arrays(java.util.Arrays) Metadata(org.opensearch.cluster.metadata.Metadata) LongSupplier(java.util.function.LongSupplier) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) Level(org.apache.logging.log4j.Level) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Version(org.opensearch.Version) Priority(org.opensearch.common.Priority) AtomicReference(java.util.concurrent.atomic.AtomicReference) HashSet(java.util.HashSet) ClusterState(org.opensearch.cluster.ClusterState) OpenSearchAllocationTestCase(org.opensearch.cluster.OpenSearchAllocationTestCase) ShardRoutingState(org.opensearch.cluster.routing.ShardRoutingState) ActionListener(org.opensearch.action.ActionListener) DiskUsage(org.opensearch.cluster.DiskUsage) ClusterSettings(org.opensearch.common.settings.ClusterSettings) ClusterBlocks(org.opensearch.cluster.block.ClusterBlocks) ClusterInfo(org.opensearch.cluster.ClusterInfo) ClusterBlockLevel(org.opensearch.cluster.block.ClusterBlockLevel) Set(java.util.Set) Settings(org.opensearch.common.settings.Settings) ShardId(org.opensearch.index.shard.ShardId) TestLogging(org.opensearch.test.junit.annotations.TestLogging) AtomicLong(java.util.concurrent.atomic.AtomicLong) Matchers.contains(org.hamcrest.Matchers.contains) Matchers.equalTo(org.hamcrest.Matchers.equalTo) ClusterName(org.opensearch.cluster.ClusterName) RoutingTable(org.opensearch.cluster.routing.RoutingTable) RoutingNode(org.opensearch.cluster.routing.RoutingNode) LogManager(org.apache.logging.log4j.LogManager) Collections(java.util.Collections)
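
The clusterInfo(...) calls above use a helper of DiskThresholdMonitorTests that is not shown on this page. A minimal sketch of such a helper follows; it assumes the five-argument ClusterInfo constructor (least-available usages, most-available usages, shard sizes, routing-to-data-path, reserved space) available in this version of the codebase and reuses the same disk-usage map for both usage views, so treat it as an illustration rather than the project's exact code. It matches the one- and two-argument call sites in the tests above.

// Sketch only: helper methods intended to live inside DiskThresholdMonitorTests.
// Uses org.opensearch.cluster.ClusterInfo, org.opensearch.cluster.DiskUsage and
// org.opensearch.common.collect.ImmutableOpenMap, all already imported by the test.
private static ClusterInfo clusterInfo(ImmutableOpenMap<String, DiskUsage> diskUsages) {
    // No reserved space by default.
    return clusterInfo(diskUsages, ImmutableOpenMap.of());
}

private static ClusterInfo clusterInfo(
    ImmutableOpenMap<String, DiskUsage> diskUsages,
    ImmutableOpenMap<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpace
) {
    // Assumption: ClusterInfo(leastAvailable, mostAvailable, shardSizes, routingToDataPath, reservedSpace).
    // The monitor only needs disk usages and reserved space, so the other maps stay empty.
    return new ClusterInfo(diskUsages, diskUsages, ImmutableOpenMap.of(), ImmutableOpenMap.of(), reservedSpace);
}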

Example 32 with ClusterInfo

Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.

Source: class DiskThresholdMonitorTests, method testDoesNotSubmitRerouteTaskTooFrequently.

public void testDoesNotSubmitRerouteTaskTooFrequently() {
    final ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))).build();
    AtomicLong currentTime = new AtomicLong();
    AtomicReference<ActionListener<ClusterState>> listenerReference = new AtomicReference<>();
    DiskThresholdMonitor monitor = new DiskThresholdMonitor(Settings.EMPTY, () -> clusterState, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), null, currentTime::get, (reason, priority, listener) -> {
        assertNotNull(listener);
        assertThat(priority, equalTo(Priority.HIGH));
        assertTrue(listenerReference.compareAndSet(null, listener));
    }) {

        @Override
        protected void updateIndicesReadOnly(Set<String> indicesToMarkReadOnly, ActionListener<Void> listener, boolean readOnly) {
            throw new AssertionError("unexpected");
        }
    };
    final ImmutableOpenMap.Builder<String, DiskUsage> allDisksOkBuilder;
    allDisksOkBuilder = ImmutableOpenMap.builder();
    allDisksOkBuilder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 50));
    allDisksOkBuilder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 50));
    final ImmutableOpenMap<String, DiskUsage> allDisksOk = allDisksOkBuilder.build();
    final ImmutableOpenMap.Builder<String, DiskUsage> oneDiskAboveWatermarkBuilder = ImmutableOpenMap.builder();
    oneDiskAboveWatermarkBuilder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9)));
    oneDiskAboveWatermarkBuilder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 50));
    final ImmutableOpenMap<String, DiskUsage> oneDiskAboveWatermark = oneDiskAboveWatermarkBuilder.build();
    // should not reroute when all disks are ok
    currentTime.addAndGet(randomLongBetween(0, 120000));
    monitor.onNewInfo(clusterInfo(allDisksOk));
    assertNull(listenerReference.get());
    // should reroute when one disk goes over the watermark
    currentTime.addAndGet(randomLongBetween(0, 120000));
    monitor.onNewInfo(clusterInfo(oneDiskAboveWatermark));
    assertNotNull(listenerReference.get());
    listenerReference.getAndSet(null).onResponse(clusterState);
    if (randomBoolean()) {
        // should not re-route again within the reroute interval
        currentTime.addAndGet(randomLongBetween(0, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(Settings.EMPTY).millis()));
        monitor.onNewInfo(clusterInfo(allDisksOk));
        assertNull(listenerReference.get());
    }
    // should reroute again when one disk is still over the watermark
    currentTime.addAndGet(randomLongBetween(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(Settings.EMPTY).millis() + 1, 120000));
    monitor.onNewInfo(clusterInfo(oneDiskAboveWatermark));
    assertNotNull(listenerReference.get());
    final ActionListener<ClusterState> rerouteListener1 = listenerReference.getAndSet(null);
    // should not re-route again before reroute has completed
    currentTime.addAndGet(randomLongBetween(0, 120000));
    monitor.onNewInfo(clusterInfo(allDisksOk));
    assertNull(listenerReference.get());
    // complete reroute
    rerouteListener1.onResponse(clusterState);
    if (randomBoolean()) {
        // should not re-route again within the reroute interval
        currentTime.addAndGet(randomLongBetween(0, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(Settings.EMPTY).millis()));
        monitor.onNewInfo(clusterInfo(allDisksOk));
        assertNull(listenerReference.get());
    }
    // should reroute again after the reroute interval
    currentTime.addAndGet(randomLongBetween(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(Settings.EMPTY).millis() + 1, 120000));
    monitor.onNewInfo(clusterInfo(allDisksOk));
    assertNotNull(listenerReference.get());
    listenerReference.getAndSet(null).onResponse(null);
    // should not reroute again when it is not required
    currentTime.addAndGet(randomLongBetween(0, 120000));
    monitor.onNewInfo(clusterInfo(allDisksOk));
    assertNull(listenerReference.get());
    // should reroute again when one disk has reserved space that pushes it over the high watermark
    final ImmutableOpenMap.Builder<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> builder = ImmutableOpenMap.builder(1);
    builder.put(new ClusterInfo.NodeAndPath("node1", "/foo/bar"), new ClusterInfo.ReservedSpace.Builder().add(new ShardId("baz", "quux", 0), between(41, 100)).build());
    final ImmutableOpenMap<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpaces = builder.build();
    currentTime.addAndGet(randomLongBetween(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(Settings.EMPTY).millis() + 1, 120000));
    monitor.onNewInfo(clusterInfo(allDisksOk, reservedSpaces));
    assertNotNull(listenerReference.get());
    listenerReference.getAndSet(null).onResponse(null);
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) ClusterSettings(org.opensearch.common.settings.ClusterSettings) HashSet(java.util.HashSet) Set(java.util.Set) AtomicReference(java.util.concurrent.atomic.AtomicReference) DiskUsage(org.opensearch.cluster.DiskUsage) ImmutableOpenMap(org.opensearch.common.collect.ImmutableOpenMap) ShardId(org.opensearch.index.shard.ShardId) AtomicLong(java.util.concurrent.atomic.AtomicLong) ClusterInfo(org.opensearch.cluster.ClusterInfo) ActionListener(org.opensearch.action.ActionListener)
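
The timing assertions above are driven by DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING. The standalone sketch below shows how that interval can be read from the defaults and overridden; the class name and the 30s value are illustrative assumptions, and the package locations assume a 1.x/2.x-era OpenSearch codebase.

import org.opensearch.cluster.routing.allocation.DiskThresholdSettings;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.unit.TimeValue;

public class RerouteIntervalExample {

    public static void main(String[] args) {
        // Minimum interval between disk-threshold-triggered reroutes, read from the defaults.
        TimeValue defaultInterval = DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING
            .get(Settings.EMPTY);
        System.out.println("default reroute interval: " + defaultInterval);

        // The interval can be overridden through the same setting key (30s is an arbitrary example).
        Settings custom = Settings.builder()
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s")
            .build();
        System.out.println("custom reroute interval: "
            + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(custom));
    }
}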

Example 33 with ClusterInfo

Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.

Source: class IndexShardConstraintDeciderOverlapTests, method testHighWatermarkBreachWithLowShardCount.

/**
 * A high watermark breach blocks new shard allocations on the affected nodes. If the shard count on
 * such nodes is low, this causes the IndexShardPerNodeConstraint to be breached.
 *
 * This test verifies that this does not lead to unassigned shards and that no hot spots appear on
 * eligible nodes.
 */
public void testHighWatermarkBreachWithLowShardCount() {
    setupInitialCluster(3, 15, 10, 1);
    addNodesWithIndexing(1, "high_watermark_node_", 6, 5, 1);
    // Disk threshold settings enabled
    Settings settings = Settings.builder().put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true).put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7).put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8).put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), 0.95).put("cluster.routing.allocation.node_concurrent_recoveries", 1).put("cluster.routing.allocation.cluster_concurrent_recoveries", 1).build();
    // Build Shard size and disk usages
    ImmutableOpenMap.Builder<String, DiskUsage> usagesBuilder = ImmutableOpenMap.builder();
    // 20% used
    usagesBuilder.put("node_0", new DiskUsage("node_0", "node_0", "/dev/null", 100, 80));
    // 45% used
    usagesBuilder.put("node_1", new DiskUsage("node_1", "node_1", "/dev/null", 100, 55));
    // 65% used
    usagesBuilder.put("node_2", new DiskUsage("node_2", "node_2", "/dev/null", 100, 35));
    // 90% used
    usagesBuilder.put("high_watermark_node_0", new DiskUsage("high_watermark_node_0", "high_watermark_node_0", "/dev/null", 100, 10));
    ImmutableOpenMap<String, DiskUsage> usages = usagesBuilder.build();
    ImmutableOpenMap.Builder<String, Long> shardSizesBuilder = ImmutableOpenMap.builder();
    // Each shard is 1 byte
    clusterState.getRoutingTable().allShards().forEach(shard -> shardSizesBuilder.put(shardIdentifierFromRouting(shard), 1L));
    ImmutableOpenMap<String, Long> shardSizes = shardSizesBuilder.build();
    final ImmutableOpenMap<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpace = new ImmutableOpenMap.Builder<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace>().fPut(getNodeAndDevNullPath("node_0"), getReservedSpace()).fPut(getNodeAndDevNullPath("node_1"), getReservedSpace()).fPut(getNodeAndDevNullPath("node_2"), getReservedSpace()).fPut(getNodeAndDevNullPath("high_watermark_node_0"), getReservedSpace()).build();
    final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes, reservedSpace);
    ClusterInfoService cis = () -> clusterInfo;
    allocation = createAllocationService(settings, cis);
    allocateAndCheckIndexShardHotSpots(false, 3, "node_0", "node_1", "node_2");
    assertForIndexShardHotSpots(true, 4);
    assertTrue(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).isEmpty());
    assertTrue(clusterState.getRoutingNodes().node("high_watermark_node_0").isEmpty());
    /* Shard sizes that would breach high watermark on node_2 if allocated. */
    addIndices("big_index_", 1, 10, 0);
    ImmutableOpenMap.Builder<String, Long> bigIndexShardSizeBuilder = ImmutableOpenMap.builder(shardSizes);
    clusterState.getRoutingNodes().unassigned().forEach(shard -> bigIndexShardSizeBuilder.put(shardIdentifierFromRouting(shard), 20L));
    shardSizes = bigIndexShardSizeBuilder.build();
    final ClusterInfo bigIndexClusterInfo = new DevNullClusterInfo(usages, usages, shardSizes, reservedSpace);
    cis = () -> bigIndexClusterInfo;
    allocation = createAllocationService(settings, cis);
    allocateAndCheckIndexShardHotSpots(false, 2, "node_0", "node_1");
    assertForIndexShardHotSpots(true, 4);
    assertTrue(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).isEmpty());
    for (ShardRouting shard : clusterState.getRoutingTable().index("big_index_0").shardsWithState(STARTED)) {
        assertNotEquals("node_2", shard.currentNodeId());
    }
}
Also used : ClusterInfoService(org.opensearch.cluster.ClusterInfoService) DiskUsage(org.opensearch.cluster.DiskUsage) ImmutableOpenMap(org.opensearch.common.collect.ImmutableOpenMap) ClusterInfo(org.opensearch.cluster.ClusterInfo) ShardRouting(org.opensearch.cluster.routing.ShardRouting) Settings(org.opensearch.common.settings.Settings)
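
DevNullClusterInfo above is a test-only ClusterInfo subclass that maps every shard to the "/dev/null" path used in the disk usages. A rough sketch of the idea follows; the exact super-constructor it delegates to (assumed here to be the five-argument one, with a null routing-to-data-path map) is an assumption, and the essential part is the getDataPath override.

// Sketch only: a nested test helper, not necessarily the project's exact implementation.
static class DevNullClusterInfo extends ClusterInfo {

    DevNullClusterInfo(
        ImmutableOpenMap<String, DiskUsage> leastAvailableSpaceUsage,
        ImmutableOpenMap<String, DiskUsage> mostAvailableSpaceUsage,
        ImmutableOpenMap<String, Long> shardSizes,
        ImmutableOpenMap<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpace
    ) {
        // Assumption: ClusterInfo(leastAvailable, mostAvailable, shardSizes, routingToDataPath, reservedSpace);
        // routing-to-data-path is left null because getDataPath is overridden below.
        super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, reservedSpace);
    }

    @Override
    public String getDataPath(ShardRouting shardRouting) {
        // Report the same path for every shard so the "/dev/null" disk usages above apply to all of them.
        return "/dev/null";
    }
}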

Example 34 with ClusterInfo

Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.

Source: class ExpectedShardSizeAllocationTests, method testExpectedSizeOnMove.

public void testExpectedSizeOnMove() {
    final long byteSize = randomIntBetween(0, Integer.MAX_VALUE);
    final AllocationService allocation = createAllocationService(Settings.EMPTY, () -> new ClusterInfo() {

        @Override
        public Long getShardSize(ShardRouting shardRouting) {
            if (shardRouting.getIndexName().equals("test") && shardRouting.shardId().getId() == 0) {
                return byteSize;
            }
            return null;
        }
    });
    logger.info("creating an index with 1 shard, no replica");
    Metadata metadata = Metadata.builder().put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0)).build();
    RoutingTable routingTable = RoutingTable.builder().addAsNew(metadata.index("test")).build();
    ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata).routingTable(routingTable).build();
    logger.info("adding two nodes and performing rerouting");
    clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))).build();
    clusterState = allocation.reroute(clusterState, "reroute");
    logger.info("start primary shard");
    clusterState = startInitializingShardsAndReroute(allocation, clusterState);
    logger.info("move the shard");
    String existingNodeId = clusterState.routingTable().index("test").shard(0).primaryShard().currentNodeId();
    String toNodeId;
    if ("node1".equals(existingNodeId)) {
        toNodeId = "node2";
    } else {
        toNodeId = "node1";
    }
    AllocationService.CommandsResult commandsResult = allocation.reroute(clusterState, new AllocationCommands(new MoveAllocationCommand("test", 0, existingNodeId, toNodeId)), false, false);
    assertThat(commandsResult.getClusterState(), not(equalTo(clusterState)));
    clusterState = commandsResult.getClusterState();
    assertEquals(clusterState.getRoutingNodes().node(existingNodeId).iterator().next().state(), ShardRoutingState.RELOCATING);
    assertEquals(clusterState.getRoutingNodes().node(toNodeId).iterator().next().state(), ShardRoutingState.INITIALIZING);
    assertEquals(clusterState.getRoutingNodes().node(existingNodeId).iterator().next().getExpectedShardSize(), byteSize);
    assertEquals(clusterState.getRoutingNodes().node(toNodeId).iterator().next().getExpectedShardSize(), byteSize);
    logger.info("finish moving the shard");
    clusterState = startInitializingShardsAndReroute(allocation, clusterState);
    assertThat(clusterState.getRoutingNodes().node(existingNodeId).isEmpty(), equalTo(true));
    assertThat(clusterState.getRoutingNodes().node(toNodeId).iterator().next().state(), equalTo(ShardRoutingState.STARTED));
    assertEquals(clusterState.getRoutingNodes().node(toNodeId).iterator().next().getExpectedShardSize(), -1);
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) Metadata(org.opensearch.cluster.metadata.Metadata) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) MoveAllocationCommand(org.opensearch.cluster.routing.allocation.command.MoveAllocationCommand) AllocationCommands(org.opensearch.cluster.routing.allocation.command.AllocationCommands) ClusterInfo(org.opensearch.cluster.ClusterInfo) RoutingTable(org.opensearch.cluster.routing.RoutingTable) ShardRouting(org.opensearch.cluster.routing.ShardRouting)
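
Example 34 overrides getShardSize directly on an anonymous ClusterInfo. An alternative, used by tests like Example 33, is to feed sizes through the shard-sizes map that ClusterInfo is built from. The sketch below is an illustration only: the "[index][shardId][p|r]" key convention and the five-argument constructor are assumptions about this version of the codebase, and the class name is made up for the example.

import org.opensearch.cluster.ClusterInfo;
import org.opensearch.common.collect.ImmutableOpenMap;

public class ShardSizeMapExample {

    // Builds a ClusterInfo that reports a fixed size for the primary of shard 0 of index "test".
    static ClusterInfo clusterInfoWithShardSize(long sizeInBytes) {
        ImmutableOpenMap.Builder<String, Long> shardSizes = ImmutableOpenMap.builder();
        // Assumption: shard-size keys follow the "[index][shardId][p|r]" convention of
        // ClusterInfo.shardIdentifierFromRouting, so "[test][0][p]" denotes the primary of shard 0.
        shardSizes.put("[test][0][p]", sizeInBytes);
        // Assumption: ClusterInfo(leastAvailable, mostAvailable, shardSizes, routingToDataPath, reservedSpace);
        // disk usages are irrelevant for this example, so the other maps stay empty.
        return new ClusterInfo(
            ImmutableOpenMap.of(),
            ImmutableOpenMap.of(),
            shardSizes.build(),
            ImmutableOpenMap.of(),
            ImmutableOpenMap.of()
        );
    }
}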

Example 35 with ClusterInfo

Use of org.opensearch.cluster.ClusterInfo in project OpenSearch by opensearch-project.

Source: class DiskThresholdDecider, method canAllocate.

@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    ClusterInfo clusterInfo = allocation.clusterInfo();
    ImmutableOpenMap<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages();
    final Decision decision = earlyTerminate(allocation, usages);
    if (decision != null) {
        return decision;
    }
    final double usedDiskThresholdLow = 100.0 - diskThresholdSettings.getFreeDiskThresholdLow();
    final double usedDiskThresholdHigh = 100.0 - diskThresholdSettings.getFreeDiskThresholdHigh();
    // subtractLeavingShards is passed as false here, because they still use disk space, and therefore we should be extra careful
    // and take the size into account
    final DiskUsageWithRelocations usage = getDiskUsage(node, allocation, usages, false);
    // First, check whether the node is currently over the low watermark
    double freeDiskPercentage = usage.getFreeDiskAsPercentage();
    // Cache the used disk percentage for displaying disk percentages consistent with documentation
    double usedDiskPercentage = usage.getUsedDiskAsPercentage();
    long freeBytes = usage.getFreeBytes();
    if (freeBytes < 0L) {
        final long sizeOfRelocatingShards = sizeOfRelocatingShards(node, false, usage.getPath(), allocation.clusterInfo(), allocation.metadata(), allocation.routingTable());
        logger.debug("fewer free bytes remaining than the size of all incoming shards: " + "usage {} on node {} including {} bytes of relocations, preventing allocation", usage, node.nodeId(), sizeOfRelocatingShards);
        return allocation.decision(Decision.NO, NAME, "the node has fewer free bytes remaining than the total size of all incoming shards: " + "free space [%sB], relocating shards [%sB]", freeBytes + sizeOfRelocatingShards, sizeOfRelocatingShards);
    }
    ByteSizeValue freeBytesValue = new ByteSizeValue(freeBytes);
    if (logger.isTraceEnabled()) {
        logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage);
    }
    // flag that determines whether the low threshold checks below can be skipped. We use this for a primary shard that is freshly
    // allocated and empty.
    boolean skipLowThresholdChecks = shardRouting.primary() && shardRouting.active() == false && shardRouting.recoverySource().getType() == RecoverySource.Type.EMPTY_STORE;
    // checks for exact byte comparisons
    if (freeBytes < diskThresholdSettings.getFreeBytesThresholdLow().getBytes()) {
        if (skipLowThresholdChecks == false) {
            if (logger.isDebugEnabled()) {
                logger.debug("less than the required {} free bytes threshold ({} free) on node {}, preventing allocation", diskThresholdSettings.getFreeBytesThresholdLow(), freeBytesValue, node.nodeId());
            }
            return allocation.decision(Decision.NO, NAME, "the node is above the low watermark cluster setting [%s=%s], having less than the minimum required [%s] free " + "space, actual free: [%s]", CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), diskThresholdSettings.getLowWatermarkRaw(), diskThresholdSettings.getFreeBytesThresholdLow(), freeBytesValue);
        } else if (freeBytes > diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {
            // allow allocation: this is a primary shard that has never been allocated, and the node is still under the high watermark
            if (logger.isDebugEnabled()) {
                logger.debug("less than the required {} free bytes threshold ({} free) on node {}, " + "but allowing allocation because primary has never been allocated", diskThresholdSettings.getFreeBytesThresholdLow(), freeBytesValue, node.nodeId());
            }
            return allocation.decision(Decision.YES, NAME, "the node is above the low watermark, but less than the high watermark, and this primary shard has " + "never been allocated before");
        } else {
            // above the high watermark, so don't allow allocating the shard
            if (logger.isDebugEnabled()) {
                logger.debug("less than the required {} free bytes threshold ({} free) on node {}, " + "preventing allocation even though primary has never been allocated", diskThresholdSettings.getFreeBytesThresholdHigh(), freeBytesValue, node.nodeId());
            }
            return allocation.decision(Decision.NO, NAME, "the node is above the high watermark cluster setting [%s=%s], having less than the minimum required [%s] free " + "space, actual free: [%s]", CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), diskThresholdSettings.getHighWatermarkRaw(), diskThresholdSettings.getFreeBytesThresholdHigh(), freeBytesValue);
        }
    }
    // checks for percentage comparisons
    if (freeDiskPercentage < diskThresholdSettings.getFreeDiskThresholdLow()) {
        // If the shard is a replica or is a non-empty primary, check the low threshold
        if (skipLowThresholdChecks == false) {
            if (logger.isDebugEnabled()) {
                logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation", Strings.format1Decimals(usedDiskThresholdLow, "%"), Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
            }
            return allocation.decision(Decision.NO, NAME, "the node is above the low watermark cluster setting [%s=%s], using more disk space than the maximum allowed " + "[%s%%], actual free: [%s%%]", CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), diskThresholdSettings.getLowWatermarkRaw(), usedDiskThresholdLow, freeDiskPercentage);
        } else if (freeDiskPercentage > diskThresholdSettings.getFreeDiskThresholdHigh()) {
            // allow allocation: this is a primary shard that has never been allocated, and the node is still under the high watermark
            if (logger.isDebugEnabled()) {
                logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], " + "but allowing allocation because primary has never been allocated", Strings.format1Decimals(usedDiskThresholdLow, "%"), Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
            }
            return allocation.decision(Decision.YES, NAME, "the node is above the low watermark, but less than the high watermark, and this primary shard has " + "never been allocated before");
        } else {
            // above the high watermark, so don't allow allocating the shard
            if (logger.isDebugEnabled()) {
                logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " + "preventing allocation even though primary has never been allocated", Strings.format1Decimals(diskThresholdSettings.getFreeDiskThresholdHigh(), "%"), Strings.format1Decimals(freeDiskPercentage, "%"), node.nodeId());
            }
            return allocation.decision(Decision.NO, NAME, "the node is above the high watermark cluster setting [%s=%s], using more disk space than the maximum allowed " + "[%s%%], actual free: [%s%%]", CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), diskThresholdSettings.getHighWatermarkRaw(), usedDiskThresholdHigh, freeDiskPercentage);
        }
    }
    // Secondly, check that allocating the shard to this node doesn't put it above the high watermark
    final long shardSize = getExpectedShardSize(shardRouting, 0L, allocation.clusterInfo(), allocation.snapshotShardSizeInfo(), allocation.metadata(), allocation.routingTable());
    assert shardSize >= 0 : shardSize;
    double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize);
    long freeBytesAfterShard = freeBytes - shardSize;
    if (freeBytesAfterShard < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {
        logger.warn("after allocating [{}] node [{}] would have less than the required threshold of " + "{} free (currently {} free, estimated shard size is {}), preventing allocation", shardRouting, node.nodeId(), diskThresholdSettings.getFreeBytesThresholdHigh(), freeBytesValue, new ByteSizeValue(shardSize));
        return allocation.decision(Decision.NO, NAME, "allocating the shard to this node will bring the node above the high watermark cluster setting [%s=%s] " + "and cause it to have less than the minimum required [%s] of free space (free: [%s], estimated shard size: [%s])", CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), diskThresholdSettings.getHighWatermarkRaw(), diskThresholdSettings.getFreeBytesThresholdHigh(), freeBytesValue, new ByteSizeValue(shardSize));
    }
    if (freeSpaceAfterShard < diskThresholdSettings.getFreeDiskThresholdHigh()) {
        logger.warn("after allocating [{}] node [{}] would have more than the allowed " + "{} free disk threshold ({} free), preventing allocation", shardRouting, node.nodeId(), Strings.format1Decimals(diskThresholdSettings.getFreeDiskThresholdHigh(), "%"), Strings.format1Decimals(freeSpaceAfterShard, "%"));
        return allocation.decision(Decision.NO, NAME, "allocating the shard to this node will bring the node above the high watermark cluster setting [%s=%s] " + "and cause it to use more disk space than the maximum allowed [%s%%] (free space after shard added: [%s%%])", CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), diskThresholdSettings.getHighWatermarkRaw(), usedDiskThresholdHigh, freeSpaceAfterShard);
    }
    assert freeBytesAfterShard >= 0 : freeBytesAfterShard;
    return allocation.decision(Decision.YES, NAME, "enough disk for shard on node, free: [%s], shard size: [%s], free after allocating shard: [%s]", freeBytesValue, new ByteSizeValue(shardSize), new ByteSizeValue(freeBytesAfterShard));
}
Also used : ClusterInfo(org.opensearch.cluster.ClusterInfo) ByteSizeValue(org.opensearch.common.unit.ByteSizeValue) DiskUsage(org.opensearch.cluster.DiskUsage)
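
The watermarks consulted in canAllocate come from DiskThresholdSettings. The standalone sketch below shows how they can be configured and how the "free disk threshold" percentages used above are derived from them; the class name and the 85/90/95% values are illustrative assumptions.

import org.opensearch.cluster.routing.allocation.DiskThresholdSettings;
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Settings;

public class WatermarkSettingsExample {

    public static void main(String[] args) {
        Settings settings = Settings.builder()
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "85%")
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "90%")
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "95%")
            .build();
        DiskThresholdSettings thresholds =
            new DiskThresholdSettings(settings, new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));
        // For percentage-based watermarks, the "free disk threshold" is 100 minus the used-space
        // watermark, e.g. an 85% low watermark yields a 15.0 free-disk threshold.
        System.out.println("free disk threshold, low watermark:  " + thresholds.getFreeDiskThresholdLow());
        System.out.println("free disk threshold, high watermark: " + thresholds.getFreeDiskThresholdHigh());
    }
}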

Aggregations

ClusterInfo (org.opensearch.cluster.ClusterInfo)35 ClusterState (org.opensearch.cluster.ClusterState)21 DiskUsage (org.opensearch.cluster.DiskUsage)19 IndexMetadata (org.opensearch.cluster.metadata.IndexMetadata)19 ImmutableOpenMap (org.opensearch.common.collect.ImmutableOpenMap)19 ShardId (org.opensearch.index.shard.ShardId)19 Matchers.containsString (org.hamcrest.Matchers.containsString)18 Metadata (org.opensearch.cluster.metadata.Metadata)18 RoutingTable (org.opensearch.cluster.routing.RoutingTable)18 DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode)16 ShardRouting (org.opensearch.cluster.routing.ShardRouting)16 ClusterSettings (org.opensearch.common.settings.ClusterSettings)16 UnassignedInfo (org.opensearch.cluster.routing.UnassignedInfo)14 Settings (org.opensearch.common.settings.Settings)13 RoutingNode (org.opensearch.cluster.routing.RoutingNode)11 ShardRoutingState (org.opensearch.cluster.routing.ShardRoutingState)11 ClusterInfoService (org.opensearch.cluster.ClusterInfoService)9 AllocateUnassignedDecision (org.opensearch.cluster.routing.allocation.AllocateUnassignedDecision)9 AllocationService (org.opensearch.cluster.routing.allocation.AllocationService)9 MoveDecision (org.opensearch.cluster.routing.allocation.MoveDecision)9