Use of org.opensearch.index.seqno.RetentionLeases in project OpenSearch by opensearch-project.
From class SoftDeletesPolicy, method getMinRetainedSeqNo:
/**
 * Returns the min seqno that is retained in the Lucene index.
 * Operations whose seq# is at least this value should exist in the Lucene index.
 */
synchronized long getMinRetainedSeqNo() {
    /*
     * When an engine is flushed, we need to provide it the latest collection of retention leases even when the soft deletes
     * policy is locked for peer recovery.
     */
    final RetentionLeases retentionLeases = retentionLeasesSupplier.get();
    // do not advance if the retention lock is held
    if (retentionLockCount == 0) {
        /*
         * This policy retains operations for two purposes: peer recovery and querying changes history.
         * - Peer recovery is driven by the local checkpoint of the safe commit. In peer recovery, the primary transfers a safe
         *   commit, then sends operations after the local checkpoint of that commit. This requires keeping all ops after
         *   localCheckpointOfSafeCommit.
         * - Changes APIs are driven by a combination of the global checkpoint, retention operations, and retention leases. Here
         *   we prefer using the global checkpoint instead of the maximum sequence number because only operations up to the
         *   global checkpoint are exposed in the changes APIs.
         */
        // calculate the minimum sequence number to retain based on retention leases
        final long minimumRetainingSequenceNumber = retentionLeases.leases()
            .stream()
            .mapToLong(RetentionLease::retainingSequenceNumber)
            .min()
            .orElse(Long.MAX_VALUE);
        /*
         * The minimum sequence number to retain is the minimum of the minimum based on retention leases and the first sequence
         * number of the operations kept below the global checkpoint (index.soft_deletes.retention.operations). The additional
         * increments on the global checkpoint and the local checkpoint of the safe commit are due to the fact that we want to
         * retain all operations above those checkpoints.
         */
        final long minSeqNoForQueryingChanges = Math.min(
            1 + globalCheckpointSupplier.getAsLong() - retentionOperations,
            minimumRetainingSequenceNumber
        );
        final long minSeqNoToRetain = Math.min(minSeqNoForQueryingChanges, 1 + localCheckpointOfSafeCommit);
        /*
         * We take the maximum because minSeqNoToRetain can go backward: the retention operations value can be changed in
         * settings, or a lease can be added with a retaining sequence number lower than previous retaining sequence numbers.
         */
        minRetainedSeqNo = Math.max(minRetainedSeqNo, minSeqNoToRetain);
    }
    return minRetainedSeqNo;
}
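To make the retention floor concrete, here is a minimal, self-contained sketch of the same arithmetic. The names (leaseRetainingSeqNos, globalCheckpoint, retentionOperations, localCheckpointOfSafeCommit) are hypothetical stand-ins for the policy's fields and suppliers; this is an illustration, not OpenSearch's implementation.

import java.util.stream.LongStream;

final class MinRetainedSeqNoSketch {

    // Standalone version of the computation in getMinRetainedSeqNo() above.
    static long minSeqNoToRetain(long[] leaseRetainingSeqNos, long globalCheckpoint,
                                 long retentionOperations, long localCheckpointOfSafeCommit) {
        // smallest sequence number that any retention lease still needs
        long minLease = LongStream.of(leaseRetainingSeqNos).min().orElse(Long.MAX_VALUE);
        // keep retentionOperations ops below the global checkpoint for the changes APIs
        long minForChanges = Math.min(1 + globalCheckpoint - retentionOperations, minLease);
        // peer recovery needs every operation above the safe commit's local checkpoint
        return Math.min(minForChanges, 1 + localCheckpointOfSafeCommit);
    }

    public static void main(String[] args) {
        // one lease retaining from seqno 90; global checkpoint 120; retain 50 ops; safe commit at 100
        System.out.println(minSeqNoToRetain(new long[] { 90 }, 120, 50, 100)); // prints 71
    }
}

Every operation with a sequence number at or above this floor must survive Lucene merges; the engine persists the floor in the commit user data under MIN_RETAINED_SEQNO, as the InternalEngineTests snippet below verifies.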
Use of org.opensearch.index.seqno.RetentionLeases in project OpenSearch by opensearch-project.
From class IndexRecoveryIT, method testUsesFileBasedRecoveryIfOperationsBasedRecoveryWouldBeUnreasonable:
public void testUsesFileBasedRecoveryIfOperationsBasedRecoveryWouldBeUnreasonable() throws Exception {
    internalCluster().ensureAtLeastNumDataNodes(2);
    String indexName = "test-index";
    final Settings.Builder settings = Settings.builder()
        .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
        .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
        .put(IndexSettings.INDEX_SOFT_DELETES_SETTING.getKey(), true)
        .put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "12h")
        .put(IndexService.RETENTION_LEASE_SYNC_INTERVAL_SETTING.getKey(), "100ms");
    final double reasonableOperationsBasedRecoveryProportion;
    if (randomBoolean()) {
        reasonableOperationsBasedRecoveryProportion = randomDoubleBetween(0.05, 0.99, true);
        settings.put(IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.getKey(), reasonableOperationsBasedRecoveryProportion);
    } else {
        reasonableOperationsBasedRecoveryProportion = IndexSettings.FILE_BASED_RECOVERY_THRESHOLD_SETTING.get(Settings.EMPTY);
    }
    logger.info("--> performing ops-based recoveries up to [{}%] of docs", reasonableOperationsBasedRecoveryProportion * 100.0);
    createIndex(indexName, settings.build());
    indexRandom(
        randomBoolean(),
        false,
        randomBoolean(),
        IntStream.range(0, between(0, 100)).mapToObj(n -> client().prepareIndex(indexName).setSource("num", n)).collect(toList())
    );
    ensureGreen(indexName);
    flush(indexName);
    // wait for all history to be discarded
    assertBusy(() -> {
        for (ShardStats shardStats : client().admin().indices().prepareStats(indexName).get().getShards()) {
            final long maxSeqNo = shardStats.getSeqNoStats().getMaxSeqNo();
            assertTrue(
                shardStats.getRetentionLeaseStats().retentionLeases() + " should discard history up to " + maxSeqNo,
                shardStats.getRetentionLeaseStats()
                    .retentionLeases()
                    .leases()
                    .stream()
                    .allMatch(l -> l.retainingSequenceNumber() == maxSeqNo + 1)
            );
        }
    });
    // ensure that all operations are in the safe commit
    flush(indexName);
    final ShardStats shardStats = client().admin().indices().prepareStats(indexName).get().getShards()[0];
    final long docCount = shardStats.getStats().docs.getCount();
    assertThat(shardStats.getStats().docs.getDeleted(), equalTo(0L));
    assertThat(shardStats.getSeqNoStats().getMaxSeqNo() + 1, equalTo(docCount));
    final ShardId shardId = new ShardId(resolveIndex(indexName), 0);
    final DiscoveryNodes discoveryNodes = clusterService().state().nodes();
    final IndexShardRoutingTable indexShardRoutingTable = clusterService().state().routingTable().shardRoutingTable(shardId);
    final ShardRouting replicaShardRouting = indexShardRoutingTable.replicaShards().get(0);
    assertTrue(
        "should have lease for " + replicaShardRouting,
        client().admin()
            .indices()
            .prepareStats(indexName)
            .get()
            .getShards()[0].getRetentionLeaseStats()
            .retentionLeases()
            .contains(ReplicationTracker.getPeerRecoveryRetentionLeaseId(replicaShardRouting))
    );
    internalCluster().restartNode(discoveryNodes.get(replicaShardRouting.currentNodeId()).getName(), new InternalTestCluster.RestartCallback() {
        @Override
        public Settings onNodeStopped(String nodeName) throws Exception {
            assertFalse(
                client().admin()
                    .cluster()
                    .prepareHealth()
                    .setWaitForNodes(Integer.toString(discoveryNodes.getSize() - 1))
                    .setWaitForEvents(Priority.LANGUID)
                    .get()
                    .isTimedOut()
            );
            final int newDocCount = Math.toIntExact(
                Math.round(
                    Math.ceil(
                        (1 + Math.ceil(docCount * reasonableOperationsBasedRecoveryProportion))
                            / (1 - reasonableOperationsBasedRecoveryProportion)
                    )
                )
            );
            /*
             * newDocCount >= (ceil(docCount * p) + 1) / (1 - p)
             *
             * ==> 0 <= newDocCount * (1 - p) - ceil(docCount * p) - 1
             *        =  newDocCount - (newDocCount * p + ceil(docCount * p) + 1)
             *        <  newDocCount - (ceil(newDocCount * p) + ceil(docCount * p))
             *        <= newDocCount - ceil(newDocCount * p + docCount * p)
             *
             * ==> docCount < newDocCount + docCount - ceil((newDocCount + docCount) * p)
             *              == localCheckpoint + 1 - ceil((newDocCount + docCount) * p)
             *              == firstReasonableSeqNo
             *
             * The replica has docCount docs, i.e. has operations with seqnos [0..docCount-1], so a seqno-based recovery will
             * start from docCount < firstReasonableSeqNo
             *
             * ==> it is unreasonable to recover the replica using a seqno-based recovery
             */
            indexRandom(
                randomBoolean(),
                randomBoolean(),
                randomBoolean(),
                IntStream.range(0, newDocCount).mapToObj(n -> client().prepareIndex(indexName).setSource("num", n)).collect(toList())
            );
            flush(indexName);
            assertBusy(
                () -> assertFalse(
                    "should no longer have lease for " + replicaShardRouting,
                    client().admin()
                        .indices()
                        .prepareStats(indexName)
                        .get()
                        .getShards()[0].getRetentionLeaseStats()
                        .retentionLeases()
                        .contains(ReplicationTracker.getPeerRecoveryRetentionLeaseId(replicaShardRouting))
                )
            );
            return super.onNodeStopped(nodeName);
        }
    });
    ensureGreen(indexName);
    // noinspection OptionalGetWithoutIsPresent because it fails the test if absent
    final RecoveryState recoveryState = client().admin()
        .indices()
        .prepareRecoveries(indexName)
        .get()
        .shardRecoveryStates()
        .get(indexName)
        .stream()
        .filter(rs -> rs.getPrimary() == false)
        .findFirst()
        .get();
    assertThat(recoveryState.getIndex().totalFileCount(), greaterThan(0));
}
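The test sizes newDocCount so that an operations-based recovery would have to replay more than the configured proportion of the shard's operations, which forces a file-based recovery. As a simplified sketch of that decision rule (hypothetical helper; the real recovery planner is not structured this way):

final class RecoveryChoiceSketch {

    // An ops-based recovery is "reasonable" only while the operations to replay
    // stay within threshold * totalOps; beyond that, copying files wins.
    static boolean useFileBasedRecovery(long totalOps, long opsToReplay, double threshold) {
        return opsToReplay > totalOps * threshold;
    }

    public static void main(String[] args) {
        System.out.println(useFileBasedRecovery(100, 40, 0.25)); // true: replaying 40 of 100 ops is too many
        System.out.println(useFileBasedRecovery(100, 10, 0.25)); // false: replaying 10 of 100 ops is fine
    }
}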
Use of org.opensearch.index.seqno.RetentionLeases in project OpenSearch by opensearch-project.
From class IndexRecoveryIT, method testRecoverLocallyUpToGlobalCheckpoint:
public void testRecoverLocallyUpToGlobalCheckpoint() throws Exception {
    internalCluster().ensureAtLeastNumDataNodes(2);
    List<String> nodes = randomSubsetOf(
        2,
        StreamSupport.stream(clusterService().state().nodes().getDataNodes().spliterator(), false)
            .map(node -> node.value.getName())
            .collect(Collectors.toSet())
    );
    String indexName = "test-index";
    createIndex(
        indexName,
        Settings.builder()
            .put("index.number_of_shards", 1)
            .put("index.number_of_replicas", 1)
            .put(IndexService.GLOBAL_CHECKPOINT_SYNC_INTERVAL_SETTING.getKey(), "12h")
            .put("index.routing.allocation.include._name", String.join(",", nodes))
            .build()
    );
    ensureGreen(indexName);
    int numDocs = randomIntBetween(0, 100);
    indexRandom(
        randomBoolean(),
        false,
        randomBoolean(),
        IntStream.range(0, numDocs).mapToObj(n -> client().prepareIndex(indexName).setSource("num", n)).collect(toList())
    );
    client().admin().indices().prepareRefresh(indexName).get(); // avoid refresh when we are failing a shard
    String failingNode = randomFrom(nodes);
    PlainActionFuture<StartRecoveryRequest> startRecoveryRequestFuture = new PlainActionFuture<>();
    // Peer recovery fails if the primary does not see the recovering replica in the replication group (when the cluster state
    // update on the primary is delayed). To verify the local recovery stats, we have to remember this value manually on the
    // first try, because the local recovery happens once and its stats are reset when the recovery fails.
    SetOnce<Integer> localRecoveredOps = new SetOnce<>();
    for (String node : nodes) {
        MockTransportService transportService = (MockTransportService) internalCluster().getInstance(TransportService.class, node);
        transportService.addSendBehavior((connection, requestId, action, request, options) -> {
            if (action.equals(PeerRecoverySourceService.Actions.START_RECOVERY)) {
                final RecoveryState recoveryState = internalCluster().getInstance(IndicesService.class, failingNode)
                    .getShardOrNull(new ShardId(resolveIndex(indexName), 0))
                    .recoveryState();
                assertThat(recoveryState.getTranslog().recoveredOperations(), equalTo(recoveryState.getTranslog().totalLocal()));
                if (startRecoveryRequestFuture.isDone()) {
                    assertThat(recoveryState.getTranslog().totalLocal(), equalTo(0));
                    recoveryState.getTranslog().totalLocal(localRecoveredOps.get());
                    recoveryState.getTranslog().incrementRecoveredOperations(localRecoveredOps.get());
                } else {
                    localRecoveredOps.set(recoveryState.getTranslog().totalLocal());
                    startRecoveryRequestFuture.onResponse((StartRecoveryRequest) request);
                }
            }
            if (action.equals(PeerRecoveryTargetService.Actions.FILE_CHUNK)) {
                RetentionLeases retentionLeases = internalCluster().getInstance(IndicesService.class, node)
                    .indexServiceSafe(resolveIndex(indexName))
                    .getShard(0)
                    .getRetentionLeases();
                throw new AssertionError("expect an operation-based recovery [retention leases: " + Strings.toString(retentionLeases) + "]");
            }
            connection.sendRequest(requestId, action, request, options);
        });
    }
    IndexShard shard = internalCluster().getInstance(IndicesService.class, failingNode)
        .getShardOrNull(new ShardId(resolveIndex(indexName), 0));
    final long lastSyncedGlobalCheckpoint = shard.getLastSyncedGlobalCheckpoint();
    final long localCheckpointOfSafeCommit;
    try (Engine.IndexCommitRef safeCommitRef = shard.acquireSafeIndexCommit()) {
        localCheckpointOfSafeCommit = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(
            safeCommitRef.getIndexCommit().getUserData().entrySet()
        ).localCheckpoint;
    }
    final long maxSeqNo = shard.seqNoStats().getMaxSeqNo();
    shard.failShard("test", new IOException("simulated"));
    StartRecoveryRequest startRecoveryRequest = startRecoveryRequestFuture.actionGet();
    logger.info(
        "--> start recovery request: starting seq_no {}, commit {}",
        startRecoveryRequest.startingSeqNo(),
        startRecoveryRequest.metadataSnapshot().getCommitUserData()
    );
    SequenceNumbers.CommitInfo commitInfoAfterLocalRecovery = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(
        startRecoveryRequest.metadataSnapshot().getCommitUserData().entrySet()
    );
    assertThat(commitInfoAfterLocalRecovery.localCheckpoint, equalTo(lastSyncedGlobalCheckpoint));
    assertThat(commitInfoAfterLocalRecovery.maxSeqNo, equalTo(lastSyncedGlobalCheckpoint));
    assertThat(startRecoveryRequest.startingSeqNo(), equalTo(lastSyncedGlobalCheckpoint + 1));
    ensureGreen(indexName);
    assertThat((long) localRecoveredOps.get(), equalTo(lastSyncedGlobalCheckpoint - localCheckpointOfSafeCommit));
    for (RecoveryState recoveryState : client().admin().indices().prepareRecoveries().get().shardRecoveryStates().get(indexName)) {
        if (startRecoveryRequest.targetNode().equals(recoveryState.getTargetNode())) {
            assertThat("expect an operation-based recovery", recoveryState.getIndex().fileDetails(), empty());
            assertThat(
                "total recovered translog operations must include both local and remote recovery",
                recoveryState.getTranslog().recoveredOperations(),
                greaterThanOrEqualTo(Math.toIntExact(maxSeqNo - localCheckpointOfSafeCommit))
            );
        }
    }
    for (String node : nodes) {
        MockTransportService transportService = (MockTransportService) internalCluster().getInstance(TransportService.class, node);
        transportService.clearAllRules();
    }
}
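The assertions above reduce to checkpoint arithmetic: the replica replays its own translog from the safe commit up to the last synced global checkpoint, and the primary resends only operations beyond that point. A small worked example with hypothetical values:

final class LocalRecoveryMathSketch {

    public static void main(String[] args) {
        long localCheckpointOfSafeCommit = 42; // highest seqno fully contained in the safe commit
        long lastSyncedGlobalCheckpoint = 57;  // the replica may recover locally up to here
        long maxSeqNo = 63;                    // highest seqno the shard has seen

        long locallyRecoveredOps = lastSyncedGlobalCheckpoint - localCheckpointOfSafeCommit; // 15 ops replayed locally
        long startingSeqNo = lastSyncedGlobalCheckpoint + 1;                                 // 58: first op the primary must send
        long minTotalRecoveredOps = maxSeqNo - localCheckpointOfSafeCommit;                  // 21: local + remote replay combined

        System.out.println(locallyRecoveredOps + ", " + startingSeqNo + ", " + minTotalRecoveredOps);
    }
}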
Use of org.opensearch.index.seqno.RetentionLeases in project OpenSearch by opensearch-project.
From class EngineConfigFactoryTests, method testCreateCodecServiceFromFactory:
public void testCreateCodecServiceFromFactory() {
    IndexMetadata meta = IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1).build();
    List<EnginePlugin> plugins = Arrays.asList(new BakEnginePlugin());
    IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", meta.getSettings());
    EngineConfigFactory factory = new EngineConfigFactory(plugins, indexSettings);
    EngineConfig config = factory.newEngineConfig(
        null, null, indexSettings, null, null, null, null, null, null, null, null, null, null,
        TimeValue.timeValueMinutes(5),
        null, null, null, null, null,
        () -> new RetentionLeases(0, 0, Collections.emptyList()),
        null, null
    );
    assertNotNull(config.getCodec());
}
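The only RetentionLeases dependency here is the supplier passed to newEngineConfig; a test can satisfy it with an empty collection. A minimal sketch of such a supplier, mirroring the lambda above (primary term and version 0, as in the test):

import java.util.Collections;
import java.util.function.Supplier;

import org.opensearch.index.seqno.RetentionLeases;

final class EmptyLeasesSupplier {

    // A live shard would supply its ReplicationTracker's current leases instead.
    static Supplier<RetentionLeases> empty() {
        return () -> new RetentionLeases(0, 0, Collections.emptyList());
    }
}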
Use of org.opensearch.index.seqno.RetentionLeases in project OpenSearch by opensearch-project.
From class InternalEngineTests, method testKeepMinRetainedSeqNoByMergePolicy:
public void testKeepMinRetainedSeqNoByMergePolicy() throws IOException {
    IOUtils.close(engine, store);
    Settings.Builder settings = Settings.builder()
        .put(defaultSettings.getSettings())
        .put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), randomLongBetween(0, 10));
    final IndexMetadata indexMetadata = IndexMetadata.builder(defaultSettings.getIndexMetadata()).settings(settings).build();
    final IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(indexMetadata);
    final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
    final long primaryTerm = randomLongBetween(1, Long.MAX_VALUE);
    final AtomicLong retentionLeasesVersion = new AtomicLong();
    final AtomicReference<RetentionLeases> retentionLeasesHolder = new AtomicReference<>(
        new RetentionLeases(primaryTerm, retentionLeasesVersion.get(), Collections.emptyList())
    );
    final List<Engine.Operation> operations = generateSingleDocHistory(true, randomFrom(VersionType.INTERNAL, VersionType.EXTERNAL), 2, 10, 300, "2");
    Randomness.shuffle(operations);
    Set<Long> existingSeqNos = new HashSet<>();
    store = createStore();
    engine = createEngine(config(indexSettings, store, createTempDir(), newMergePolicy(), null, null, globalCheckpoint::get, retentionLeasesHolder::get));
    assertThat(engine.getMinRetainedSeqNo(), equalTo(0L));
    long lastMinRetainedSeqNo = engine.getMinRetainedSeqNo();
    for (Engine.Operation op : operations) {
        final Engine.Result result;
        if (op instanceof Engine.Index) {
            result = engine.index((Engine.Index) op);
        } else {
            result = engine.delete((Engine.Delete) op);
        }
        existingSeqNos.add(result.getSeqNo());
        if (randomBoolean()) {
            // advance persisted local checkpoint
            engine.syncTranslog();
            assertEquals(engine.getProcessedLocalCheckpoint(), engine.getPersistedLocalCheckpoint());
            globalCheckpoint.set(randomLongBetween(globalCheckpoint.get(), engine.getLocalCheckpointTracker().getPersistedCheckpoint()));
        }
        if (randomBoolean()) {
            retentionLeasesVersion.incrementAndGet();
            final int length = randomIntBetween(0, 8);
            final List<RetentionLease> leases = new ArrayList<>(length);
            for (int i = 0; i < length; i++) {
                final String id = randomAlphaOfLength(8);
                final long retainingSequenceNumber = randomLongBetween(0, Math.max(0, globalCheckpoint.get()));
                final long timestamp = randomLongBetween(0L, Long.MAX_VALUE);
                final String source = randomAlphaOfLength(8);
                leases.add(new RetentionLease(id, retainingSequenceNumber, timestamp, source));
            }
            retentionLeasesHolder.set(new RetentionLeases(primaryTerm, retentionLeasesVersion.get(), leases));
        }
        if (rarely()) {
            settings.put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), randomLongBetween(0, 10));
            indexSettings.updateIndexMetadata(IndexMetadata.builder(defaultSettings.getIndexMetadata()).settings(settings).build());
            engine.onSettingsChanged(
                indexSettings.getTranslogRetentionAge(),
                indexSettings.getTranslogRetentionSize(),
                indexSettings.getSoftDeleteRetentionOperations()
            );
        }
        if (rarely()) {
            engine.refresh("test");
        }
        if (rarely()) {
            engine.flush(true, true);
            assertThat(
                Long.parseLong(engine.getLastCommittedSegmentInfos().userData.get(Engine.MIN_RETAINED_SEQNO)),
                equalTo(engine.getMinRetainedSeqNo())
            );
        }
        if (rarely()) {
            engine.forceMerge(randomBoolean(), 1, false, false, false, UUIDs.randomBase64UUID());
        }
        try (Closeable ignored = engine.acquireHistoryRetentionLock()) {
            long minRetainSeqNos = engine.getMinRetainedSeqNo();
            assertThat(minRetainSeqNos, lessThanOrEqualTo(globalCheckpoint.get() + 1));
            Long[] expectedOps = existingSeqNos.stream().filter(seqno -> seqno >= minRetainSeqNos).toArray(Long[]::new);
            Set<Long> actualOps = readAllOperationsInLucene(engine, createMapperService("test")).stream()
                .map(Translog.Operation::seqNo)
                .collect(Collectors.toSet());
            assertThat(actualOps, containsInAnyOrder(expectedOps));
        }
        try (Engine.IndexCommitRef commitRef = engine.acquireSafeIndexCommit()) {
            IndexCommit safeCommit = commitRef.getIndexCommit();
            if (safeCommit.getUserData().containsKey(Engine.MIN_RETAINED_SEQNO)) {
                lastMinRetainedSeqNo = Long.parseLong(safeCommit.getUserData().get(Engine.MIN_RETAINED_SEQNO));
            }
        }
    }
    if (randomBoolean()) {
        engine.close();
    } else {
        engine.flushAndClose();
    }
    try (InternalEngine recoveringEngine = new InternalEngine(engine.config())) {
        assertThat(recoveringEngine.getMinRetainedSeqNo(), equalTo(lastMinRetainedSeqNo));
    }
}
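After each random step, the invariant under the history retention lock is that every indexed seqno at or above the retention floor is still readable from Lucene, regardless of merges. A minimal sketch of the expected-set computation the test performs (hypothetical helper mirroring the filter above):

import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

final class RetentionInvariantSketch {

    static Set<Long> expectedRetained(Set<Long> existingSeqNos, long minRetainedSeqNo) {
        return existingSeqNos.stream().filter(seqNo -> seqNo >= minRetainedSeqNo).collect(Collectors.toSet());
    }

    public static void main(String[] args) {
        Set<Long> existing = LongStream.rangeClosed(0, 9).boxed().collect(Collectors.toSet());
        System.out.println(expectedRetained(existing, 4)); // seqnos 4..9 must survive merges (printed order may vary)
    }
}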