use of org.apache.cassandra.service.paxos.Commit in project cassandra by apache.
the class StorageProxy method beginAndRepairPaxos.
/**
* Begins a Paxos session by sending a prepare request and completing any in-progress round seen in the replies.
* <p>
* The method loops until the CAS contention timeout (read from {@code DatabaseDescriptor.getCasContentionTimeout()},
* measured from {@code queryStartNanoTime}) has elapsed. Each iteration prepares a fresh ballot and then does one of:
* <ul>
* <li>backs off for a random delay and retries, if the prepare was not promised;</li>
* <li>re-proposes and commits an unfinished in-progress update under the new ballot, then retries;</li>
* <li>sends the most recent commit to the replicas reported as missing it, then retries;</li>
* <li>returns, if none of the above applied.</li>
* </ul>
*
* @param queryStartNanoTime start of the query, compared against {@code System.nanoTime()} to enforce the timeout
* @param key partition key the prepare is created for
* @param metadata table the prepare is created for
* @param liveEndpoints endpoints passed to the prepare and propose phases
* @param requiredParticipants participant count passed to the prepare and propose phases
* @param consistencyForPaxos consistency level used for prepare/propose and reported in the timeout exception
* @param consistencyForCommit consistency level used when committing an unfinished in-progress round
* @param isWrite selects which metrics are updated: {@code casWriteMetrics} when true, {@code casReadMetrics} otherwise
* @param state client state that supplies the ballot timestamp
* @return a pair of the promised ballot and the number of contentions counted while obtaining it; as written,
* this method never returns {@code null}
* @throws WriteTimeoutException with {@code WriteType.CAS} if the contention timeout elapses before a ballot can be
* returned, or if committing an unfinished round times out
* @throws WriteFailureException declared by this method; it is not thrown directly here (presumably propagated
* from the prepare/propose/commit helpers -- confirm against their signatures)
*/
private static Pair<UUID, Integer> beginAndRepairPaxos(long queryStartNanoTime, DecoratedKey key, TableMetadata metadata, List<InetAddress> liveEndpoints, int requiredParticipants, ConsistencyLevel consistencyForPaxos, ConsistencyLevel consistencyForCommit, final boolean isWrite, ClientState state) throws WriteTimeoutException, WriteFailureException {
// The configured contention timeout is in milliseconds; convert it so it can be compared with nanoTime deltas.
long timeout = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getCasContentionTimeout());
// Result of the previous prepare attempt, if any; null on the first iteration.
PrepareCallback summary = null;
// Number of times we had to back off because another proposer held a higher ballot.
int contentions = 0;
while (System.nanoTime() - queryStartNanoTime < timeout) {
// We want a timestamp that is guaranteed to be unique for that node (so that the ballot is globally unique), but if we've got a prepare rejected
// already we also want to make sure we pick a timestamp that has a chance to be promised, i.e. one that is greater than the most recently known
// in progress (#5667). Lastly, we don't want to use a timestamp that is older than the last one assigned by ClientState or operations may appear
// out-of-order (#7801).
long minTimestampMicrosToUse = summary == null ? Long.MIN_VALUE : 1 + UUIDGen.microsTimestamp(summary.mostRecentInProgressCommit.ballot);
long ballotMicros = state.getTimestampForPaxos(minTimestampMicrosToUse);
// Note that ballotMicros is not guaranteed to be unique if two proposals are being handled concurrently by the same coordinator. But we still
// need ballots to be unique for each proposal so we have to use getRandomTimeUUIDFromMicros.
UUID ballot = UUIDGen.getRandomTimeUUIDFromMicros(ballotMicros);
// prepare
Tracing.trace("Preparing {}", ballot);
Commit toPrepare = Commit.newPrepare(key, metadata, ballot);
summary = preparePaxos(toPrepare, liveEndpoints, requiredParticipants, consistencyForPaxos, queryStartNanoTime);
if (!summary.promised) {
Tracing.trace("Some replicas have already promised a higher ballot than ours; aborting");
contentions++;
// sleep a random amount (0-99 ms) to give the other proposer a chance to finish
Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
continue;
}
Commit inProgress = summary.mostRecentInProgressCommitWithUpdate;
Commit mostRecent = summary.mostRecentCommit;
// If the in-progress commit carries a non-empty update and is after the most recent commit we know of,
// then it is an unfinished round that needs to be completed, so do it.
if (!inProgress.update.isEmpty() && inProgress.isAfter(mostRecent)) {
Tracing.trace("Finishing incomplete paxos round {}", inProgress);
if (isWrite)
casWriteMetrics.unfinishedCommit.inc();
else
casReadMetrics.unfinishedCommit.inc();
// Re-propose the unfinished update under our own (newly promised) ballot.
Commit refreshedInProgress = Commit.newProposal(ballot, inProgress.update);
if (proposePaxos(refreshedInProgress, liveEndpoints, requiredParticipants, false, consistencyForPaxos, queryStartNanoTime)) {
try {
commitPaxos(refreshedInProgress, consistencyForCommit, false, queryStartNanoTime);
} catch (WriteTimeoutException e) {
recordCasContention(contentions);
// We're still doing preparation for the paxos rounds, so we want to report the CAS write type (see CASSANDRA-8672)
throw new WriteTimeoutException(WriteType.CAS, e.consistency, e.received, e.blockFor);
}
} else {
Tracing.trace("Some replicas have already promised a higher ballot than ours; aborting");
// sleep a random amount (0-99 ms) to give the other proposer a chance to finish
contentions++;
Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
}
// Whether the unfinished round was completed or we lost the race, start over with a new prepare.
continue;
}
// To be able to propose our value on a new round, we need a quorum of replicas to have learned the previous one. Why is explained at:
// https://issues.apache.org/jira/browse/CASSANDRA-5062?focusedCommentId=13619810&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13619810)
// Since we waited for quorum nodes, if some of them haven't seen the last commit (which may just be a timing issue, but may also
// mean we lost messages), we pro-actively "repair" those nodes, and retry.
// The ballot timestamp is in microseconds; checkedCast throws if the resulting seconds value does not fit in an int.
int nowInSec = Ints.checkedCast(TimeUnit.MICROSECONDS.toSeconds(ballotMicros));
Iterable<InetAddress> missingMRC = summary.replicasMissingMostRecentCommit(metadata, nowInSec);
if (Iterables.size(missingMRC) > 0) {
Tracing.trace("Repairing replicas that missed the most recent commit");
sendCommit(mostRecent, missingMRC);
// After sending the commit we retry from the prepare phase. The original note here (truncated in this copy)
// said that, once a later ticket lands, CL.ALL could be passed to the commit above and the 'continue' removed.
continue;
}
// Promised, nothing left to finish, and no replica is missing the most recent commit: hand the ballot back.
return Pair.create(ballot, contentions);
}
// Ran out of time without obtaining a usable ballot.
recordCasContention(contentions);
throw new WriteTimeoutException(WriteType.CAS, consistencyForPaxos, 0, consistencyForPaxos.blockFor(Keyspace.open(metadata.keyspace)));
}
use of org.apache.cassandra.service.paxos.Commit in project cassandra by apache.
the class PaxosStateTest method testCommittingAfterTruncation.
/**
 * Checks how {@code PaxosState.commit} interacts with truncation: the same proposal is committed before and
 * after {@code truncateBlocking()}, and the test asserts the data is present after the first commit and absent
 * after the second. A proposal created with a timestamp one past the recorded truncation time is then committed
 * and asserted to be present.
 */
@Test
public void testCommittingAfterTruncation() throws Exception {
    ColumnFamilyStore store = Keyspace.open("PaxosStateTestKeyspace1").getColumnFamilyStore("Standard1");
    String partitionKey = "key" + System.nanoTime();
    ByteBuffer cellValue = ByteBufferUtil.bytes(0);

    // Build a single-partition update writing "val" under clustering "a".
    RowUpdateBuilder rowBuilder = new RowUpdateBuilder(store.metadata(), FBUtilities.timestampMicros(), partitionKey);
    rowBuilder.clustering("a").add("val", cellValue);
    PartitionUpdate partitionUpdate = Iterables.getOnlyElement(rowBuilder.build().getPartitionUpdates());

    // Nothing should be stored under this key before any commit.
    assertNoDataPresent(store, Util.dk(partitionKey));

    // First commit: the data must become visible.
    Commit preTruncation = newProposal(0, partitionUpdate);
    PaxosState.commit(preTruncation);
    assertDataPresent(store, Util.dk(partitionKey), "val", cellValue);

    // Truncate, then replay the same proposal: the test expects no data to reappear,
    // as the proposal predates the truncation.
    store.truncateBlocking();
    PaxosState.commit(preTruncation);
    assertNoDataPresent(store, Util.dk(partitionKey));

    // A proposal whose timestamp is one past the truncation time must be applied.
    long postTruncationTimestamp = SystemKeyspace.getTruncatedAt(partitionUpdate.metadata().id) + 1;
    Commit postTruncation = newProposal(postTruncationTimestamp, partitionUpdate);
    PaxosState.commit(postTruncation);
    assertDataPresent(store, Util.dk(partitionKey), "val", cellValue);
}
use of org.apache.cassandra.service.paxos.Commit in project cassandra by apache.
the class Ballots method read.
/**
 * Loads the Paxos state for {@code key} and summarises it as a {@code LatestBallots}.
 * <p>
 * The four values passed to the {@code LatestBallots} constructor are, in order: the microsecond timestamp of
 * the promised ballot, the latest ballot of the accepted update (or {@code 0L} when there is no usable accepted
 * commit or its update is empty), the latest ballot of the most recent commit's update, and the latest ballot
 * read from the base table.
 *
 * @param key partition key whose Paxos state is loaded
 * @param metadata table the key belongs to
 * @param nowInSec passed through to {@code loadPaxosState}
 * @param includeEmptyProposals not read anywhere in this method body
 * @return the value produced by {@code NonInterceptible.apply} for the computation above
 */
public static LatestBallots read(DecoratedKey key, TableMetadata metadata, int nowInSec, boolean includeEmptyProposals) {
    return NonInterceptible.apply(() -> {
        PaxosState paxosState = loadPaxosState(key, metadata, nowInSec);
        UUID promisedBallot = paxosState.promised.ballot;

        // NOTE(review): the accepted commit is dropped when isAfter(accepted, mostRecentCommit) is true and kept
        // otherwise -- confirm this direction is the intended one.
        Commit acceptedCommit;
        if (isAfter(paxosState.accepted, paxosState.mostRecentCommit)) {
            acceptedCommit = null;
        } else {
            acceptedCommit = paxosState.accepted;
        }
        Commit latestCommit = paxosState.mostRecentCommit;
        long baseTableBallot = latestBallotFromBaseTable(key, metadata);

        long promisedMicros = UUIDGen.microsTimestamp(promisedBallot);
        // No accepted commit, or one with an empty update, is reported as 0.
        long acceptedBallot = 0L;
        if (acceptedCommit != null && !acceptedCommit.update.isEmpty()) {
            acceptedBallot = latestBallot(acceptedCommit.update);
        }
        return new LatestBallots(promisedMicros, acceptedBallot, latestBallot(latestCommit.update), baseTableBallot);
    });
}
use of org.apache.cassandra.service.paxos.Commit in project cassandra by apache.
the class WriteCallbackInfoTest method testShouldHint.
/**
 * Builds a {@code WriteCallbackInfo} for the given verb and consistency level and checks its hinting behaviour.
 * <p>
 * For {@code Verb.PAXOS_COMMIT_REQ} the message payload is a {@code Commit}; for any other verb it is a
 * {@code Mutation}. The test asserts that {@code shouldHint()} equals {@code expectHint}. When a hint is
 * expected, {@code mutation()} must return non-null; otherwise {@code mutation()} must throw.
 *
 * @param verb verb of the outgoing message
 * @param cl consistency level handed to the callback info
 * @param allowHints whether hints are allowed for the callback info
 * @param expectHint expected result of {@code shouldHint()}
 */
private void testShouldHint(Verb verb, ConsistencyLevel cl, boolean allowHints, boolean expectHint) {
    TableMetadata table = MockSchema.newTableMetadata("", "");

    Object payload;
    if (verb == Verb.PAXOS_COMMIT_REQ) {
        payload = new Commit(UUID.randomUUID(), new PartitionUpdate.Builder(table, ByteBufferUtil.EMPTY_BYTE_BUFFER, RegularAndStaticColumns.NONE, 1).build());
    } else {
        payload = new Mutation(PartitionUpdate.simpleBuilder(table, "").build());
    }

    RequestCallbacks.WriteCallbackInfo callbackInfo = new RequestCallbacks.WriteCallbackInfo(Message.out(verb, payload), full(testEp), null, cl, allowHints);
    Assert.assertEquals(expectHint, callbackInfo.shouldHint());

    if (expectHint) {
        Assert.assertNotNull(callbackInfo.mutation());
        return;
    }

    // No hint expected: asking for the mutation has to fail with some Throwable.
    boolean threw;
    try {
        callbackInfo.mutation();
        threw = false;
    } catch (Throwable t) {
        threw = true;
    }
    Assert.assertTrue(threw);
}
use of org.apache.cassandra.service.paxos.Commit in project cassandra by apache.
the class ModificationStatement method casInternal.
/**
 * Executes a CAS request locally: reads the current partition, checks the request's conditions against it
 * and, if they hold, builds the updates, runs them through the trigger executor and applies them as the
 * mutation of a proposal.
 *
 * @param request the CAS request supplying the read command, the condition check and the updates
 * @param state query state whose timestamp is used to derive the ballot
 * @return the row iterator over the current partition when the request does not apply to it;
 * {@code null} once the updates have been applied
 */
static RowIterator casInternal(CQL3CasRequest request, QueryState state) {
    // The ballot is derived from the query timestamp before anything is read.
    UUID proposalBallot = UUIDGen.getTimeUUIDFromMicros(state.getTimestamp());
    SinglePartitionReadCommand read = request.readCommand(FBUtilities.nowInSeconds());

    FilteredPartition existing;
    try (ReadExecutionController controller = read.executionController();
         PartitionIterator partitions = read.executeInternal(controller)) {
        existing = FilteredPartition.create(PartitionIterators.getOnlyElement(partitions, read));
    }

    if (request.appliesTo(existing)) {
        // Conditions hold: build the updates, let triggers process them, then apply them.
        PartitionUpdate pending = TriggerExecutor.instance.execute(request.makeUpdates(existing));
        Commit.newProposal(proposalBallot, pending).makeMutation().apply();
        return null;
    }

    // Conditions failed: hand back the rows that were read.
    return existing.rowIterator();
}
Aggregations