use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.
the class HighAvailabilityModeSwitcherTest method shouldNotResetAvailableMasterURIIfElectionResultReceived.
@Test
public void shouldNotResetAvailableMasterURIIfElectionResultReceived() throws Throwable {
    /*
     * Scenario: a masterIsElected event may null out the available master URI held by the mode switcher.
     * If handling that event is concurrent with an ongoing switch-to-slave which then re-runs (because the
     * store was incompatible or a log was missing), the re-run would find a null master URI and fail.
     * This test verifies that the second attempt is still made against the original master URI.
     */
    // Given
    SwitchToSlaveCopyThenBranch slaveSwitcher = mock(SwitchToSlaveCopyThenBranch.class);
    // Counted down when the first switchToSlave attempt has been entered
    final CountDownLatch firstAttemptEntered = new CountDownLatch(1);
    // Counted down when the second switchToSlave attempt has been entered
    final CountDownLatch secondAttemptEntered = new CountDownLatch(1);
    // Holds the first attempt back until the masterIsElected event has been delivered
    final CountDownLatch electionDelivered = new CountDownLatch(1);
    HighAvailabilityModeSwitcher modeSwitcher = new HighAvailabilityModeSwitcher(
            slaveSwitcher,
            mock(SwitchToMaster.class),
            mock(Election.class),
            mock(ClusterMemberAvailability.class),
            mock(ClusterClient.class),
            storeSupplierMock(),
            new InstanceId(1),
            new ComponentSwitcherContainer(),
            neoStoreDataSourceSupplierMock(),
            NullLogService.getInstance());
    URI masterUri = URI.create("ha://server1");
    modeSwitcher.init();
    modeSwitcher.start();
    modeSwitcher.listeningAt(URI.create("ha://server3?serverId=3"));
    // First invocation blocks and then fails with a store id mismatch; the second one succeeds
    when(slaveSwitcher.switchToSlave(
            any(LifeSupport.class),
            any(URI.class),
            any(URI.class),
            any(CancellationRequest.class)))
            .thenAnswer(call -> {
                firstAttemptEntered.countDown();
                electionDelivered.await();
                throw new MismatchingStoreIdException(StoreId.DEFAULT, StoreId.DEFAULT);
            })
            .thenAnswer(call -> {
                secondAttemptEntered.countDown();
                return URI.create("ha://server3");
            });

    // When
    // Deliver the master-available event, which kicks off the first attempt
    HighAvailabilityMemberChangeEvent masterAvailable =
            new HighAvailabilityMemberChangeEvent(PENDING, TO_SLAVE, new InstanceId(1), masterUri);
    modeSwitcher.masterIsAvailable(masterAvailable);
    // Block until the first attempt is parked just before it throws
    firstAttemptEntered.await();
    // The attempt is about to fail and be re-run; deliver the election result now
    HighAvailabilityMemberChangeEvent masterElected =
            new HighAvailabilityMemberChangeEvent(TO_SLAVE, TO_SLAVE, new InstanceId(1), null);
    modeSwitcher.masterIsElected(masterElected);
    // Release the first attempt so that the re-run can happen
    electionDelivered.countDown();
    // Block until the second attempt has been entered
    secondAttemptEntered.await();

    // Then
    verify(slaveSwitcher, times(2)).switchToSlave(
            any(LifeSupport.class),
            any(URI.class),
            eq(masterUri),
            any(CancellationRequest.class));
}
use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.
the class SwitchToSlaveBranchThenCopy method checkDataConsistency.
/**
 * Verifies that the local store can be used together with the master's store.
 * <p>
 * Two checks are run in order: the store id check and the data consistency check against the master.
 * Failures are handled as follows:
 * <ul>
 * <li>{@link StoreUnableToParticipateInClusterException}: the branched-data handling is invoked with the
 * configured {@code HaSettings.branched_data_policy} (an {@link IOException} from that step is only logged)
 * and the original exception is rethrown.</li>
 * <li>{@link MismatchingStoreIdException}: if the last committed transaction id equals {@code BASE_TX_ID}
 * the store is handled with {@code BranchedDataPolicy.keep_none} and the exception is rethrown; otherwise a
 * {@link ForeignStoreException} carrying the expected and encountered store ids is thrown.</li>
 * </ul>
 * The {@code me} and {@code cancellationRequest} parameters are not used by this method body.
 *
 * @param masterClient client passed on to the data consistency check
 * @param txIdStore source of the last committed transaction id
 * @param storeId id of the local store
 * @param masterUri URI of the master to check against
 * @param me URI of this instance (unused here)
 * @param cancellationRequest cancellation request (unused here)
 * @throws Throwable any failure from the checks or from the branched-store handling
 */
void checkDataConsistency(MasterClient masterClient, TransactionIdStore txIdStore, StoreId storeId, URI masterUri, URI me, CancellationRequest cancellationRequest) throws Throwable {
    try {
        userLog.info("Checking store consistency with master");
        checkMyStoreIdAndMastersStoreId(storeId, masterUri, resolver);
        checkDataConsistencyWithMaster(masterUri, masterClient, storeId, txIdStore);
        userLog.info("Store is consistent");
    } catch (StoreUnableToParticipateInClusterException unableToParticipate) {
        userLog.info("The store is inconsistent. Will treat it as branched and fetch a new one from the master");
        msgLog.warn("Current store is unable to participate in the cluster; fetching new store from master", unableToParticipate);
        try {
            stopServicesAndHandleBranchedStore(config.get(HaSettings.branched_data_policy));
        } catch (IOException ioFailure) {
            // Best effort: a failure to move the branched data aside must not mask the original problem
            msgLog.warn("Failed while trying to handle branched data", ioFailure);
        }
        throw unableToParticipate;
    } catch (MismatchingStoreIdException mismatch) {
        userLog.info("The store does not represent the same database as master. Will remove and fetch a new one from " + "master");
        final boolean storeIsEmpty = txIdStore.getLastCommittedTransactionId() == BASE_TX_ID;
        if (!storeIsEmpty) {
            // A store with committed transactions is reported as foreign instead of being removed
            msgLog.error("Store cannot participate in cluster due to mismatching store IDs", mismatch);
            throw new ForeignStoreException(mismatch.getExpected(), mismatch.getEncountered());
        }
        msgLog.warn("Found and deleting empty store with mismatching store id", mismatch);
        stopServicesAndHandleBranchedStore(BranchedDataPolicy.keep_none);
        throw mismatch;
    }
}
use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.
the class BackupService method incrementalWithContext.
/**
 * Performs an incremental backup based off the given context. Transactions are requested from the
 * source starting at the transaction id carried by the context and are applied to the target database
 * as the response is unpacked.
 *
 * @param sourceHostNameOrIp host name or IP address of the backup source
 * @param sourcePort port of the backup source
 * @param targetDb the database that contains a previous full copy
 * @param timeout timeout handed to the backup client
 * @param context the context, containing the transaction id to start streaming transactions from
 * @return the outcome of the backup, carrying the last transaction id seen while applying the response
 * @throws IncrementalBackupNotPossibleException if the failure was caused by a
 * {@link MissingLogDataException}
 * @throws RuntimeException if the store ids differ, on connection failure, or on any other failure
 */
private BackupOutcome incrementalWithContext(String sourceHostNameOrIp, int sourcePort, GraphDatabaseAPI targetDb, long timeout, RequestContext context) throws IncrementalBackupNotPossibleException {
    DependencyResolver resolver = targetDb.getDependencyResolver();
    ProgressTxHandler txProgress = new ProgressTxHandler();
    TransactionCommittingResponseUnpacker responseUnpacker =
            new TransactionCommittingResponseUnpacker(resolver, DEFAULT_BATCH_SIZE, 0);
    Monitors monitors = resolver.resolveDependency(Monitors.class);
    LogProvider logProvider = resolver.resolveDependency(LogService.class).getInternalLogProvider();
    BackupClient backupClient = new BackupClient(
            sourceHostNameOrIp,
            sourcePort,
            null,
            logProvider,
            targetDb.storeId(),
            timeout,
            responseUnpacker,
            monitors.newMonitor(ByteCounterMonitor.class, BackupClient.class),
            monitors.newMonitor(RequestMonitor.class, BackupClient.class),
            new VersionAwareLogEntryReader<>());

    // Resources are closed in reverse order: the response first, then the unpacker/client lifespan
    try (Lifespan lifespan = new Lifespan(responseUnpacker, backupClient);
            Response<Void> response = backupClient.incrementalBackup(context)) {
        responseUnpacker.unpackResponse(response, txProgress);
    } catch (MismatchingStoreIdException e) {
        throw new RuntimeException(DIFFERENT_STORE, e);
    } catch (RuntimeException | IOException e) {
        // instanceof is false for null, so a missing cause falls through to the generic failure
        Throwable cause = e.getCause();
        if (cause instanceof MissingLogDataException) {
            throw new IncrementalBackupNotPossibleException(TOO_OLD_BACKUP, cause);
        }
        if (cause instanceof ConnectException) {
            throw new RuntimeException(e.getMessage(), cause);
        }
        throw new RuntimeException("Failed to perform incremental backup.", e);
    } catch (Throwable throwable) {
        throw new RuntimeException("Unexpected error", throwable);
    }
    return new BackupOutcome(txProgress.getLastSeenTransactionId(), true);
}
use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.
the class HighAvailabilityModeSwitcher method switchToSlave.
/**
 * Starts a mode switch of this instance to slave for the currently available master.
 * <p>
 * After a defensive self-check, a task is handed to {@code startModeSwitching} together with a
 * cancellation handle. The task switches the components to slave, replaces the HA communication
 * life and delegates the actual switch to {@code switchToSlave.switchToSlave(...)}. Failures are
 * retried by the task itself: after a fixed 5 seconds for a store failure, immediately for a
 * mismatching store id, and with exponential backoff (capped at 5 minutes) for anything else.
 */
private void switchToSlave() {
/*
* This is purely defensive and should never trigger. There was a race where the switch to slave task would
* start after this instance was elected master and the task would constantly try to change as slave
* for itself, never cancelling. This now should not be possible, since we cancel the task and wait for it
* to complete, all in a single thread executor. However, this is a check worth doing because if this
* condition slips through via some other code path it can cause trouble.
*/
if (getServerId(availableMasterId).equals(instanceId)) {
msgLog.error("I (" + me + ") tried to switch to slave for myself as master (" + availableMasterId + ")");
return;
}
// Current backoff delay in seconds; shared across re-runs of the task below
final AtomicLong wait = new AtomicLong();
final CancellationHandle cancellationHandle = new CancellationHandle();
startModeSwitching(new Runnable() {
@Override
public void run() {
if (currentTargetState != HighAvailabilityMemberState.TO_SLAVE) {
// Already switched - this can happen if a second master becomes available while waiting
return;
}
if (cancellationHandle.cancellationRequested()) {
msgLog.info("Switch to slave cancelled on start.");
return;
}
componentSwitcher.switchToSlave();
try {
// Cancellation is checked again because it may have been requested during the component switch
if (cancellationHandle.cancellationRequested()) {
msgLog.info("Switch to slave cancelled before ha communication started.");
return;
}
// Discard the previous HA communication life and start this attempt with a fresh one
haCommunicationLife.shutdown();
haCommunicationLife = new LifeSupport();
// it is important for availableMasterId to be re-read on every attempt so that
// slave switching would not result in an infinite loop with wrong/stale availableMasterId
URI resultingSlaveHaURI = switchToSlave.switchToSlave(haCommunicationLife, me, availableMasterId, cancellationHandle);
if (resultingSlaveHaURI == null) {
/*
* null slave uri means the task was cancelled. The task then must simply terminate and
* have no side effects.
*/
msgLog.info("Switch to slave is effectively cancelled");
} else {
slaveHaURI = resultingSlaveHaURI;
canAskForElections.set(true);
}
} catch (HighAvailabilityStoreFailureException e) {
// Store-level failure: report it, mark this member unavailable as slave, attempt to leave
// the cluster, then retry after a fixed delay
userLog.error("UNABLE TO START UP AS SLAVE: %s", e.getMessage());
msgLog.error("Unable to start up as slave", e);
clusterMemberAvailability.memberIsUnavailable(SLAVE);
ClusterClient clusterClient = HighAvailabilityModeSwitcher.this.clusterClient;
try {
// TODO I doubt this actually works
clusterClient.leave();
clusterClient.stop();
haCommunicationLife.shutdown();
} catch (Throwable t) {
msgLog.error("Unable to stop cluster client", t);
}
// NOTE(review): unlike the generic retry below, the returned future is not stored in
// modeSwitcherFuture here - confirm whether that is intentional
modeSwitcherExecutor.schedule(this, 5, TimeUnit.SECONDS);
} catch (MismatchingStoreIdException e) {
// Try again immediately, the place that threw it has already treated the db
// as branched and so a new attempt will have this slave copy a new store from master.
// NOTE(review): this is a direct recursive call, so each consecutive mismatch adds a stack frame
run();
} catch (Throwable t) {
msgLog.error("Error while trying to switch to slave", t);
// Try again later
// Exponential backoff: the delay goes 1, 3, 7, 15, ... seconds
wait.set(1 + wait.get() * 2);
// Wait maximum 5 minutes
wait.set(Math.min(wait.get(), 5 * 60));
modeSwitcherFuture = modeSwitcherExecutor.schedule(this, wait.get(), TimeUnit.SECONDS);
msgLog.info("Attempting to switch to slave in %ds", wait.get());
}
}
}, cancellationHandle);
}
Aggregations