Search in sources :

Example 6 with MismatchingStoreIdException

use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.

the class HighAvailabilityModeSwitcherTest method shouldNotResetAvailableMasterURIIfElectionResultReceived.

@Test
public void shouldNotResetAvailableMasterURIIfElectionResultReceived() throws Throwable {
    /*
     * A masterIsElected event may null out the available master URI currently held by the HAMS. That is
     * a problem when handling the event happens concurrently with an ongoing switch that is re-run because
     * the store was incompatible or a log was missing: the re-run would then find a null master URI
     * and fail.
     */
    // Given
    SwitchToSlaveCopyThenBranch slaveSwitcher = mock(SwitchToSlaveCopyThenBranch.class);
    // Released once the first switchToSlave attempt has been entered
    final CountDownLatch firstAttemptEntered = new CountDownLatch(1);
    // Released once the second switchToSlave attempt has been entered
    final CountDownLatch secondAttemptEntered = new CountDownLatch(1);
    // Holds the first attempt back until the masterIsElected event has been delivered
    final CountDownLatch electionDelivered = new CountDownLatch(1);
    HighAvailabilityModeSwitcher modeSwitcher = new HighAvailabilityModeSwitcher(
            slaveSwitcher,
            mock(SwitchToMaster.class),
            mock(Election.class),
            mock(ClusterMemberAvailability.class),
            mock(ClusterClient.class),
            storeSupplierMock(),
            new InstanceId(1),
            new ComponentSwitcherContainer(),
            neoStoreDataSourceSupplierMock(),
            NullLogService.getInstance());
    URI masterUri = URI.create("ha://server1");
    modeSwitcher.init();
    modeSwitcher.start();
    modeSwitcher.listeningAt(URI.create("ha://server3?serverId=3"));
    // First attempt blocks until the election event is in, then fails with a store id mismatch;
    // the second attempt succeeds.
    when(slaveSwitcher.switchToSlave(any(LifeSupport.class), any(URI.class), any(URI.class), any(CancellationRequest.class)))
            .thenAnswer(invocation -> {
                firstAttemptEntered.countDown();
                electionDelivered.await();
                throw new MismatchingStoreIdException(StoreId.DEFAULT, StoreId.DEFAULT);
            })
            .thenAnswer(invocation -> {
                secondAttemptEntered.countDown();
                return URI.create("ha://server3");
            });

    // When
    // Deliver the masterIsAvailable event, which kicks off the first attempt
    HighAvailabilityMemberChangeEvent masterAvailable =
            new HighAvailabilityMemberChangeEvent(PENDING, TO_SLAVE, new InstanceId(1), masterUri);
    modeSwitcher.masterIsAvailable(masterAvailable);
    // Wait until the first attempt is parked right before throwing
    firstAttemptEntered.await();
    // The attempt is about to throw and therefore re-run; deliver the election event now
    modeSwitcher.masterIsElected(new HighAvailabilityMemberChangeEvent(TO_SLAVE, TO_SLAVE, new InstanceId(1), null));
    // Let the first attempt fail so that the second one starts
    electionDelivered.countDown();
    // Wait for the second attempt to be made
    secondAttemptEntered.await();

    // Then
    // Both attempts must have been given the original master URI
    verify(slaveSwitcher, times(2)).switchToSlave(any(LifeSupport.class), any(URI.class), eq(masterUri), any(CancellationRequest.class));
}
Also used : InstanceId(org.neo4j.cluster.InstanceId) StoreId(org.neo4j.kernel.impl.store.StoreId) ScheduledFuture(java.util.concurrent.ScheduledFuture) NeoStoreDataSource(org.neo4j.kernel.NeoStoreDataSource) SwitchToMaster(org.neo4j.kernel.ha.cluster.SwitchToMaster) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) TO_SLAVE(org.neo4j.kernel.ha.cluster.HighAvailabilityMemberState.TO_SLAVE) Callable(java.util.concurrent.Callable) LifeSupport(org.neo4j.kernel.lifecycle.LifeSupport) NullLogProvider(org.neo4j.logging.NullLogProvider) Supplier(java.util.function.Supplier) Mockito.verifyZeroInteractions(org.mockito.Mockito.verifyZeroInteractions) Answer(org.mockito.stubbing.Answer) Mockito.doThrow(org.mockito.Mockito.doThrow) Future(java.util.concurrent.Future) PENDING(org.neo4j.kernel.ha.cluster.HighAvailabilityMemberState.PENDING) AssertableLogProvider(org.neo4j.logging.AssertableLogProvider) Matchers.eq(org.mockito.Matchers.eq) CancellationRequest(org.neo4j.helpers.CancellationRequest) Mockito.doAnswer(org.mockito.Mockito.doAnswer) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Matchers.anyLong(org.mockito.Matchers.anyLong) URI(java.net.URI) MismatchingStoreIdException(org.neo4j.kernel.impl.store.MismatchingStoreIdException) ExecutorService(java.util.concurrent.ExecutorService) HighAvailabilityMemberState(org.neo4j.kernel.ha.cluster.HighAvailabilityMemberState) InOrder(org.mockito.InOrder) ComException(org.neo4j.com.ComException) SwitchToSlaveCopyThenBranch(org.neo4j.kernel.ha.cluster.SwitchToSlaveCopyThenBranch) Test(org.junit.Test) Mockito.times(org.mockito.Mockito.times) AssertableLogProvider.inLog(org.neo4j.logging.AssertableLogProvider.inLog) Election(org.neo4j.cluster.protocol.election.Election) Mockito.when(org.mockito.Mockito.when) DataSourceManager(org.neo4j.kernel.impl.transaction.state.DataSourceManager) Executors(java.util.concurrent.Executors) Mockito.verify(org.mockito.Mockito.verify) TimeUnit(java.util.concurrent.TimeUnit) 
Matchers.any(org.mockito.Matchers.any) CountDownLatch(java.util.concurrent.CountDownLatch) NullLogService(org.neo4j.kernel.impl.logging.NullLogService) ClusterMemberAvailability(org.neo4j.cluster.member.ClusterMemberAvailability) Mockito.inOrder(org.mockito.Mockito.inOrder) SimpleLogService(org.neo4j.kernel.impl.logging.SimpleLogService) ClusterClient(org.neo4j.cluster.client.ClusterClient) HighAvailabilityMemberChangeEvent(org.neo4j.kernel.ha.cluster.HighAvailabilityMemberChangeEvent) Mockito.reset(org.mockito.Mockito.reset) Mockito.mock(org.mockito.Mockito.mock) InstanceId(org.neo4j.cluster.InstanceId) ClusterMemberAvailability(org.neo4j.cluster.member.ClusterMemberAvailability) MismatchingStoreIdException(org.neo4j.kernel.impl.store.MismatchingStoreIdException) CountDownLatch(java.util.concurrent.CountDownLatch) Election(org.neo4j.cluster.protocol.election.Election) URI(java.net.URI) ClusterClient(org.neo4j.cluster.client.ClusterClient) SwitchToSlaveCopyThenBranch(org.neo4j.kernel.ha.cluster.SwitchToSlaveCopyThenBranch) HighAvailabilityMemberChangeEvent(org.neo4j.kernel.ha.cluster.HighAvailabilityMemberChangeEvent) LifeSupport(org.neo4j.kernel.lifecycle.LifeSupport) SwitchToMaster(org.neo4j.kernel.ha.cluster.SwitchToMaster) CancellationRequest(org.neo4j.helpers.CancellationRequest) Test(org.junit.Test)

Example 7 with MismatchingStoreIdException

use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.

the class SwitchToSlaveBranchThenCopy method checkDataConsistency.

/**
 * Checks this instance's store against the master and reacts to the two known failure modes.
 *
 * <p>Runs {@code checkMyStoreIdAndMastersStoreId} followed by {@code checkDataConsistencyWithMaster}.
 * If either signals a problem, the store is handled as described below and the exception is propagated.
 *
 * @param masterClient client passed on to the data consistency check
 * @param txIdStore used for the data consistency check and to read the last committed transaction id
 * @param storeId store id of the local store
 * @param masterUri URI of the master to check against
 * @param me not used by this method
 * @param cancellationRequest not used by this method
 * @throws StoreUnableToParticipateInClusterException rethrown after the store has been handled according to
 *         the configured {@code HaSettings.branched_data_policy}
 * @throws MismatchingStoreIdException rethrown when the last committed transaction id equals
 *         {@code BASE_TX_ID}, after the store has been handled with {@code BranchedDataPolicy.keep_none}
 * @throws ForeignStoreException when the store ids mismatch and the last committed transaction id differs
 *         from {@code BASE_TX_ID}
 */
void checkDataConsistency(MasterClient masterClient, TransactionIdStore txIdStore, StoreId storeId, URI masterUri, URI me, CancellationRequest cancellationRequest) throws Throwable {
    try {
        userLog.info("Checking store consistency with master");
        checkMyStoreIdAndMastersStoreId(storeId, masterUri, resolver);
        checkDataConsistencyWithMaster(masterUri, masterClient, storeId, txIdStore);
        userLog.info("Store is consistent");
    } catch (StoreUnableToParticipateInClusterException unableToParticipate) {
        userLog.info("The store is inconsistent. Will treat it as branched and fetch a new one from the master");
        msgLog.warn("Current store is unable to participate in the cluster; fetching new store from master", unableToParticipate);
        try {
            stopServicesAndHandleBranchedStore(config.get(HaSettings.branched_data_policy));
        } catch (IOException branchHandlingFailure) {
            // Best effort: the original problem is still reported to the caller below
            msgLog.warn("Failed while trying to handle branched data", branchHandlingFailure);
        }
        throw unableToParticipate;
    } catch (MismatchingStoreIdException mismatch) {
        userLog.info("The store does not represent the same database as master. Will remove and fetch a new one from master");
        boolean nothingCommittedYet = txIdStore.getLastCommittedTransactionId() == BASE_TX_ID;
        if (!nothingCommittedYet) {
            // The store has transactions beyond the base id, so it is reported as foreign instead of removed
            msgLog.error("Store cannot participate in cluster due to mismatching store IDs", mismatch);
            throw new ForeignStoreException(mismatch.getExpected(), mismatch.getEncountered());
        }
        msgLog.warn("Found and deleting empty store with mismatching store id", mismatch);
        stopServicesAndHandleBranchedStore(BranchedDataPolicy.keep_none);
        throw mismatch;
    }
}
Also used : ForeignStoreException(org.neo4j.kernel.ha.store.ForeignStoreException) StoreUnableToParticipateInClusterException(org.neo4j.kernel.ha.StoreUnableToParticipateInClusterException) MismatchingStoreIdException(org.neo4j.kernel.impl.store.MismatchingStoreIdException) IOException(java.io.IOException)

Example 8 with MismatchingStoreIdException

use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.

the class BackupService method incrementalWithContext.

/**
 * Performs an incremental backup based off the given context. This means
 * receiving and applying selectively (i.e. irrespective of the actual state
 * of the target db) a set of transactions starting at the desired txId and
 * spanning up to the latest of the master.
 *
 * @param sourceHostNameOrIp host name or IP address handed to the {@code BackupClient}
 * @param sourcePort port handed to the {@code BackupClient}
 * @param targetDb The database that contains a previous full copy; its dependency resolver and store id are used
 * @param timeout timeout handed to the {@code BackupClient} (unit not visible here; confirm against BackupClient)
 * @param context The context, containing transaction id to start streaming transaction from
 * @return a {@code BackupOutcome} built from the last transaction id seen by the progress handler
 * @throws IncrementalBackupNotPossibleException if the failure was caused by a {@code MissingLogDataException}
 * @throws RuntimeException for a mismatching store id, a connection failure, or any other failure
 */
private BackupOutcome incrementalWithContext(String sourceHostNameOrIp, int sourcePort, GraphDatabaseAPI targetDb, long timeout, RequestContext context) throws IncrementalBackupNotPossibleException {
    DependencyResolver resolver = targetDb.getDependencyResolver();
    ProgressTxHandler handler = new ProgressTxHandler();
    TransactionCommittingResponseUnpacker unpacker = new TransactionCommittingResponseUnpacker(resolver, DEFAULT_BATCH_SIZE, 0);
    Monitors monitors = resolver.resolveDependency(Monitors.class);
    LogProvider logProvider = resolver.resolveDependency(LogService.class).getInternalLogProvider();
    BackupClient client = new BackupClient(
            sourceHostNameOrIp,
            sourcePort,
            null,
            logProvider,
            targetDb.storeId(),
            timeout,
            unpacker,
            monitors.newMonitor(ByteCounterMonitor.class, BackupClient.class),
            monitors.newMonitor(RequestMonitor.class, BackupClient.class),
            new VersionAwareLogEntryReader<>());
    // Resources are closed in reverse order: the response first, then the lifespan
    try (Lifespan lifespan = new Lifespan(unpacker, client);
            Response<Void> response = client.incrementalBackup(context)) {
        unpacker.unpackResponse(response, handler);
    } catch (MismatchingStoreIdException storeMismatch) {
        throw new RuntimeException(DIFFERENT_STORE, storeMismatch);
    } catch (RuntimeException | IOException failure) {
        // instanceof is false for null, so a missing cause falls through to the generic error
        Throwable rootCause = failure.getCause();
        if (rootCause instanceof MissingLogDataException) {
            throw new IncrementalBackupNotPossibleException(TOO_OLD_BACKUP, rootCause);
        }
        if (rootCause instanceof ConnectException) {
            throw new RuntimeException(failure.getMessage(), rootCause);
        }
        throw new RuntimeException("Failed to perform incremental backup.", failure);
    } catch (Throwable unexpected) {
        throw new RuntimeException("Unexpected error", unexpected);
    }
    return new BackupOutcome(handler.getLastSeenTransactionId(), true);
}
Also used : MismatchingStoreIdException(org.neo4j.kernel.impl.store.MismatchingStoreIdException) IOException(java.io.IOException) TransactionCommittingResponseUnpacker(org.neo4j.com.storecopy.TransactionCommittingResponseUnpacker) DependencyResolver(org.neo4j.graphdb.DependencyResolver) NullLogProvider(org.neo4j.logging.NullLogProvider) FormattedLogProvider(org.neo4j.logging.FormattedLogProvider) LogProvider(org.neo4j.logging.LogProvider) ByteCounterMonitor(org.neo4j.kernel.monitoring.ByteCounterMonitor) Monitors(org.neo4j.kernel.monitoring.Monitors) Lifespan(org.neo4j.kernel.lifecycle.Lifespan) LogService(org.neo4j.kernel.impl.logging.LogService) StoreLogService(org.neo4j.kernel.impl.logging.StoreLogService) MissingLogDataException(org.neo4j.kernel.impl.transaction.log.MissingLogDataException) RequestMonitor(org.neo4j.com.monitor.RequestMonitor) ConnectException(java.net.ConnectException)

Example 9 with MismatchingStoreIdException

use of org.neo4j.kernel.impl.store.MismatchingStoreIdException in project neo4j by neo4j.

the class HighAvailabilityModeSwitcher method switchToSlave.

/**
 * Hands a task to {@code startModeSwitching} that attempts to make this instance a slave of
 * {@code availableMasterId}.
 *
 * <p>The task delegates the actual work to {@code switchToSlave.switchToSlave(...)} and handles failures
 * in three ways: a {@code HighAvailabilityStoreFailureException} reschedules the task after 5 seconds,
 * a {@code MismatchingStoreIdException} re-runs the task immediately, and any other {@code Throwable}
 * reschedules it with an increasing delay capped at 5 minutes.
 */
private void switchToSlave() {
    /*
     * This is purely defensive and should never trigger. There was a race where the switch to slave task would
     * start after this instance was elected master and the task would constantly try to change as slave
     * for itself, never cancelling. This now should not be possible, since we cancel the task and wait for it
     * to complete, all in a single thread executor. However, this is a check worth doing because if this
     * condition slips through via some other code path it can cause trouble.
     */
    if (getServerId(availableMasterId).equals(instanceId)) {
        msgLog.error("I (" + me + ") tried to switch to slave for myself as master (" + availableMasterId + ")");
        return;
    }
    // Retry delay in seconds, shared across re-runs of the task below; starts at 0
    final AtomicLong wait = new AtomicLong();
    // Passed both to startModeSwitching and to the switchToSlave call, and polled by the task itself
    final CancellationHandle cancellationHandle = new CancellationHandle();
    // An anonymous class rather than a lambda: the task reschedules itself through 'this'
    startModeSwitching(new Runnable() {

        @Override
        public void run() {
            if (currentTargetState != HighAvailabilityMemberState.TO_SLAVE) {
                // Already switched - this can happen if a second master becomes available while waiting
                return;
            }
            if (cancellationHandle.cancellationRequested()) {
                msgLog.info("Switch to slave cancelled on start.");
                return;
            }
            componentSwitcher.switchToSlave();
            try {
                // Second cancellation check, after the components were switched but before touching
                // the HA communication life
                if (cancellationHandle.cancellationRequested()) {
                    msgLog.info("Switch to slave cancelled before ha communication started.");
                    return;
                }
                // Each attempt starts from a fresh LifeSupport; the previous one is shut down first
                haCommunicationLife.shutdown();
                haCommunicationLife = new LifeSupport();
                // it is important for availableMasterId to be re-read on every attempt so that
                // slave switching would not result in an infinite loop with wrong/stale availableMasterId
                URI resultingSlaveHaURI = switchToSlave.switchToSlave(haCommunicationLife, me, availableMasterId, cancellationHandle);
                if (resultingSlaveHaURI == null) {
                    /*
                     * null slave uri means the task was cancelled. The task then must simply terminate and
                     * have no side effects.
                     */
                    msgLog.info("Switch to slave is effectively cancelled");
                } else {
                    slaveHaURI = resultingSlaveHaURI;
                    canAskForElections.set(true);
                }
            } catch (HighAvailabilityStoreFailureException e) {
                userLog.error("UNABLE TO START UP AS SLAVE: %s", e.getMessage());
                msgLog.error("Unable to start up as slave", e);
                clusterMemberAvailability.memberIsUnavailable(SLAVE);
                ClusterClient clusterClient = HighAvailabilityModeSwitcher.this.clusterClient;
                try {
                    // TODO I doubt this actually works
                    clusterClient.leave();
                    clusterClient.stop();
                    haCommunicationLife.shutdown();
                } catch (Throwable t) {
                    // Failure to leave/stop is only logged; the retry below is still scheduled
                    msgLog.error("Unable to stop cluster client", t);
                }
                // Fixed 5 second retry.
                // NOTE(review): unlike the generic Throwable branch below, the returned future is not
                // stored in modeSwitcherFuture - confirm whether that is intentional.
                modeSwitcherExecutor.schedule(this, 5, TimeUnit.SECONDS);
            } catch (MismatchingStoreIdException e) {
                // Try again immediately, the place that threw it have already treated the db
                // as branched and so a new attempt will have this slave copy a new store from master.
                // NOTE(review): this is a direct recursive call on the current thread, so every
                // consecutive mismatch adds a stack frame.
                run();
            } catch (Throwable t) {
                msgLog.error("Error while trying to switch to slave", t);
                // Try again later
                // Exponential backoff
                // The delay goes 1, 3, 7, 15, ... seconds (next = 1 + 2 * previous)
                wait.set(1 + wait.get() * 2);
                // Wait maximum 5 minutes
                wait.set(Math.min(wait.get(), 5 * 60));
                modeSwitcherFuture = modeSwitcherExecutor.schedule(this, wait.get(), TimeUnit.SECONDS);
                msgLog.info("Attempting to switch to slave in %ds", wait.get());
            }
        }
    }, cancellationHandle);
}
Also used : AtomicLong(java.util.concurrent.atomic.AtomicLong) ClusterClient(org.neo4j.cluster.client.ClusterClient) HighAvailabilityStoreFailureException(org.neo4j.kernel.ha.store.HighAvailabilityStoreFailureException) LifeSupport(org.neo4j.kernel.lifecycle.LifeSupport) MismatchingStoreIdException(org.neo4j.kernel.impl.store.MismatchingStoreIdException) URI(java.net.URI)

Aggregations

MismatchingStoreIdException (org.neo4j.kernel.impl.store.MismatchingStoreIdException)9 URI (java.net.URI)4 StoreId (org.neo4j.kernel.impl.store.StoreId)4 IOException (java.io.IOException)3 Test (org.junit.Test)3 InstanceId (org.neo4j.cluster.InstanceId)2 ClusterClient (org.neo4j.cluster.client.ClusterClient)2 ComException (org.neo4j.com.ComException)2 LifeSupport (org.neo4j.kernel.lifecycle.LifeSupport)2 NullLogProvider (org.neo4j.logging.NullLogProvider)2 ConnectException (java.net.ConnectException)1 Callable (java.util.concurrent.Callable)1 CountDownLatch (java.util.concurrent.CountDownLatch)1 ExecutorService (java.util.concurrent.ExecutorService)1 Executors (java.util.concurrent.Executors)1 Future (java.util.concurrent.Future)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 ScheduledFuture (java.util.concurrent.ScheduledFuture)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1