
Example 16 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testCheckpointStatsTrackerRestoreCallback.

/**
 * Tests that the restore callbacks are called if registered.
 */
@Test
public void testCheckpointStatsTrackerRestoreCallback() throws Exception {
    ExecutionVertex vertex1 = mockExecutionVertex(new ExecutionAttemptID());
    StandaloneCompletedCheckpointStore store = new StandaloneCompletedCheckpointStore(1);
    store.addCheckpoint(new CompletedCheckpoint(new JobID(), 0, 0, 0, Collections.<JobVertexID, TaskState>emptyMap()));
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(new JobID(), 600000, 600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { vertex1 }, new ExecutionVertex[] { vertex1 }, new ExecutionVertex[] { vertex1 }, new StandaloneCheckpointIDCounter(), store, null, Executors.directExecutor());
    CheckpointStatsTracker tracker = mock(CheckpointStatsTracker.class);
    coord.setCheckpointStatsTracker(tracker);
    assertTrue(coord.restoreLatestCheckpointedState(Collections.<JobVertexID, ExecutionJobVertex>emptyMap(), false, true));
    verify(tracker, times(1)).reportRestoredCheckpoint(any(RestoredCheckpointStats.class));
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
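
The tests in these examples rely on a mockExecutionVertex(...) helper defined elsewhere in CheckpointCoordinatorTest and not shown in the excerpts. The following is a minimal sketch, assuming Mockito, of how such a helper could look; it only stubs the calls that actually appear in these excerpts (getCurrentExecutionAttempt, getAttemptId, getState, getSimpleName) and is not the actual Flink implementation.

private static ExecutionVertex mockExecutionVertex(ExecutionAttemptID attemptID) {
    // assumed sketch: an execution attempt that reports RUNNING and carries the given attempt ID
    Execution execution = mock(Execution.class);
    when(execution.getAttemptId()).thenReturn(attemptID);
    when(execution.getState()).thenReturn(ExecutionState.RUNNING);
    // the vertex hands out that attempt as its current execution
    ExecutionVertex vertex = mock(ExecutionVertex.class);
    when(vertex.getCurrentExecutionAttempt()).thenReturn(execution);
    when(vertex.getSimpleName()).thenReturn("mock vertex");
    return vertex;
}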

Example 17 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testMaxConcurrentAttempsWithSubsumption.

@Test
public void testMaxConcurrentAttempsWithSubsumption() {
    try {
        final int maxConcurrentAttempts = 2;
        final JobID jid = new JobID();
        // create some mock execution vertices and trigger some checkpoint
        final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
        ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
        ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
        ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);
        CheckpointCoordinator coord = new CheckpointCoordinator(
            jid,
            10,                     // periodic interval is 10 ms
            200000,                 // timeout is very long (200 s)
            0L,                     // no extra delay
            maxConcurrentAttempts,  // max two concurrent checkpoints
            ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { triggerVertex }, new ExecutionVertex[] { ackVertex }, new ExecutionVertex[] { commitVertex },
            new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(2), null, Executors.directExecutor());
        coord.startCheckpointScheduler();
        // after a while, there should be exactly as many checkpoints
        // as concurrently permitted
        long now = System.currentTimeMillis();
        long timeout = now + 60000;
        long minDuration = now + 100;
        do {
            Thread.sleep(20);
        } while ((now = System.currentTimeMillis()) < minDuration || (coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts && now < timeout));
        // validate that the pending checkpoints are there
        assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
        assertNotNull(coord.getPendingCheckpoints().get(1L));
        assertNotNull(coord.getPendingCheckpoints().get(2L));
        // now we acknowledge the second checkpoint, which should subsume the first checkpoint
        // and allow two more checkpoints to be triggered
        // now, once we acknowledge one checkpoint, it should trigger the next one
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L));
        // after a while, there should be the new checkpoints
        final long newTimeout = System.currentTimeMillis() + 60000;
        do {
            Thread.sleep(20);
        } while (coord.getPendingCheckpoints().get(4L) == null && System.currentTimeMillis() < newTimeout);
        // do the final check
        assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
        assertNotNull(coord.getPendingCheckpoints().get(3L));
        assertNotNull(coord.getPendingCheckpoints().get(4L));
        coord.shutdown(JobStatus.FINISHED);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) JobID(org.apache.flink.api.common.JobID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) IOException(java.io.IOException) Test(org.junit.Test)
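
The do/while loops in this test (and in the following ones) all follow the same wait-until-condition-or-timeout pattern. A small helper like the one below could factor that out; this is a hypothetical refactoring sketch, not part of the original test class.

// Hypothetical helper: busy-wait until the condition holds or the timeout elapses.
static void waitUntil(java.util.function.BooleanSupplier condition, long timeoutMillis) throws InterruptedException {
    long deadline = System.currentTimeMillis() + timeoutMillis;
    while (!condition.getAsBoolean() && System.currentTimeMillis() < deadline) {
        Thread.sleep(20);
    }
}
// Example usage, replacing the second polling loop above:
// waitUntil(() -> coord.getPendingCheckpoints().get(4L) != null, 60000);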

Example 18 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testStopPeriodicScheduler.

@Test
public void testStopPeriodicScheduler() throws Exception {
    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(new JobID(), 600000, 600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), new ExecutionVertex[] { vertex1 }, new ExecutionVertex[] { vertex1 }, new ExecutionVertex[] { vertex1 }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(1), null, Executors.directExecutor());
    // A periodic trigger is declined while the periodic scheduler is not running
    CheckpointTriggerResult triggerResult = coord.triggerCheckpoint(System.currentTimeMillis(), CheckpointProperties.forStandardCheckpoint(), null, true);
    assertTrue(triggerResult.isFailure());
    assertEquals(CheckpointDeclineReason.PERIODIC_SCHEDULER_SHUTDOWN, triggerResult.getFailureReason());
    // A non-periodic (manually triggered) checkpoint still goes through
    triggerResult = coord.triggerCheckpoint(System.currentTimeMillis(), CheckpointProperties.forStandardCheckpoint(), null, false);
    assertFalse(triggerResult.isFailure());
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
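
Once the periodic scheduler is running, the same periodic trigger call is no longer declined with PERIODIC_SCHEDULER_SHUTDOWN. A hypothetical continuation of the test could check this (illustrative sketch only, not part of the original test):

    // start the scheduler, after which a periodic trigger should be accepted
    coord.startCheckpointScheduler();
    triggerResult = coord.triggerCheckpoint(System.currentTimeMillis(), CheckpointProperties.forStandardCheckpoint(), null, true);
    assertFalse(triggerResult.isFailure());
    coord.stopCheckpointScheduler();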

Example 19 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testPeriodicTriggering.

@Test
public void testPeriodicTriggering() {
    try {
        final JobID jid = new JobID();
        final long start = System.currentTimeMillis();
        // create some mock execution vertices and trigger some checkpoint
        final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
        ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
        ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
        ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);
        final AtomicInteger numCalls = new AtomicInteger();
        final Execution execution = triggerVertex.getCurrentExecutionAttempt();
        doAnswer(new Answer<Void>() {

            private long lastId = -1;

            private long lastTs = -1;

            @Override
            public Void answer(InvocationOnMock invocation) throws Throwable {
                long id = (Long) invocation.getArguments()[0];
                long ts = (Long) invocation.getArguments()[1];
                assertTrue(id > lastId);
                assertTrue(ts >= lastTs);
                assertTrue(ts >= start);
                lastId = id;
                lastTs = ts;
                numCalls.incrementAndGet();
                return null;
            }
        }).when(execution).triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));
        CheckpointCoordinator coord = new CheckpointCoordinator(
            jid,
            10,      // periodic interval is 10 ms
            200000,  // timeout is very long (200 s)
            0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { triggerVertex }, new ExecutionVertex[] { ackVertex }, new ExecutionVertex[] { commitVertex }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(2), null, Executors.directExecutor());
        coord.startCheckpointScheduler();
        long timeout = System.currentTimeMillis() + 60000;
        do {
            Thread.sleep(20);
        } while (timeout > System.currentTimeMillis() && numCalls.get() < 5);
        assertTrue(numCalls.get() >= 5);
        coord.stopCheckpointScheduler();
        // for 400 ms, no further calls may come.
        // there may be the case that one trigger was fired and about to
        // acquire the lock, such that after cancelling it will still do
        // the remainder of its work
        int numCallsSoFar = numCalls.get();
        Thread.sleep(400);
        assertTrue(numCallsSoFar == numCalls.get() || numCallsSoFar + 1 == numCalls.get());
        // start another sequence of periodic scheduling
        numCalls.set(0);
        coord.startCheckpointScheduler();
        timeout = System.currentTimeMillis() + 60000;
        do {
            Thread.sleep(20);
        } while (timeout > System.currentTimeMillis() && numCalls.get() < 5);
        assertTrue(numCalls.get() >= 5);
        coord.stopCheckpointScheduler();
        // for 400 ms, no further calls may come
        // there may be the case that one trigger was fired and about to
        // acquire the lock, such that after cancelling it will still do
        // the remainder of its work
        numCallsSoFar = numCalls.get();
        Thread.sleep(400);
        assertTrue(numCallsSoFar == numCalls.get() || numCallsSoFar + 1 == numCalls.get());
        coord.shutdown(JobStatus.FINISHED);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) IOException(java.io.IOException) Execution(org.apache.flink.runtime.executiongraph.Execution) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) InvocationOnMock(org.mockito.invocation.InvocationOnMock) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
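
The test verifies the trigger frequency only through the counter maintained by the doAnswer stub. An additional, purely illustrative assertion could check the same thing directly on the mock, using Mockito's verify with atLeast (an assumption; the original test does not include this):

    // hypothetical extra check: the mocked execution received at least five trigger calls
    verify(execution, atLeast(5)).triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));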

Example 20 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinator method triggerCheckpoint.

@VisibleForTesting
CheckpointTriggerResult triggerCheckpoint(long timestamp, CheckpointProperties props, String targetDirectory, boolean isPeriodic) {
    // Sanity check
    if (props.externalizeCheckpoint() && targetDirectory == null) {
        throw new IllegalStateException("No target directory specified to persist checkpoint to.");
    }
    // make some eager pre-checks
    synchronized (lock) {
        // abort if the coordinator has been shutdown in the meantime
        if (shutdown) {
            return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
        }
        // Don't allow periodic checkpoint if scheduling has been disabled
        if (isPeriodic && !periodicScheduling) {
            return new CheckpointTriggerResult(CheckpointDeclineReason.PERIODIC_SCHEDULER_SHUTDOWN);
        }
        // these checks are not relevant for savepoints
        if (!props.forceCheckpoint()) {
            // sanity check: there should never be more than one trigger request queued
            if (triggerRequestQueued) {
                LOG.warn("Trying to trigger another checkpoint while one was queued already");
                return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
            }
            // if too many checkpoints are currently in progress, we need to mark that a request is queued
            if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
                triggerRequestQueued = true;
                if (currentPeriodicTrigger != null) {
                    currentPeriodicTrigger.cancel(false);
                    currentPeriodicTrigger = null;
                }
                return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
            }
            // make sure the minimum interval between checkpoints has passed
            final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
            final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
            if (durationTillNextMillis > 0) {
                if (currentPeriodicTrigger != null) {
                    currentPeriodicTrigger.cancel(false);
                    currentPeriodicTrigger = null;
                }
                // Reassign the new trigger to the currentPeriodicTrigger
                currentPeriodicTrigger = timer.scheduleAtFixedRate(new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
                return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
            }
        }
    }
    // check if all tasks that we need to trigger are running.
    // if not, abort the checkpoint
    Execution[] executions = new Execution[tasksToTrigger.length];
    for (int i = 0; i < tasksToTrigger.length; i++) {
        Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt();
        if (ee != null && ee.getState() == ExecutionState.RUNNING) {
            executions[i] = ee;
        } else {
            LOG.info("Checkpoint triggering task {} is not being executed at the moment. Aborting checkpoint.", tasksToTrigger[i].getSimpleName());
            return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
        }
    }
    // next, check if all tasks that need to acknowledge the checkpoint are running.
    // if not, abort the checkpoint
    Map<ExecutionAttemptID, ExecutionVertex> ackTasks = new HashMap<>(tasksToWaitFor.length);
    for (ExecutionVertex ev : tasksToWaitFor) {
        Execution ee = ev.getCurrentExecutionAttempt();
        if (ee != null) {
            ackTasks.put(ee.getAttemptId(), ev);
        } else {
            LOG.info("Checkpoint acknowledging task {} is not being executed at the moment. Aborting checkpoint.", ev.getSimpleName());
            return new CheckpointTriggerResult(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
        }
    }
    // we will actually trigger this checkpoint. The trigger is guarded by a separate lock (not the
    // coordinator-wide lock) because the checkpoint ID counter below may block (e.g. in HA mode);
    // this way we avoid blocking the processing of 'acknowledge/decline' messages during that time.
    synchronized (triggerLock) {
        final long checkpointID;
        try {
            // this must happen outside the coordinator-wide lock, because it communicates
            // with external services (in HA mode) and may block for a while.
            checkpointID = checkpointIdCounter.getAndIncrement();
        } catch (Throwable t) {
            int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
            LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
            return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
        }
        final PendingCheckpoint checkpoint = new PendingCheckpoint(job, checkpointID, timestamp, ackTasks, props, targetDirectory, executor);
        if (statsTracker != null) {
            PendingCheckpointStats callback = statsTracker.reportPendingCheckpoint(checkpointID, timestamp, props);
            checkpoint.setStatsCallback(callback);
        }
        // schedule the timer that will clean up the expired checkpoints
        final Runnable canceller = new Runnable() {

            @Override
            public void run() {
                synchronized (lock) {
                    // note that checkpoint completion discards the pending checkpoint object
                    if (!checkpoint.isDiscarded()) {
                        LOG.info("Checkpoint " + checkpointID + " expired before completing.");
                        checkpoint.abortExpired();
                        pendingCheckpoints.remove(checkpointID);
                        rememberRecentCheckpointId(checkpointID);
                        triggerQueuedRequests();
                    }
                }
            }
        };
        try {
            // re-acquire the coordinator-wide lock
            synchronized (lock) {
                // since we released the coordinator-wide lock in the meantime, we need to re-check
                // that the conditions still hold.
                if (shutdown) {
                    return new CheckpointTriggerResult(CheckpointDeclineReason.COORDINATOR_SHUTDOWN);
                } else if (!props.forceCheckpoint()) {
                    if (triggerRequestQueued) {
                        LOG.warn("Trying to trigger another checkpoint while one was queued already");
                        return new CheckpointTriggerResult(CheckpointDeclineReason.ALREADY_QUEUED);
                    }
                    if (pendingCheckpoints.size() >= maxConcurrentCheckpointAttempts) {
                        triggerRequestQueued = true;
                        if (currentPeriodicTrigger != null) {
                            currentPeriodicTrigger.cancel(false);
                            currentPeriodicTrigger = null;
                        }
                        return new CheckpointTriggerResult(CheckpointDeclineReason.TOO_MANY_CONCURRENT_CHECKPOINTS);
                    }
                    // make sure the minimum interval between checkpoints has passed
                    final long earliestNext = lastCheckpointCompletionNanos + minPauseBetweenCheckpointsNanos;
                    final long durationTillNextMillis = (earliestNext - System.nanoTime()) / 1_000_000;
                    if (durationTillNextMillis > 0) {
                        if (currentPeriodicTrigger != null) {
                            currentPeriodicTrigger.cancel(false);
                            currentPeriodicTrigger = null;
                        }
                        // Reassign the new trigger to the currentPeriodicTrigger
                        currentPeriodicTrigger = timer.scheduleAtFixedRate(new ScheduledTrigger(), durationTillNextMillis, baseInterval, TimeUnit.MILLISECONDS);
                        return new CheckpointTriggerResult(CheckpointDeclineReason.MINIMUM_TIME_BETWEEN_CHECKPOINTS);
                    }
                }
                LOG.info("Triggering checkpoint " + checkpointID + " @ " + timestamp);
                pendingCheckpoints.put(checkpointID, checkpoint);
                ScheduledFuture<?> cancellerHandle = timer.schedule(canceller, checkpointTimeout, TimeUnit.MILLISECONDS);
                if (!checkpoint.setCancellerHandle(cancellerHandle)) {
                    // checkpoint is already disposed!
                    cancellerHandle.cancel(false);
                }
            }
            // end of lock scope
            CheckpointOptions checkpointOptions;
            if (!props.isSavepoint()) {
                checkpointOptions = CheckpointOptions.forFullCheckpoint();
            } else {
                checkpointOptions = CheckpointOptions.forSavepoint(targetDirectory);
            }
            // send the messages to the tasks that trigger their checkpoint
            for (Execution execution : executions) {
                execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions);
            }
            numUnsuccessfulCheckpointsTriggers.set(0);
            return new CheckpointTriggerResult(checkpoint);
        } catch (Throwable t) {
            // guard the map against concurrent modifications
            synchronized (lock) {
                pendingCheckpoints.remove(checkpointID);
            }
            int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
            LOG.warn("Failed to trigger checkpoint (" + numUnsuccessful + " consecutive failed attempts so far)", t);
            if (!checkpoint.isDiscarded()) {
                checkpoint.abortError(new Exception("Failed to trigger checkpoint"));
            }
            return new CheckpointTriggerResult(CheckpointDeclineReason.EXCEPTION);
        }
    }
// end trigger lock
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Execution(org.apache.flink.runtime.executiongraph.Execution) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting)
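
The isPeriodic flag connects this method to the ScheduledTrigger runnable that startCheckpointScheduler registers with the timer. That runnable is not shown here; a hypothetical caller-side sketch (the variable names, checkpoint properties, and logging are assumptions, not the actual Flink code) would look roughly like this:

    // hypothetical sketch of a periodic trigger invocation; a declined attempt is only logged
    CheckpointTriggerResult result = coordinator.triggerCheckpoint(
            System.currentTimeMillis(), CheckpointProperties.forStandardCheckpoint(), null, true);
    if (result.isFailure()) {
        LOG.info("Periodic checkpoint was declined: {}", result.getFailureReason());
    }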

Aggregations

ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID) 233
Test (org.junit.Test) 176
JobID (org.apache.flink.api.common.JobID) 111
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID) 92
Configuration (org.apache.flink.configuration.Configuration) 56
ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex) 56
IOException (java.io.IOException) 51
CompletableFuture (java.util.concurrent.CompletableFuture) 43
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig) 38
TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) 38
OperatorID (org.apache.flink.runtime.jobgraph.OperatorID) 36
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph) 35
AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) 35
ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID) 34
ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph) 34
ExecutionException (java.util.concurrent.ExecutionException) 29
ArrayList (java.util.ArrayList) 27
AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID) 27
IntermediateDataSetID (org.apache.flink.runtime.jobgraph.IntermediateDataSetID) 27
ExecutionState (org.apache.flink.runtime.execution.ExecutionState) 26