Search in sources :

Example 41 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testSavepointsAreNotSubsumed.

/**
 * Verifies that a pending savepoint is not subsumed when a later checkpoint completes, and that
 * completing a savepoint does not subsume pending checkpoints.
 *
 * <p>Sequence exercised by this test:
 *
 * <ol>
 *   <li>Trigger savepoint 1, checkpoint 1 and checkpoint 2 (three pending).
 *   <li>Acknowledge checkpoint 2: checkpoint 1 is subsumed, savepoint 1 stays pending.
 *   <li>Trigger checkpoint 3 and savepoint 2 (three pending again).
 *   <li>Acknowledge savepoint 2: only the savepoint itself leaves the pending set; checkpoint 3
 *       and savepoint 1 remain pending.
 *   <li>Acknowledge savepoint 1: it completes, leaving checkpoint 3 pending.
 *   <li>Trigger and acknowledge checkpoint 4 and check the arguments of the resulting
 *       {@code sendAcknowledgeMessages} call.
 * </ol>
 *
 * <p>Throughout, {@code sendAcknowledgeMessages} is expected never to be invoked for a savepoint
 * id.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
    // two job vertices; the first task vertex of each is used to acknowledge checkpoints
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    // the counter is kept in a local so the id of the most recently triggered checkpoint can be
    // read back via getLast()
    StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
    // set up the coordinator as a Mockito spy so that sendAcknowledgeMessages(...) calls can be
    // verified; concurrent checkpoints are effectively unlimited and the store is created with
    // argument 1 (presumably the number of retained checkpoints -- confirm against the store)
    CheckpointCoordinator checkpointCoordinator = spy(new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setCheckpointIDCounter(counter).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(1)).setTimer(manuallyTriggeredScheduledExecutor).build());
    String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    // Trigger savepoint 1 followed by checkpoints 1 and 2; nothing is acknowledged yet, so the
    // number of pending checkpoints grows by one with every trigger
    CompletableFuture<CompletedCheckpoint> savepointFuture1 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();
    long savepointId1 = counter.getLast();
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    CompletableFuture<CompletedCheckpoint> checkpointFuture1 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture1);
    CompletableFuture<CompletedCheckpoint> checkpointFuture2 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture2);
    long checkpointId2 = counter.getLast();
    assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId2), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId2), TASK_MANAGER_LOCATION_INFO);
    // no completed checkpoint before checkpointId2, hence INVALID_CHECKPOINT_ID as last argument
    verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId2), anyLong(), eq(INVALID_CHECKPOINT_ID));
    // only savepoint 1 is still pending: checkpoint 2 completed and checkpoint 1 was subsumed
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
    assertFalse(savepointFuture1.isDone());
    // Trigger checkpoint 3 and savepoint 2 on top of the still-pending savepoint 1
    CompletableFuture<CompletedCheckpoint> checkpointFuture3 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture3);
    assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
    CompletableFuture<CompletedCheckpoint> savepointFuture2 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();
    long savepointId2 = counter.getLast();
    FutureUtils.throwIfCompletedExceptionally(savepointFuture2);
    assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // savepoints should not subsume checkpoints
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId2), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId2), TASK_MANAGER_LOCATION_INFO);
    // we do not send notify checkpoint complete for savepoints
    verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId2), anyLong(), anyLong());
    // savepoint 2 left the pending set; checkpoint 3 and savepoint 1 are both still pending and
    // the retained-checkpoint count is unchanged
    assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
    assertFalse(savepointFuture1.isDone());
    assertNotNull(savepointFuture2.get());
    // Ack first savepoint
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId1), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId1), TASK_MANAGER_LOCATION_INFO);
    // we do not send notify checkpoint complete for savepoints
    verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId1), anyLong(), anyLong());
    // savepoint 1 completed without touching the pending checkpoint 3
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertNotNull(savepointFuture1.get());
    // Trigger and acknowledge checkpoint 4
    CompletableFuture<CompletedCheckpoint> checkpointFuture4 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture4);
    long checkpointId4 = counter.getLast();
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId4), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId4), TASK_MANAGER_LOCATION_INFO);
    // checkpoint2 would be subsumed: the last argument is expected to be checkpointId2, i.e. the
    // completed savepoints in between are not reported there
    verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId4), anyLong(), eq(checkpointId2));
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 42 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testTriggeringFullCheckpoints.

/**
 * After restoring from a savepoint in {@code NO_CLAIM} mode, and even after taking another
 * savepoint, the next triggered checkpoint must be reported to the task as a
 * {@code FULL_CHECKPOINT}.
 */
@Test
public void testTriggeringFullCheckpoints() throws Exception {
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionAttemptID attempt =
            executionGraph
                    .getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    // produce a savepoint that serves as the restore source below
    final CompletedCheckpoint restoreSource = takeSavepoint(executionGraph, attempt);

    // fresh store and id counter for the coordinator that performs the restore
    final StandaloneCompletedCheckpointStore completedStore =
            new StandaloneCompletedCheckpointStore(1);
    final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
    final CheckpointCoordinator coordinator =
            createCheckpointCoordinator(executionGraph, completedStore, idCounter);

    // restore from the savepoint without claiming it
    final SavepointRestoreSettings noClaimSettings =
            SavepointRestoreSettings.forPath(
                    restoreSource.getExternalPointer(), true, RestoreMode.NO_CLAIM);
    coordinator.restoreSavepoint(
            noClaimSettings, executionGraph.getAllVertices(), getClass().getClassLoader());

    // a savepoint taken before any checkpoint has completed must not change the outcome:
    // the next checkpoint still has to be a full one
    takeSavepoint(executionGraph, attempt, coordinator, 2);
    coordinator.startCheckpointScheduler();
    // forget what the gateway recorded so far; only the upcoming trigger is of interest
    recordingGateway.resetCount();

    // trigger the checkpoint under test and complete it by acknowledging id 3
    final CompletableFuture<CompletedCheckpoint> pendingResult =
            coordinator.triggerCheckpoint(true);
    manuallyTriggeredScheduledExecutor.triggerAll();
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt, 3),
            TASK_MANAGER_LOCATION_INFO);
    pendingResult.get();

    assertThat(
            recordingGateway
                    .getOnlyTriggeredCheckpoint(attempt)
                    .checkpointOptions
                    .getCheckpointType(),
            is(CheckpointType.FULL_CHECKPOINT));
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 43 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testTriggerCheckpointSnapshotMasterHookFailed.

/**
 * Verifies that a master hook whose snapshot future completes exceptionally fails the checkpoint
 * trigger with {@code TRIGGER_CHECKPOINT_FAILURE}, that the coordinator leaves the "triggering"
 * state afterwards, and that no task is asked to checkpoint.
 */
@Test
public void testTriggerCheckpointSnapshotMasterHookFailed() throws Exception {
    JobVertexID jobVertexID = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();
    // Build the coordinator on top of the graph created above. Previously the coordinator was
    // created without this graph, so the recording gateway could never observe a task trigger
    // and the final "no checkpoint was triggered" assertion held vacuously.
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setTimer(manuallyTriggeredScheduledExecutor).build();
    // the hook's snapshot result is controlled by this future, which stays incomplete for now
    final CompletableFuture<String> masterHookCheckpointFuture = new CompletableFuture<>();
    checkpointCoordinator.addMasterHook(new TestingMasterHook(masterHookCheckpointFuture));
    checkpointCoordinator.startCheckpointScheduler();
    final CompletableFuture<CompletedCheckpoint> onCompletionPromise = triggerPeriodicCheckpoint(checkpointCoordinator);
    // checkpoint trigger will not finish since master hook checkpoint is not finished yet
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertTrue(checkpointCoordinator.isTriggering());
    // let the master hook fail; running the executor again lets the trigger proceed and fail
    masterHookCheckpointFuture.completeExceptionally(new Exception("by design"));
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(checkpointCoordinator.isTriggering());
    try {
        onCompletionPromise.get();
        fail("Should not reach here");
    } catch (ExecutionException e) {
        // the failure must surface as a CheckpointException carrying the trigger-failure reason
        final Optional<CheckpointException> checkpointExceptionOptional = ExceptionUtils.findThrowable(e, CheckpointException.class);
        assertTrue(checkpointExceptionOptional.isPresent());
        assertEquals(CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE, checkpointExceptionOptional.get().getCheckpointFailureReason());
    }
    // it doesn't really trigger task manager to do checkpoint
    assertEquals(0, gateway.getTriggeredCheckpoints(attemptID).size());
    assertEquals(0, checkpointCoordinator.getTriggerRequestQueue().size());
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) Optional(java.util.Optional) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) ExecutionException(java.util.concurrent.ExecutionException) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 44 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testTriggerCheckpointRequestQueuedWithFailure.

/**
 * Issues three non-periodic trigger requests back to back, so that the second and third are
 * queued behind the first. The first one is expected to fail, after which the two queued
 * requests must still be processed and reach the task.
 */
@Test
public void testTriggerCheckpointRequestQueuedWithFailure() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionAttemptID attempt =
            executionGraph
                    .getJobVertex(vertexId)
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getAttemptId();

    // coordinator whose id counter is configured with the predicate "id == 0"
    // (presumably the condition under which the counter fails -- see UnstableCheckpointIDCounter)
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointIDCounter(new UnstableCheckpointIDCounter(id -> id == 0))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    coordinator.startCheckpointScheduler();

    // first (non-periodic) request: it is picked up right away, so nothing is queued
    final CompletableFuture<CompletedCheckpoint> firstResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());

    // two more requests arrive while the first is still in flight; both are queued
    final CompletableFuture<CompletedCheckpoint> secondResult =
            triggerNonPeriodicCheckpoint(coordinator);
    final CompletableFuture<CompletedCheckpoint> thirdResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(2, coordinator.getTriggerRequestQueue().size());

    // run everything that has been scheduled so far
    manuallyTriggeredScheduledExecutor.triggerAll();

    // the first triggered checkpoint fails by design through UnstableCheckpointIDCounter,
    // the queued ones do not
    assertTrue(firstResult.isCompletedExceptionally());
    assertFalse(secondResult.isCompletedExceptionally());
    assertFalse(thirdResult.isCompletedExceptionally());

    // the queue has been drained and exactly the two follow-up requests reached the task
    assertFalse(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());
    assertEquals(2, recordingGateway.getTriggeredCheckpoints(attempt).size());
}
Also used : OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ScheduledExecutorServiceAdapter(org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter) ExceptionUtils(org.apache.flink.util.ExceptionUtils) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) HashSet(java.util.HashSet) CoreMatchers.instanceOf(org.hamcrest.CoreMatchers.instanceOf) Assert.assertThat(org.junit.Assert.assertThat) RestoreMode(org.apache.flink.runtime.jobgraph.RestoreMode) TestLogger(org.apache.flink.util.TestLogger) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Assert.fail(org.junit.Assert.fail) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) Nullable(javax.annotation.Nullable) Before(org.junit.Before) Executor(java.util.concurrent.Executor) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) Predicate(java.util.function.Predicate) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) Executors(java.util.concurrent.Executors) ExecutorUtils(org.apache.flink.util.ExecutorUtils) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) 
ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) List(java.util.List) Rule(org.junit.Rule) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) Assert.assertFalse(org.junit.Assert.assertFalse) Optional(java.util.Optional) Matchers.is(org.hamcrest.Matchers.is) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) TemporaryFolder(org.junit.rules.TemporaryFolder) Assert.assertEquals(org.junit.Assert.assertEquals) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 45 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class FileCache method createTmpFile.

// ------------------------------------------------------------------------
/**
 * Returns the future of the local copy of a cached file. If no copy has been registered for the
 * given job and name yet, a copy task is created, submitted to the executor and registered.
 *
 * <p>The calling execution attempt is recorded as a reference holder for the job in either
 * case. The whole operation runs while holding {@code lock}.
 *
 * @param name The key under which the file is tracked among the job's entries.
 * @param entry The cache entry descriptor; a non-null {@code blobKey} selects the blob-based
 *     copy process, otherwise the DFS-based copy process is used.
 * @param jobID The ID of the job for which the file is copied.
 * @param executionId The execution attempt that is registered as a reference holder.
 * @return The handle to the task that copies the file; the already registered handle if one
 *     exists for {@code name}.
 */
public Future<Path> createTmpFile(String name, DistributedCacheEntry entry, JobID jobID, ExecutionAttemptID executionId) throws Exception {
    synchronized (lock) {
        final Map<String, Future<Path>> filesOfJob = entries.computeIfAbsent(jobID, k -> new HashMap<>());

        // remember the caller as a reference holder, whether or not a copy already exists
        jobRefHolders.computeIfAbsent(jobID, id -> new HashSet<>()).add(executionId);

        final Future<Path> alreadyRegistered = filesOfJob.get(name);
        if (alreadyRegistered != null) {
            // a copy was requested before; hand out the same future
            return alreadyRegistered;
        }

        // pick the next storage directory in round-robin fashion, wrapping at the end
        final File targetDir = new File(storageDirectories[nextDirectory++], jobID.toString());
        if (nextDirectory >= storageDirectories.length) {
            nextDirectory = 0;
        }

        // choose the copy process depending on whether the entry carries a blob key
        final boolean fromBlob = entry.blobKey != null;
        final Path targetPath = new Path(targetDir.getAbsolutePath());
        final Callable<Path> copyProcess =
                fromBlob
                        ? new CopyFromBlobProcess(entry, jobID, blobService, targetPath)
                        : new CopyFromDFSProcess(entry, targetPath);

        // start copying and register the task so later callers share it
        final FutureTask<Path> copyTask = new FutureTask<>(copyProcess);
        executorService.submit(copyTask);
        filesOfJob.put(name, copyTask);
        return copyTask;
    }
}
Also used : Path(org.apache.flink.core.fs.Path) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) FutureTask(java.util.concurrent.FutureTask) Future(java.util.concurrent.Future) File(java.io.File)

Aggregations

ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)233 Test (org.junit.Test)176 JobID (org.apache.flink.api.common.JobID)111 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)92 Configuration (org.apache.flink.configuration.Configuration)56 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)56 IOException (java.io.IOException)51 CompletableFuture (java.util.concurrent.CompletableFuture)43 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)38 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)38 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)36 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)35 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)35 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)34 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)34 ExecutionException (java.util.concurrent.ExecutionException)29 ArrayList (java.util.ArrayList)27 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)27 IntermediateDataSetID (org.apache.flink.runtime.jobgraph.IntermediateDataSetID)27 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)26