
Example 1 with ExecutionJobVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in the Apache Flink project.

From the class BackPressureStatsTrackerITCase, method testBackPressuredProducer.

/**
	 * Tests a simple task with simulated back pressure. Back pressure is assumed
	 * when the sampled stack traces are blocked in buffer requests.
	 */
@Test
public void testBackPressuredProducer() throws Exception {
    new JavaTestKit(testActorSystem) {

        {
            final FiniteDuration deadline = new FiniteDuration(60, TimeUnit.SECONDS);
            // The JobGraph
            final JobGraph jobGraph = new JobGraph();
            final int parallelism = 4;
            final JobVertex task = new JobVertex("Task");
            task.setInvokableClass(BackPressuredTask.class);
            task.setParallelism(parallelism);
            jobGraph.addVertex(task);
            ActorGateway jobManager = null;
            ActorGateway taskManager = null;
            //
            // 1) Consume all buffers at first (no buffers for the test task)
            //
            testBufferPool = networkBufferPool.createBufferPool(1, Integer.MAX_VALUE);
            final List<Buffer> buffers = new ArrayList<>();
            while (true) {
                Buffer buffer = testBufferPool.requestBuffer();
                if (buffer != null) {
                    buffers.add(buffer);
                } else {
                    break;
                }
            }
            try {
                jobManager = TestingUtils.createJobManager(testActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new Configuration());
                final Configuration config = new Configuration();
                config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, parallelism);
                taskManager = TestingUtils.createTaskManager(testActorSystem, jobManager, config, true, true);
                final ActorGateway jm = jobManager;
                new Within(deadline) {

                    @Override
                    protected void run() {
                        try {
                            ActorGateway testActor = new AkkaActorGateway(getTestActor(), null);
                            // Submit the job and wait until it is running
                            JobClient.submitJobDetached(jm, config, jobGraph, deadline, ClassLoader.getSystemClassLoader());
                            jm.tell(new WaitForAllVerticesToBeRunning(jobGraph.getJobID()), testActor);
                            expectMsgEquals(new AllVerticesRunning(jobGraph.getJobID()));
                            // Get the ExecutionGraph
                            jm.tell(new RequestExecutionGraph(jobGraph.getJobID()), testActor);
                            ExecutionGraphFound executionGraphResponse = expectMsgClass(ExecutionGraphFound.class);
                            ExecutionGraph executionGraph = (ExecutionGraph) executionGraphResponse.executionGraph();
                            ExecutionJobVertex vertex = executionGraph.getJobVertex(task.getID());
                            StackTraceSampleCoordinator coordinator = new StackTraceSampleCoordinator(testActorSystem.dispatcher(), 60000);
                            // Verify back pressure (clean up interval can be ignored)
                            BackPressureStatsTracker statsTracker = new BackPressureStatsTracker(coordinator, 100 * 1000, 20, Time.milliseconds(10L));
                            int numAttempts = 10;
                            int nextSampleId = 0;
                            // Verify that all tasks are back pressured. Retry a few
                            // times, because the tasks may not yet be blocked on
                            // the buffer request.
                            for (int attempt = 0; attempt < numAttempts; attempt++) {
                                try {
                                    OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
                                    assertEquals(nextSampleId + attempt, stats.getSampleId());
                                    assertEquals(parallelism, stats.getNumberOfSubTasks());
                                    assertEquals(1.0, stats.getMaxBackPressureRatio(), 0.0);
                                    for (int i = 0; i < parallelism; i++) {
                                        assertEquals(1.0, stats.getBackPressureRatio(i), 0.0);
                                    }
                                    nextSampleId = stats.getSampleId() + 1;
                                    break;
                                } catch (Throwable t) {
                                    if (attempt == numAttempts - 1) {
                                        throw t;
                                    } else {
                                        Thread.sleep(500);
                                    }
                                }
                            }
                            //
                            // 2) Release all buffers (the tasks are no longer back pressured)
                            //
                            for (Buffer buf : buffers) {
                                buf.recycle();
                            }
                            // Wait for the buffers to become available again. The tasks
                            // grab them and then immediately release them.
                            while (testBufferPool.getNumberOfAvailableMemorySegments() < 100) {
                                Thread.sleep(100);
                            }
                            // Verify that no task is back pressured any more.
                            for (int attempt = 0; attempt < numAttempts; attempt++) {
                                try {
                                    OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
                                    assertEquals(nextSampleId + attempt, stats.getSampleId());
                                    assertEquals(parallelism, stats.getNumberOfSubTasks());
                                    // Verify that no task is back pressured
                                    for (int i = 0; i < parallelism; i++) {
                                        assertEquals(0.0, stats.getBackPressureRatio(i), 0.0);
                                    }
                                    break;
                                } catch (Throwable t) {
                                    if (attempt == numAttempts - 1) {
                                        throw t;
                                    } else {
                                        Thread.sleep(500);
                                    }
                                }
                            }
                            // Shut down
                            jm.tell(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), testActor);
                            // Cancel job
                            jm.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
                            // Response to removal notification
                            expectMsgEquals(true);
                            //
                            // 3) Trigger stats for archived job
                            //
                            statsTracker.invalidateOperatorStatsCache();
                            assertFalse("Unexpected trigger", statsTracker.triggerStackTraceSample(vertex));
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                TestingUtils.stopActor(jobManager);
                TestingUtils.stopActor(taskManager);
                for (Buffer buf : buffers) {
                    buf.recycle();
                }
                testBufferPool.lazyDestroy();
            }
        }
    };
}
Also used: AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ArrayList(java.util.ArrayList) AllVerticesRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.AllVerticesRunning) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) RequestExecutionGraph(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestExecutionGraph) ExecutionGraphFound(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ExecutionGraphFound) Buffer(org.apache.flink.runtime.io.network.buffer.Buffer) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)
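
The triggerStatsSample(...) helper called in the loops above is defined elsewhere in BackPressureStatsTrackerITCase and is not shown on this page. As a minimal sketch, assuming only the tracker API used above, it could invalidate the cache, trigger a sample, and poll until stats become available:

// Minimal sketch of the helper referenced above; the real helper lives in
// BackPressureStatsTrackerITCase and may differ in its details.
private OperatorBackPressureStats triggerStatsSample(
        BackPressureStatsTracker statsTracker,
        ExecutionJobVertex vertex) throws InterruptedException {

    statsTracker.invalidateOperatorStatsCache();
    assertTrue("Failed to trigger", statsTracker.triggerStackTraceSample(vertex));

    // Sampling is asynchronous: poll until the tracker has stats for this vertex.
    Option<OperatorBackPressureStats> stats = statsTracker.getOperatorBackPressureStats(vertex);
    while (stats.isEmpty()) {
        Thread.sleep(100);
        stats = statsTracker.getOperatorBackPressureStats(vertex);
    }
    return stats.get();
}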

Example 2 with ExecutionJobVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in the Apache Flink project.

From the class JobVertexBackPressureHandler, method handleRequest.

@Override
public String handleRequest(AccessExecutionJobVertex accessJobVertex, Map<String, String> params) throws Exception {
    if (accessJobVertex instanceof ArchivedExecutionJobVertex) {
        return "";
    }
    ExecutionJobVertex jobVertex = (ExecutionJobVertex) accessJobVertex;
    try (StringWriter writer = new StringWriter();
        JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer)) {
        gen.writeStartObject();
        Option<OperatorBackPressureStats> statsOption = backPressureStatsTracker.getOperatorBackPressureStats(jobVertex);
        if (statsOption.isDefined()) {
            OperatorBackPressureStats stats = statsOption.get();
            // Check whether we need to refresh
            if (refreshInterval <= System.currentTimeMillis() - stats.getEndTimestamp()) {
                backPressureStatsTracker.triggerStackTraceSample(jobVertex);
                gen.writeStringField("status", "deprecated");
            } else {
                gen.writeStringField("status", "ok");
            }
            gen.writeStringField("backpressure-level", getBackPressureLevel(stats.getMaxBackPressureRatio()));
            gen.writeNumberField("end-timestamp", stats.getEndTimestamp());
            // Sub tasks
            gen.writeArrayFieldStart("subtasks");
            int numSubTasks = stats.getNumberOfSubTasks();
            for (int i = 0; i < numSubTasks; i++) {
                double ratio = stats.getBackPressureRatio(i);
                gen.writeStartObject();
                gen.writeNumberField("subtask", i);
                gen.writeStringField("backpressure-level", getBackPressureLevel(ratio));
                gen.writeNumberField("ratio", ratio);
                gen.writeEndObject();
            }
            gen.writeEndArray();
        } else {
            backPressureStatsTracker.triggerStackTraceSample(jobVertex);
            gen.writeStringField("status", "deprecated");
        }
        gen.writeEndObject();
        gen.close();
        return writer.toString();
    }
}
Also used: ArchivedExecutionJobVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionJobVertex) StringWriter(java.io.StringWriter) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) AccessExecutionJobVertex(org.apache.flink.runtime.executiongraph.AccessExecutionJobVertex) OperatorBackPressureStats(org.apache.flink.runtime.webmonitor.OperatorBackPressureStats) JsonGenerator(com.fasterxml.jackson.core.JsonGenerator)
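
The getBackPressureLevel(...) helper used above maps a back pressure ratio to a level string for the web frontend. Its thresholds are not shown on this page; the following sketch uses assumed cut-off values, chosen to be consistent with Example 4 below, where a maximum ratio of 1.0 maps to "high":

// Sketch of the ratio-to-level mapping. The 0.10 and 0.5 thresholds are
// assumptions, not taken from this page.
static String getBackPressureLevel(double backPressureRatio) {
    if (backPressureRatio <= 0.10) {
        return "ok";      // little or no back pressure
    } else if (backPressureRatio <= 0.5) {
        return "low";     // some samples blocked on buffer requests
    } else {
        return "high";    // most samples blocked on buffer requests
    }
}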

Example 3 with ExecutionJobVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in the Apache Flink project.

From the class SavepointLoader, method loadAndValidateSavepoint.

/**
	 * Loads a savepoint back as a {@link CompletedCheckpoint}.
	 *
	 * <p>This method verifies that tasks and parallelism still match the savepoint parameters.
	 *
	 * @param jobId          The JobID of the job to load the savepoint for.
	 * @param tasks          Tasks that will possibly be reset
	 * @param savepointPath  The path of the savepoint to rollback to
	 * @param classLoader    The class loader to resolve serialized classes in legacy savepoint versions.
	 * @param allowNonRestoredState Allow to skip checkpoint state that cannot be mapped
	 * to any job vertex in tasks.
	 *
	 * @throws IllegalStateException If mismatch between program and savepoint state
	 * @throws IOException             If savepoint store failure
	 */
public static CompletedCheckpoint loadAndValidateSavepoint(JobID jobId, Map<JobVertexID, ExecutionJobVertex> tasks, String savepointPath, ClassLoader classLoader, boolean allowNonRestoredState) throws IOException {
    // (1) load the savepoint
    final Tuple2<Savepoint, StreamStateHandle> savepointAndHandle = SavepointStore.loadSavepointWithHandle(savepointPath, classLoader);
    final Savepoint savepoint = savepointAndHandle.f0;
    final StreamStateHandle metadataHandle = savepointAndHandle.f1;
    final Map<JobVertexID, TaskState> taskStates = new HashMap<>(savepoint.getTaskStates().size());
    boolean expandedToLegacyIds = false;
    // (2) validate it (parallelism, etc)
    for (TaskState taskState : savepoint.getTaskStates()) {
        ExecutionJobVertex executionJobVertex = tasks.get(taskState.getJobVertexID());
        // The vertex may still be found under an alternative/legacy JobVertexID,
        // for example as generated from older Flink versions, to provide backwards compatibility.
        if (executionJobVertex == null && !expandedToLegacyIds) {
            tasks = ExecutionJobVertex.includeLegacyJobVertexIDs(tasks);
            executionJobVertex = tasks.get(taskState.getJobVertexID());
            expandedToLegacyIds = true;
            LOG.info("Could not find ExecutionJobVertex. Including legacy JobVertexIDs in search.");
        }
        if (executionJobVertex != null) {
            if (executionJobVertex.getMaxParallelism() == taskState.getMaxParallelism() || !executionJobVertex.isMaxParallelismConfigured()) {
                taskStates.put(taskState.getJobVertexID(), taskState);
            } else {
                String msg = String.format("Failed to rollback to savepoint %s. " + "Max parallelism mismatch between savepoint state and new program. " + "Cannot map operator %s with max parallelism %d to new program with " + "max parallelism %d. This indicates that the program has been changed " + "in a non-compatible way after the savepoint.", savepoint, taskState.getJobVertexID(), taskState.getMaxParallelism(), executionJobVertex.getMaxParallelism());
                throw new IllegalStateException(msg);
            }
        } else if (allowNonRestoredState) {
            LOG.info("Skipping savepoint state for operator {}.", taskState.getJobVertexID());
        } else {
            String msg = String.format("Failed to rollback to savepoint %s. " + "Cannot map savepoint state for operator %s to the new program, " + "because the operator is not available in the new program. If " + "you want to allow to skip this, you can set the --allowNonRestoredState " + "option on the CLI.", savepointPath, taskState.getJobVertexID());
            throw new IllegalStateException(msg);
        }
    }
    // (3) convert to checkpoint so the system can fall back to it
    CheckpointProperties props = CheckpointProperties.forStandardSavepoint();
    return new CompletedCheckpoint(jobId, savepoint.getCheckpointId(), 0L, 0L, taskStates, props, metadataHandle, savepointPath);
}
Also used: CompletedCheckpoint(org.apache.flink.runtime.checkpoint.CompletedCheckpoint) HashMap(java.util.HashMap) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) TaskState(org.apache.flink.runtime.checkpoint.TaskState) CheckpointProperties(org.apache.flink.runtime.checkpoint.CheckpointProperties)
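
A hypothetical call site for loadAndValidateSavepoint: when restoring a job, the caller passes the execution graph's vertices so that savepoint state can be matched against them. The surrounding names and the savepoint path below are illustrative, not taken from this page:

// Illustrative restore path; executionGraph, userClassLoader, and the savepoint
// path are assumptions for this sketch.
Map<JobVertexID, ExecutionJobVertex> tasks = executionGraph.getAllVertices();

CompletedCheckpoint restored = SavepointLoader.loadAndValidateSavepoint(
        executionGraph.getJobID(),
        tasks,
        "hdfs:///flink/savepoints/savepoint-1",
        userClassLoader,
        // allowNonRestoredState: false fails fast if savepoint state cannot be mapped
        false);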

Example 4 with ExecutionJobVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in the Apache Flink project.

From the class JobVertexBackPressureHandlerTest, method testResponseStatsAvailable.

/** Tests the response when stats are available. */
@Test
public void testResponseStatsAvailable() throws Exception {
    ExecutionJobVertex jobVertex = mock(ExecutionJobVertex.class);
    BackPressureStatsTracker statsTracker = mock(BackPressureStatsTracker.class);
    OperatorBackPressureStats stats = new OperatorBackPressureStats(0, System.currentTimeMillis(), new double[] { 0.31, 0.48, 1.0, 0.0 });
    when(statsTracker.getOperatorBackPressureStats(any(ExecutionJobVertex.class))).thenReturn(Option.apply(stats));
    JobVertexBackPressureHandler handler = new JobVertexBackPressureHandler(mock(ExecutionGraphHolder.class), statsTracker, 9999);
    String response = handler.handleRequest(jobVertex, Collections.<String, String>emptyMap());
    ObjectMapper mapper = new ObjectMapper();
    JsonNode rootNode = mapper.readTree(response);
    // Root object contains exactly four fields
    assertEquals(4, rootNode.size());
    // Status
    JsonNode status = rootNode.get("status");
    assertNotNull(status);
    assertEquals("ok", status.textValue());
    // Back pressure level
    JsonNode backPressureLevel = rootNode.get("backpressure-level");
    assertNotNull(backPressureLevel);
    assertEquals("high", backPressureLevel.textValue());
    // End time stamp
    JsonNode endTimeStamp = rootNode.get("end-timestamp");
    assertNotNull(endTimeStamp);
    assertEquals(stats.getEndTimestamp(), endTimeStamp.longValue());
    // Subtasks
    JsonNode subTasks = rootNode.get("subtasks");
    assertEquals(stats.getNumberOfSubTasks(), subTasks.size());
    for (int i = 0; i < subTasks.size(); i++) {
        JsonNode subTask = subTasks.get(i);
        JsonNode index = subTask.get("subtask");
        assertEquals(i, index.intValue());
        JsonNode level = subTask.get("backpressure-level");
        assertEquals(JobVertexBackPressureHandler.getBackPressureLevel(stats.getBackPressureRatio(i)), level.textValue());
        JsonNode ratio = subTask.get("ratio");
        assertEquals(stats.getBackPressureRatio(i), ratio.doubleValue(), 0.0);
    }
    // Verify not triggered
    verify(statsTracker, never()).triggerStackTraceSample(any(ExecutionJobVertex.class));
}
Also used: ExecutionGraphHolder(org.apache.flink.runtime.webmonitor.ExecutionGraphHolder) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) OperatorBackPressureStats(org.apache.flink.runtime.webmonitor.OperatorBackPressureStats) JsonNode(com.fasterxml.jackson.databind.JsonNode) BackPressureStatsTracker(org.apache.flink.runtime.webmonitor.BackPressureStatsTracker) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Test(org.junit.Test)
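
The handler's other branch, where no stats are available yet, can be exercised with a companion test. This is a sketch only; the method name and details are assumed, not taken from this page:

@Test
public void testResponseStatsUnavailableSketch() throws Exception {
    ExecutionJobVertex jobVertex = mock(ExecutionJobVertex.class);
    BackPressureStatsTracker statsTracker = mock(BackPressureStatsTracker.class);

    // No stats are cached for the vertex yet.
    when(statsTracker.getOperatorBackPressureStats(any(ExecutionJobVertex.class)))
        .thenReturn(Option.<OperatorBackPressureStats>empty());

    JobVertexBackPressureHandler handler = new JobVertexBackPressureHandler(
        mock(ExecutionGraphHolder.class), statsTracker, 9999);

    String response = handler.handleRequest(jobVertex, Collections.<String, String>emptyMap());
    JsonNode rootNode = new ObjectMapper().readTree(response);

    // Per Example 2, the handler reports "deprecated" and triggers a fresh sample.
    assertEquals("deprecated", rootNode.get("status").textValue());
    verify(statsTracker).triggerStackTraceSample(any(ExecutionJobVertex.class));
}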

Example 5 with ExecutionJobVertex

Use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in the Apache Flink project.

From the class CheckpointCoordinatorExternalizedCheckpointsTest, method testTriggerAndConfirmSimpleExternalizedCheckpoint.

/**
	 * Triggers multiple externalized checkpoints and verifies that the metadata
	 * files have been created.
	 */
@Test
public void testTriggerAndConfirmSimpleExternalizedCheckpoint() throws Exception {
    final JobID jid = new JobID();
    final ExternalizedCheckpointSettings externalizedCheckpointSettings = ExternalizedCheckpointSettings.externalizeCheckpoints(false);
    final File checkpointDir = tmp.newFolder();
    // create some mock Execution vertices that receive the checkpoint trigger messages
    final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
    final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
    ExecutionVertex vertex1 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID1);
    ExecutionVertex vertex2 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID2);
    Map<JobVertexID, ExecutionJobVertex> jobVertices = new HashMap<>();
    jobVertices.put(vertex1.getJobvertexId(), vertex1.getJobVertex());
    jobVertices.put(vertex2.getJobvertexId(), vertex2.getJobVertex());
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, externalizedCheckpointSettings, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(1), checkpointDir.getAbsolutePath(), Executors.directExecutor());
    assertEquals(0, coord.getNumberOfPendingCheckpoints());
    assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
    // ---------------
    // trigger checkpoint 1
    // ---------------
    {
        final long timestamp1 = System.currentTimeMillis();
        coord.triggerCheckpoint(timestamp1, false);
        long checkpointId1 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId1));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId1));
        CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
        verifyExternalizedCheckpoint(latest, jid, checkpointId1, timestamp1);
        verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
    }
    // ---------------
    // trigger checkpoint 2
    // ---------------
    {
        final long timestamp2 = System.currentTimeMillis() + 7;
        coord.triggerCheckpoint(timestamp2, false);
        long checkpointId2 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));
        CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
        verifyExternalizedCheckpoint(latest, jid, checkpointId2, timestamp2);
        verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
    }
    // ---------------
    // trigger checkpoint 3
    // ---------------
    {
        final long timestamp3 = System.currentTimeMillis() + 146;
        coord.triggerCheckpoint(timestamp3, false);
        long checkpointId3 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId3));
        coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId3));
        CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
        verifyExternalizedCheckpoint(latest, jid, checkpointId3, timestamp3);
        verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
    }
    coord.shutdown(JobStatus.FINISHED);
}
Also used: ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) HashMap(java.util.HashMap) ExternalizedCheckpointSettings(org.apache.flink.runtime.jobgraph.tasks.ExternalizedCheckpointSettings) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) File(java.io.File) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
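
The three checkpoint blocks above are identical except for the timestamp. A helper like the following (illustrative, not part of the original test) would remove the duplication:

// Illustrative refactoring: one trigger/acknowledge/verify round per call.
private void triggerAckAndVerify(
        CheckpointCoordinator coord,
        JobID jid,
        long timestamp,
        ExecutionAttemptID attemptID1,
        ExecutionAttemptID attemptID2,
        Map<JobVertexID, ExecutionJobVertex> jobVertices,
        ExecutionVertex vertex1,
        ExecutionVertex vertex2) throws Exception {

    coord.triggerCheckpoint(timestamp, false);
    long checkpointId = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();

    // Both mocked subtasks acknowledge, which completes the checkpoint.
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId));
    coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId));

    CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
    verifyExternalizedCheckpoint(latest, jid, checkpointId, timestamp);
    verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
}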

Aggregations

ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex): 37
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 25
Test (org.junit.Test): 25
JobID (org.apache.flink.api.common.JobID): 17
HashMap (java.util.HashMap): 12
ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 12
KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange): 9
AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint): 8
UnregisteredMetricsGroup (org.apache.flink.metrics.groups.UnregisteredMetricsGroup): 7
StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle): 7
ArrayList (java.util.ArrayList): 6
JobSnapshottingSettings (org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings): 6
DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint): 6
ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph): 5
KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle): 5
Configuration (org.apache.flink.configuration.Configuration): 4
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 4
ByteStreamStateHandle (org.apache.flink.runtime.state.memory.ByteStreamStateHandle): 4
JsonNode (com.fasterxml.jackson.databind.JsonNode): 3
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 3