use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in project flink by apache.
the class BackPressureStatsTrackerITCase method testBackPressuredProducer.
/**
* Tests a task with simulated back pressure. A task is considered back pressured
* when its sampled stack traces show it blocked in buffer requests.
*/
@Test
public void testBackPressuredProducer() throws Exception {
new JavaTestKit(testActorSystem) {
{
final FiniteDuration deadline = new FiniteDuration(60, TimeUnit.SECONDS);
// The JobGraph
final JobGraph jobGraph = new JobGraph();
final int parallelism = 4;
final JobVertex task = new JobVertex("Task");
task.setInvokableClass(BackPressuredTask.class);
task.setParallelism(parallelism);
jobGraph.addVertex(task);
ActorGateway jobManager = null;
ActorGateway taskManager = null;
//
// 1) Consume all buffers at first (no buffers for the test task)
//
testBufferPool = networkBufferPool.createBufferPool(1, Integer.MAX_VALUE);
final List<Buffer> buffers = new ArrayList<>();
while (true) {
Buffer buffer = testBufferPool.requestBuffer();
if (buffer != null) {
buffers.add(buffer);
} else {
break;
}
}
try {
jobManager = TestingUtils.createJobManager(testActorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new Configuration());
final Configuration config = new Configuration();
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, parallelism);
taskManager = TestingUtils.createTaskManager(testActorSystem, jobManager, config, true, true);
final ActorGateway jm = jobManager;
new Within(deadline) {
@Override
protected void run() {
try {
ActorGateway testActor = new AkkaActorGateway(getTestActor(), null);
// Submit the job and wait until it is running
JobClient.submitJobDetached(jm, config, jobGraph, deadline, ClassLoader.getSystemClassLoader());
jm.tell(new WaitForAllVerticesToBeRunning(jobGraph.getJobID()), testActor);
expectMsgEquals(new AllVerticesRunning(jobGraph.getJobID()));
// Get the ExecutionGraph
jm.tell(new RequestExecutionGraph(jobGraph.getJobID()), testActor);
ExecutionGraphFound executionGraphResponse = expectMsgClass(ExecutionGraphFound.class);
ExecutionGraph executionGraph = (ExecutionGraph) executionGraphResponse.executionGraph();
ExecutionJobVertex vertex = executionGraph.getJobVertex(task.getID());
StackTraceSampleCoordinator coordinator = new StackTraceSampleCoordinator(testActorSystem.dispatcher(), 60000);
// Verify back pressure (clean up interval can be ignored)
BackPressureStatsTracker statsTracker = new BackPressureStatsTracker(coordinator, 100 * 1000, 20, Time.milliseconds(10L));
int numAttempts = 10;
int nextSampleId = 0;
// Verify that all tasks are back pressured, i.e. blocked while requesting a buffer from the exhausted buffer pool.
for (int attempt = 0; attempt < numAttempts; attempt++) {
try {
OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
assertEquals(nextSampleId + attempt, stats.getSampleId());
assertEquals(parallelism, stats.getNumberOfSubTasks());
assertEquals(1.0, stats.getMaxBackPressureRatio(), 0.0);
for (int i = 0; i < parallelism; i++) {
assertEquals(1.0, stats.getBackPressureRatio(i), 0.0);
}
nextSampleId = stats.getSampleId() + 1;
break;
} catch (Throwable t) {
if (attempt == numAttempts - 1) {
throw t;
} else {
Thread.sleep(500);
}
}
}
//
// 2) Release all buffers so that the tasks are no longer back pressured
//
for (Buffer buf : buffers) {
buf.recycle();
}
// Wait until all buffers are available again. The tasks may briefly
// grab them and then immediately release them.
while (testBufferPool.getNumberOfAvailableMemorySegments() < 100) {
Thread.sleep(100);
}
// Verify that no task is back pressured any more.
for (int attempt = 0; attempt < numAttempts; attempt++) {
try {
OperatorBackPressureStats stats = triggerStatsSample(statsTracker, vertex);
assertEquals(nextSampleId + attempt, stats.getSampleId());
assertEquals(parallelism, stats.getNumberOfSubTasks());
// Verify that no task is back pressured
for (int i = 0; i < parallelism; i++) {
assertEquals(0.0, stats.getBackPressureRatio(i), 0.0);
}
break;
} catch (Throwable t) {
if (attempt == numAttempts - 1) {
throw t;
} else {
Thread.sleep(500);
}
}
}
// Shut down
jm.tell(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), testActor);
// Cancel job
jm.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));
// Response to removal notification
expectMsgEquals(true);
//
// 3) Trigger stats for archived job
//
statsTracker.invalidateOperatorStatsCache();
assertFalse("Unexpected trigger", statsTracker.triggerStackTraceSample(vertex));
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
};
} finally {
TestingUtils.stopActor(jobManager);
TestingUtils.stopActor(taskManager);
for (Buffer buf : buffers) {
buf.recycle();
}
testBufferPool.lazyDestroy();
}
}
};
}
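The triggerStatsSample helper called in the test above is not part of this excerpt. A minimal sketch of what such a helper might do, assuming it simply triggers a stack trace sample on the tracker and then polls until stats for the vertex are cached (the method name, the polling interval and the use of scala.Option mirror the snippets here but are otherwise assumptions):
// Hypothetical helper assumed by the test above: trigger a sample and poll the tracker
// until operator back pressure stats become available for the vertex.
private OperatorBackPressureStats triggerStatsSample(BackPressureStatsTracker statsTracker, ExecutionJobVertex vertex) throws InterruptedException {
    // Invalidate any cached stats so that a fresh sample is triggered.
    statsTracker.invalidateOperatorStatsCache();
    statsTracker.triggerStackTraceSample(vertex);
    // Poll until the StackTraceSampleCoordinator has collected the sample.
    Option<OperatorBackPressureStats> stats = statsTracker.getOperatorBackPressureStats(vertex);
    while (stats.isEmpty()) {
        Thread.sleep(100);
        stats = statsTracker.getOperatorBackPressureStats(vertex);
    }
    return stats.get();
}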
use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in project flink by apache.
the class JobVertexBackPressureHandler method handleRequest.
@Override
public String handleRequest(AccessExecutionJobVertex accessJobVertex, Map<String, String> params) throws Exception {
if (accessJobVertex instanceof ArchivedExecutionJobVertex) {
return "";
}
ExecutionJobVertex jobVertex = (ExecutionJobVertex) accessJobVertex;
try (StringWriter writer = new StringWriter();
JsonGenerator gen = JsonFactory.jacksonFactory.createGenerator(writer)) {
gen.writeStartObject();
Option<OperatorBackPressureStats> statsOption = backPressureStatsTracker.getOperatorBackPressureStats(jobVertex);
if (statsOption.isDefined()) {
OperatorBackPressureStats stats = statsOption.get();
// Check whether we need to refresh
if (refreshInterval <= System.currentTimeMillis() - stats.getEndTimestamp()) {
backPressureStatsTracker.triggerStackTraceSample(jobVertex);
gen.writeStringField("status", "deprecated");
} else {
gen.writeStringField("status", "ok");
}
gen.writeStringField("backpressure-level", getBackPressureLevel(stats.getMaxBackPressureRatio()));
gen.writeNumberField("end-timestamp", stats.getEndTimestamp());
// Sub tasks
gen.writeArrayFieldStart("subtasks");
int numSubTasks = stats.getNumberOfSubTasks();
for (int i = 0; i < numSubTasks; i++) {
double ratio = stats.getBackPressureRatio(i);
gen.writeStartObject();
gen.writeNumberField("subtask", i);
gen.writeStringField("backpressure-level", getBackPressureLevel(ratio));
gen.writeNumberField("ratio", ratio);
gen.writeEndObject();
}
gen.writeEndArray();
} else {
backPressureStatsTracker.triggerStackTraceSample(jobVertex);
gen.writeStringField("status", "deprecated");
}
gen.writeEndObject();
gen.close();
return writer.toString();
}
}
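The getBackPressureLevel helper used by the handler is not shown in this excerpt, and the concrete thresholds below are assumptions; the only constraint visible from the test further down is that a maximum ratio of 1.0 maps to "high". A sketch of such a mapping:
// Hypothetical sketch of the ratio-to-level mapping referenced above. The threshold
// values (0.10 and 0.50) are assumed, not taken from this excerpt.
static String getBackPressureLevel(double backPressureRatio) {
    if (backPressureRatio <= 0.10) {
        return "ok";
    } else if (backPressureRatio <= 0.50) {
        return "low";
    } else {
        return "high";
    }
}
When stats are available and fresh, the handler thus produces a JSON object with the fields status, backpressure-level, end-timestamp and a subtasks array holding one {subtask, backpressure-level, ratio} entry per parallel subtask, which is exactly the shape the test in the last section asserts.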
use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in project flink by apache.
the class SavepointLoader method loadAndValidateSavepoint.
/**
* Loads a savepoint back as a {@link CompletedCheckpoint}.
*
* <p>This method verifies that tasks and parallelism still match the savepoint parameters.
*
* @param jobId The JobID of the job to load the savepoint for.
* @param tasks Tasks that will possibly be reset
* @param savepointPath The path of the savepoint to rollback to
* @param classLoader The class loader to resolve serialized classes in legacy savepoint versions.
* @param allowNonRestoredState Whether to allow skipping checkpoint state that cannot be mapped
* to any job vertex in tasks.
*
* @throws IllegalStateException If there is a mismatch between the program and the savepoint state
* @throws IOException If the savepoint could not be loaded from the savepoint store
*/
public static CompletedCheckpoint loadAndValidateSavepoint(JobID jobId, Map<JobVertexID, ExecutionJobVertex> tasks, String savepointPath, ClassLoader classLoader, boolean allowNonRestoredState) throws IOException {
// (1) load the savepoint
final Tuple2<Savepoint, StreamStateHandle> savepointAndHandle = SavepointStore.loadSavepointWithHandle(savepointPath, classLoader);
final Savepoint savepoint = savepointAndHandle.f0;
final StreamStateHandle metadataHandle = savepointAndHandle.f1;
final Map<JobVertexID, TaskState> taskStates = new HashMap<>(savepoint.getTaskStates().size());
boolean expandedToLegacyIds = false;
// (2) validate it (parallelism, etc)
for (TaskState taskState : savepoint.getTaskStates()) {
ExecutionJobVertex executionJobVertex = tasks.get(taskState.getJobVertexID());
// If the vertex cannot be found under its current ID, also include legacy JobVertexIDs,
// for example as generated by older Flink versions, to provide backwards compatibility.
if (executionJobVertex == null && !expandedToLegacyIds) {
tasks = ExecutionJobVertex.includeLegacyJobVertexIDs(tasks);
executionJobVertex = tasks.get(taskState.getJobVertexID());
expandedToLegacyIds = true;
LOG.info("Could not find ExecutionJobVertex. Including legacy JobVertexIDs in search.");
}
if (executionJobVertex != null) {
if (executionJobVertex.getMaxParallelism() == taskState.getMaxParallelism() || !executionJobVertex.isMaxParallelismConfigured()) {
taskStates.put(taskState.getJobVertexID(), taskState);
} else {
String msg = String.format("Failed to rollback to savepoint %s. " + "Max parallelism mismatch between savepoint state and new program. " + "Cannot map operator %s with max parallelism %d to new program with " + "max parallelism %d. This indicates that the program has been changed " + "in a non-compatible way after the savepoint.", savepoint, taskState.getJobVertexID(), taskState.getMaxParallelism(), executionJobVertex.getMaxParallelism());
throw new IllegalStateException(msg);
}
} else if (allowNonRestoredState) {
LOG.info("Skipping savepoint state for operator {}.", taskState.getJobVertexID());
} else {
String msg = String.format("Failed to rollback to savepoint %s. " + "Cannot map savepoint state for operator %s to the new program, " + "because the operator is not available in the new program. If " + "you want to allow to skip this, you can set the --allowNonRestoredState " + "option on the CLI.", savepointPath, taskState.getJobVertexID());
throw new IllegalStateException(msg);
}
}
// (3) convert to checkpoint so the system can fall back to it
CheckpointProperties props = CheckpointProperties.forStandardSavepoint();
return new CompletedCheckpoint(jobId, savepoint.getCheckpointId(), 0L, 0L, taskStates, props, metadataHandle, savepointPath);
}
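A minimal caller sketch, assuming an ExecutionGraph is at hand (the variable names and the surrounding restore context are illustrative, not taken from this excerpt):
// Hypothetical usage: map the job's vertices and load a savepoint for restore.
Map<JobVertexID, ExecutionJobVertex> tasks = executionGraph.getAllVertices();
CompletedCheckpoint restored = SavepointLoader.loadAndValidateSavepoint(
        executionGraph.getJobID(), tasks, savepointPath, userClassLoader, allowNonRestoredState);
// The completed checkpoint can then be handed to the checkpoint store so the
// coordinator can fall back to it, as noted in step (3) above.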
use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in project flink by apache.
the class JobVertexBackPressureHandlerTest method testResponseStatsAvailable.
/** Tests the response when stats are available */
@Test
public void testResponseStatsAvailable() throws Exception {
ExecutionJobVertex jobVertex = mock(ExecutionJobVertex.class);
BackPressureStatsTracker statsTracker = mock(BackPressureStatsTracker.class);
OperatorBackPressureStats stats = new OperatorBackPressureStats(0, System.currentTimeMillis(), new double[] { 0.31, 0.48, 1.0, 0.0 });
when(statsTracker.getOperatorBackPressureStats(any(ExecutionJobVertex.class))).thenReturn(Option.apply(stats));
JobVertexBackPressureHandler handler = new JobVertexBackPressureHandler(mock(ExecutionGraphHolder.class), statsTracker, 9999);
String response = handler.handleRequest(jobVertex, Collections.<String, String>emptyMap());
ObjectMapper mapper = new ObjectMapper();
JsonNode rootNode = mapper.readTree(response);
// Root object with four fields (status, backpressure-level, end-timestamp, subtasks)
assertEquals(4, rootNode.size());
// Status
JsonNode status = rootNode.get("status");
assertNotNull(status);
assertEquals("ok", status.textValue());
// Back pressure level
JsonNode backPressureLevel = rootNode.get("backpressure-level");
assertNotNull(backPressureLevel);
assertEquals("high", backPressureLevel.textValue());
// End time stamp
JsonNode endTimeStamp = rootNode.get("end-timestamp");
assertNotNull(endTimeStamp);
assertEquals(stats.getEndTimestamp(), endTimeStamp.longValue());
// Subtasks
JsonNode subTasks = rootNode.get("subtasks");
assertEquals(stats.getNumberOfSubTasks(), subTasks.size());
for (int i = 0; i < subTasks.size(); i++) {
JsonNode subTask = subTasks.get(i);
JsonNode index = subTask.get("subtask");
assertEquals(i, index.intValue());
JsonNode level = subTask.get("backpressure-level");
assertEquals(JobVertexBackPressureHandler.getBackPressureLevel(stats.getBackPressureRatio(i)), level.textValue());
JsonNode ratio = subTask.get("ratio");
assertEquals(stats.getBackPressureRatio(i), ratio.doubleValue(), 0.0);
}
// Verify not triggered
verify(statsTracker, never()).triggerStackTraceSample(any(ExecutionJobVertex.class));
}
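The excerpt only covers the case where stats are cached. A complementary sketch for the unavailable case, assuming the same mock setup as above (this test is not part of the excerpt):
// Hypothetical complementary test: no stats cached, so the handler should trigger a new
// stack trace sample and report the response as not up to date.
when(statsTracker.getOperatorBackPressureStats(any(ExecutionJobVertex.class))).thenReturn(Option.<OperatorBackPressureStats>empty());
String response = handler.handleRequest(jobVertex, Collections.<String, String>emptyMap());
JsonNode root = new ObjectMapper().readTree(response);
assertEquals("deprecated", root.get("status").textValue());
verify(statsTracker).triggerStackTraceSample(any(ExecutionJobVertex.class));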
use of org.apache.flink.runtime.executiongraph.ExecutionJobVertex in project flink by apache.
the class CheckpointCoordinatorExternalizedCheckpointsTest method testTriggerAndConfirmSimpleExternalizedCheckpoint.
/**
* Triggers multiple externalized checkpoints and verifies that the metadata
* files have been created.
*/
@Test
public void testTriggerAndConfirmSimpleExternalizedCheckpoint() throws Exception {
final JobID jid = new JobID();
final ExternalizedCheckpointSettings externalizedCheckpointSettings = ExternalizedCheckpointSettings.externalizeCheckpoints(false);
final File checkpointDir = tmp.newFolder();
// create some mock Execution vertices that receive the checkpoint trigger messages
final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
ExecutionVertex vertex1 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID1);
ExecutionVertex vertex2 = CheckpointCoordinatorTest.mockExecutionVertex(attemptID2);
Map<JobVertexID, ExecutionJobVertex> jobVertices = new HashMap<>();
jobVertices.put(vertex1.getJobvertexId(), vertex1.getJobVertex());
jobVertices.put(vertex2.getJobvertexId(), vertex2.getJobVertex());
// set up the coordinator and validate the initial state
CheckpointCoordinator coord = new CheckpointCoordinator(jid, 600000, 600000, 0, Integer.MAX_VALUE, externalizedCheckpointSettings, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new ExecutionVertex[] { vertex1, vertex2 }, new StandaloneCheckpointIDCounter(), new StandaloneCompletedCheckpointStore(1), checkpointDir.getAbsolutePath(), Executors.directExecutor());
assertEquals(0, coord.getNumberOfPendingCheckpoints());
assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());
// ---------------
// trigger checkpoint 1
// ---------------
{
final long timestamp1 = System.currentTimeMillis();
coord.triggerCheckpoint(timestamp1, false);
long checkpointId1 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId1));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId1));
CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
verifyExternalizedCheckpoint(latest, jid, checkpointId1, timestamp1);
verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
}
// ---------------
// trigger checkpoint 2
// ---------------
{
final long timestamp2 = System.currentTimeMillis() + 7;
coord.triggerCheckpoint(timestamp2, false);
long checkpointId2 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));
CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
verifyExternalizedCheckpoint(latest, jid, checkpointId2, timestamp2);
verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
}
// ---------------
// trigger checkpoint 3
// ---------------
{
final long timestamp3 = System.currentTimeMillis() + 146;
coord.triggerCheckpoint(timestamp3, false);
long checkpointId3 = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId3));
coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId3));
CompletedCheckpoint latest = coord.getCheckpointStore().getLatestCheckpoint();
verifyExternalizedCheckpoint(latest, jid, checkpointId3, timestamp3);
verifyExternalizedCheckpointRestore(latest, jobVertices, vertex1, vertex2);
}
coord.shutdown(JobStatus.FINISHED);
}
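The verifyExternalizedCheckpoint and verifyExternalizedCheckpointRestore helpers are not included in this excerpt. A rough sketch of what the first check might look like; the accessor names, in particular getExternalPointer(), are assumptions here:
// Hypothetical sketch of the verification helper used above: the completed checkpoint
// should match the triggered checkpoint and carry a pointer to its externalized metadata.
private static void verifyExternalizedCheckpoint(CompletedCheckpoint checkpoint, JobID jid, long checkpointId, long timestamp) {
    assertEquals(jid, checkpoint.getJobId());
    assertEquals(checkpointId, checkpoint.getCheckpointID());
    assertEquals(timestamp, checkpoint.getTimestamp());
    // Assumed accessor: externalized checkpoints point at a metadata file under checkpointDir.
    assertNotNull(checkpoint.getExternalPointer());
}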