use of org.apache.flink.runtime.executiongraph.TaskInformation in project flink by apache.
the class TaskDeploymentDescriptorTest method testSerialization.
@Test
public void testSerialization() {
try {
final JobID jobID = new JobID();
final JobVertexID vertexID = new JobVertexID();
final ExecutionAttemptID execId = new ExecutionAttemptID();
final AllocationID allocationId = new AllocationID();
final String jobName = "job name";
final String taskName = "task name";
final int numberOfKeyGroups = 1;
final int indexInSubtaskGroup = 0;
final int currentNumberOfSubtasks = 1;
final int attemptNumber = 0;
final Configuration jobConfiguration = new Configuration();
final Configuration taskConfiguration = new Configuration();
final Class<? extends AbstractInvokable> invokableClass = BatchTask.class;
final List<ResultPartitionDeploymentDescriptor> producedResults = new ArrayList<ResultPartitionDeploymentDescriptor>(0);
final List<InputGateDeploymentDescriptor> inputGates = new ArrayList<InputGateDeploymentDescriptor>(0);
final List<BlobKey> requiredJars = new ArrayList<BlobKey>(0);
final List<URL> requiredClasspaths = new ArrayList<URL>(0);
final SerializedValue<ExecutionConfig> executionConfig = new SerializedValue<>(new ExecutionConfig());
final SerializedValue<JobInformation> serializedJobInformation = new SerializedValue<>(new JobInformation(jobID, jobName, executionConfig, jobConfiguration, requiredJars, requiredClasspaths));
final SerializedValue<TaskInformation> serializedJobVertexInformation = new SerializedValue<>(new TaskInformation(vertexID, taskName, currentNumberOfSubtasks, numberOfKeyGroups, invokableClass.getName(), taskConfiguration));
final int targetSlotNumber = 47;
final TaskStateHandles taskStateHandles = new TaskStateHandles();
final TaskDeploymentDescriptor orig = new TaskDeploymentDescriptor(serializedJobInformation, serializedJobVertexInformation, execId, allocationId, indexInSubtaskGroup, attemptNumber, targetSlotNumber, taskStateHandles, producedResults, inputGates);
final TaskDeploymentDescriptor copy = CommonTestUtils.createCopySerializable(orig);
assertFalse(orig.getSerializedJobInformation() == copy.getSerializedJobInformation());
assertFalse(orig.getSerializedTaskInformation() == copy.getSerializedTaskInformation());
assertFalse(orig.getExecutionAttemptId() == copy.getExecutionAttemptId());
assertFalse(orig.getTaskStateHandles() == copy.getTaskStateHandles());
assertFalse(orig.getProducedPartitions() == copy.getProducedPartitions());
assertFalse(orig.getInputGates() == copy.getInputGates());
assertEquals(orig.getSerializedJobInformation(), copy.getSerializedJobInformation());
assertEquals(orig.getSerializedTaskInformation(), copy.getSerializedTaskInformation());
assertEquals(orig.getExecutionAttemptId(), copy.getExecutionAttemptId());
assertEquals(orig.getAllocationId(), copy.getAllocationId());
assertEquals(orig.getSubtaskIndex(), copy.getSubtaskIndex());
assertEquals(orig.getAttemptNumber(), copy.getAttemptNumber());
assertEquals(orig.getTargetSlotNumber(), copy.getTargetSlotNumber());
assertEquals(orig.getTaskStateHandles(), copy.getTaskStateHandles());
assertEquals(orig.getProducedPartitions(), copy.getProducedPartitions());
assertEquals(orig.getInputGates(), copy.getInputGates());
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
use of org.apache.flink.runtime.executiongraph.TaskInformation in project flink by apache.
the class SavepointITCase method testTriggerSavepointAndResumeWithFileBasedCheckpoints.
/**
* Triggers a savepoint for a job that uses the FsStateBackend. We expect
* that all checkpoint files are written to a new savepoint directory.
*
* <ol>
* <li>Submit job, wait for some progress</li>
* <li>Trigger savepoint and verify that savepoint has been created</li>
* <li>Shut down the cluster, re-submit the job from the savepoint,
* verify that the initial state has been reset, and
* all tasks are running again</li>
* <li>Cancel job, dispose the savepoint, and verify that everything
* has been cleaned up</li>
* </ol>
*/
@Test
public void testTriggerSavepointAndResumeWithFileBasedCheckpoints() throws Exception {
// Config
final int numTaskManagers = 2;
final int numSlotsPerTaskManager = 2;
final int parallelism = numTaskManagers * numSlotsPerTaskManager;
final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
final File testRoot = folder.newFolder();
TestingCluster flink = null;
try {
// Create a test actor system
ActorSystem testActorSystem = AkkaUtils.createDefaultActorSystem();
// Flink configuration
final Configuration config = new Configuration();
config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
final File checkpointDir = new File(testRoot, "checkpoints");
final File savepointRootDir = new File(testRoot, "savepoints");
if (!checkpointDir.mkdir() || !savepointRootDir.mkdirs()) {
fail("Test setup failed: failed to create temporary directories.");
}
// Use file based checkpoints
config.setString(CoreOptions.STATE_BACKEND, "filesystem");
config.setString(FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY, checkpointDir.toURI().toString());
config.setString(FsStateBackendFactory.MEMORY_THRESHOLD_CONF_KEY, "0");
config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointRootDir.toURI().toString());
// Start Flink
flink = new TestingCluster(config);
flink.start(true);
// Submit the job
final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
final JobID jobId = jobGraph.getJobID();
// Reset the static test job helpers
StatefulCounter.resetForTest(parallelism);
// Retrieve the job manager
ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode.");
flink.submitJobDetached(jobGraph);
LOG.info("Waiting for some progress.");
// wait for the JobManager to be ready
Future<Object> allRunning = jobManager.ask(new WaitForAllVerticesToBeRunning(jobId), deadline.timeLeft());
Await.ready(allRunning, deadline.timeLeft());
// wait for the Tasks to be ready
StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
LOG.info("Triggering a savepoint.");
Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobId, Option.<String>empty()), deadline.timeLeft());
final String savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
LOG.info("Retrieved savepoint path: " + savepointPath + ".");
// Retrieve the savepoint from the testing job manager
LOG.info("Requesting the savepoint.");
Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
SavepointV1 savepoint = (SavepointV1) ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
LOG.info("Retrieved savepoint: " + savepointPath + ".");
// Shut down the Flink cluster (thereby canceling the job)
LOG.info("Shutting down Flink cluster.");
flink.shutdown();
flink.awaitTermination();
// - Verification START -------------------------------------------
// Only one savepoint should exist
File[] files = savepointRootDir.listFiles();
if (files != null) {
assertEquals("Savepoint not created in expected directory", 1, files.length);
assertTrue("Savepoint did not create self-contained directory", files[0].isDirectory());
File savepointDir = files[0];
File[] savepointFiles = savepointDir.listFiles();
assertNotNull(savepointFiles);
// Expect one metadata file and one checkpoint file per stateful
// parallel subtask
String errMsg = "Did not write expected number of savepoint/checkpoint files to directory: " + Arrays.toString(savepointFiles);
assertEquals(errMsg, 1 + parallelism, savepointFiles.length);
} else {
fail("Savepoint not created in expected directory");
}
// We currently have the following directory layout: checkpointDir/jobId/chk-ID
File jobCheckpoints = new File(checkpointDir, jobId.toString());
if (jobCheckpoints.exists()) {
files = jobCheckpoints.listFiles();
assertNotNull("Checkpoint directory empty", files);
assertEquals("Checkpoints directory not clean: " + Arrays.toString(files), 0, files.length);
}
// - Verification END ---------------------------------------------
// Restart the cluster
LOG.info("Restarting Flink cluster.");
flink.start();
// Retrieve the job manager
LOG.info("Retrieving JobManager.");
jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
LOG.info("JobManager: " + jobManager + ".");
// Reset static test helpers
StatefulCounter.resetForTest(parallelism);
// Gather all task deployment descriptors
final Throwable[] error = new Throwable[1];
final TestingCluster finalFlink = flink;
final Multimap<JobVertexID, TaskDeploymentDescriptor> tdds = HashMultimap.create();
new JavaTestKit(testActorSystem) {
{
new Within(deadline.timeLeft()) {
@Override
protected void run() {
try {
// Register to all submit task messages for job
for (ActorRef taskManager : finalFlink.getTaskManagersAsJava()) {
taskManager.tell(new TestingTaskManagerMessages.RegisterSubmitTaskListener(jobId), getTestActor());
}
// Set the savepoint path
jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
LOG.info("Resubmitting job " + jobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
// Submit the job
finalFlink.submitJobDetached(jobGraph);
int numTasks = 0;
for (JobVertex jobVertex : jobGraph.getVertices()) {
numTasks += jobVertex.getParallelism();
}
// Gather the task deployment descriptors
LOG.info("Gathering " + numTasks + " submitted " + "TaskDeploymentDescriptor instances.");
for (int i = 0; i < numTasks; i++) {
ResponseSubmitTaskListener resp = (ResponseSubmitTaskListener) expectMsgAnyClassOf(getRemainingTime(), ResponseSubmitTaskListener.class);
TaskDeploymentDescriptor tdd = resp.tdd();
LOG.info("Received: " + tdd.toString() + ".");
TaskInformation taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
tdds.put(taskInformation.getJobVertexId(), tdd);
}
} catch (Throwable t) {
error[0] = t;
}
}
};
}
};
// - Verification START -------------------------------------------
String errMsg = "Error during gathering of TaskDeploymentDescriptors";
assertNull(errMsg, error[0]);
// have a matching task deployment descriptor.
for (TaskState taskState : savepoint.getTaskStates()) {
Collection<TaskDeploymentDescriptor> taskTdds = tdds.get(taskState.getJobVertexID());
errMsg = "Missing task for savepoint state for operator " + taskState.getJobVertexID() + ".";
assertTrue(errMsg, taskTdds.size() > 0);
assertEquals(taskState.getNumberCollectedStates(), taskTdds.size());
for (TaskDeploymentDescriptor tdd : taskTdds) {
SubtaskState subtaskState = taskState.getState(tdd.getSubtaskIndex());
assertNotNull(subtaskState);
errMsg = "Initial operator state mismatch.";
assertEquals(errMsg, subtaskState.getLegacyOperatorState(), tdd.getTaskStateHandles().getLegacyOperatorState());
}
}
// Await state is restored
StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
// Await some progress after restore
StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
// - Verification END ---------------------------------------------
LOG.info("Cancelling job " + jobId + ".");
jobManager.tell(new CancelJob(jobId));
LOG.info("Disposing savepoint " + savepointPath + ".");
Future<Object> disposeFuture = jobManager.ask(new DisposeSavepoint(savepointPath), deadline.timeLeft());
errMsg = "Failed to dispose savepoint " + savepointPath + ".";
Object resp = Await.result(disposeFuture, deadline.timeLeft());
assertTrue(errMsg, resp.getClass() == getDisposeSavepointSuccess().getClass());
// - Verification START -------------------------------------------
// The checkpoint files
List<File> checkpointFiles = new ArrayList<>();
for (TaskState stateForTaskGroup : savepoint.getTaskStates()) {
for (SubtaskState subtaskState : stateForTaskGroup.getStates()) {
ChainedStateHandle<StreamStateHandle> streamTaskState = subtaskState.getLegacyOperatorState();
for (int i = 0; i < streamTaskState.getLength(); i++) {
if (streamTaskState.get(i) != null) {
FileStateHandle fileStateHandle = (FileStateHandle) streamTaskState.get(i);
checkpointFiles.add(new File(fileStateHandle.getFilePath().toUri()));
}
}
}
}
// The checkpoint files of the savepoint should have been discarded
for (File f : checkpointFiles) {
errMsg = "Checkpoint file " + f + " not cleaned up properly.";
assertFalse(errMsg, f.exists());
}
if (checkpointFiles.size() > 0) {
File parent = checkpointFiles.get(0).getParentFile();
errMsg = "Checkpoint parent directory " + parent + " not cleaned up properly.";
assertFalse(errMsg, parent.exists());
}
// All savepoints should have been cleaned up
errMsg = "Savepoints directory not cleaned up properly: " + Arrays.toString(savepointRootDir.listFiles()) + ".";
assertEquals(errMsg, 0, savepointRootDir.listFiles().length);
// - Verification END ---------------------------------------------
} finally {
if (flink != null) {
flink.shutdown();
}
}
}
use of org.apache.flink.runtime.executiongraph.TaskInformation in project flink by apache.
the class TaskExecutorTest method testTaskSubmission.
/**
* Tests that we can submit a task to the TaskManager given that we've allocated a slot there.
*/
@Test(timeout = 1000L)
public void testTaskSubmission() throws Exception {
final Configuration configuration = new Configuration();
final TestingSerialRpcService rpc = new TestingSerialRpcService();
final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
final JobID jobId = new JobID();
final AllocationID allocationId = new AllocationID();
final UUID jobManagerLeaderId = UUID.randomUUID();
final JobVertexID jobVertexId = new JobVertexID();
JobInformation jobInformation = new JobInformation(jobId, name.getMethodName(), new SerializedValue<>(new ExecutionConfig()), new Configuration(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList());
TaskInformation taskInformation = new TaskInformation(jobVertexId, "test task", 1, 1, TestInvokable.class.getName(), new Configuration());
SerializedValue<JobInformation> serializedJobInformation = new SerializedValue<>(jobInformation);
SerializedValue<TaskInformation> serializedJobVertexInformation = new SerializedValue<>(taskInformation);
final TaskDeploymentDescriptor tdd = new TaskDeploymentDescriptor(serializedJobInformation, serializedJobVertexInformation, new ExecutionAttemptID(), allocationId, 0, 0, 0, null, Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList());
final LibraryCacheManager libraryCacheManager = mock(LibraryCacheManager.class);
when(libraryCacheManager.getClassLoader(eq(jobId))).thenReturn(getClass().getClassLoader());
final JobManagerConnection jobManagerConnection = new JobManagerConnection(jobId, ResourceID.generate(), mock(JobMasterGateway.class), jobManagerLeaderId, mock(TaskManagerActions.class), mock(CheckpointResponder.class), libraryCacheManager, mock(ResultPartitionConsumableNotifier.class), mock(PartitionProducerStateChecker.class));
final JobManagerTable jobManagerTable = new JobManagerTable();
jobManagerTable.put(jobId, jobManagerConnection);
final TaskSlotTable taskSlotTable = mock(TaskSlotTable.class);
when(taskSlotTable.existsActiveSlot(eq(jobId), eq(allocationId))).thenReturn(true);
when(taskSlotTable.addTask(any(Task.class))).thenReturn(true);
final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
when(networkEnvironment.createKvStateTaskRegistry(eq(jobId), eq(jobVertexId))).thenReturn(mock(TaskKvStateRegistry.class));
final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
when(taskManagerMetricGroup.addTaskForJob(any(JobID.class), anyString(), any(JobVertexID.class), any(ExecutionAttemptID.class), anyString(), anyInt(), anyInt())).thenReturn(mock(TaskMetricGroup.class));
final HighAvailabilityServices haServices = mock(HighAvailabilityServices.class);
when(haServices.getResourceManagerLeaderRetriever()).thenReturn(mock(LeaderRetrievalService.class));
try {
final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
TaskExecutor taskManager = new TaskExecutor(taskManagerConfiguration, mock(TaskManagerLocation.class), rpc, mock(MemoryManager.class), mock(IOManager.class), networkEnvironment, haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), taskManagerMetricGroup, mock(BroadcastVariableManager.class), mock(FileCache.class), taskSlotTable, jobManagerTable, mock(JobLeaderService.class), testingFatalErrorHandler);
taskManager.start();
taskManager.submitTask(tdd, jobManagerLeaderId);
Future<Boolean> completionFuture = TestInvokable.completableFuture;
completionFuture.get();
// check if a concurrent error occurred
testingFatalErrorHandler.rethrowError();
} finally {
rpc.stopService();
}
}
use of org.apache.flink.runtime.executiongraph.TaskInformation in project flink by apache.
the class TaskTest method createTask.
private Task createTask(Class<? extends AbstractInvokable> invokable, LibraryCacheManager libCache, NetworkEnvironment networkEnvironment, ResultPartitionConsumableNotifier consumableNotifier, PartitionProducerStateChecker partitionProducerStateChecker, Executor executor, Configuration taskManagerConfig, ExecutionConfig execConfig) throws IOException {
JobID jobId = new JobID();
JobVertexID jobVertexId = new JobVertexID();
ExecutionAttemptID executionAttemptId = new ExecutionAttemptID();
InputSplitProvider inputSplitProvider = new TaskInputSplitProvider(jobManagerGateway, jobId, jobVertexId, executionAttemptId, new FiniteDuration(60, TimeUnit.SECONDS));
CheckpointResponder checkpointResponder = new ActorGatewayCheckpointResponder(jobManagerGateway);
SerializedValue<ExecutionConfig> serializedExecutionConfig = new SerializedValue<>(execConfig);
JobInformation jobInformation = new JobInformation(jobId, "Test Job", serializedExecutionConfig, new Configuration(), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList());
TaskInformation taskInformation = new TaskInformation(jobVertexId, "Test Task", 1, 1, invokable.getName(), new Configuration());
return new Task(jobInformation, taskInformation, executionAttemptId, new AllocationID(), 0, 0, Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.<InputGateDeploymentDescriptor>emptyList(), 0, null, mock(MemoryManager.class), mock(IOManager.class), networkEnvironment, mock(BroadcastVariableManager.class), taskManagerConnection, inputSplitProvider, checkpointResponder, libCache, mock(FileCache.class), new TestingTaskManagerRuntimeInfo(taskManagerConfig), mock(TaskMetricGroup.class), consumableNotifier, partitionProducerStateChecker, executor);
}
use of org.apache.flink.runtime.executiongraph.TaskInformation in project flink by apache.
the class TaskExecutor method submitTask.
// ======================================================================
// RPC methods
// ======================================================================
// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
@RpcMethod
public Acknowledge submitTask(TaskDeploymentDescriptor tdd, UUID jobManagerLeaderId) throws TaskSubmissionException {
// first, deserialize the pre-serialized information
final JobInformation jobInformation;
final TaskInformation taskInformation;
try {
jobInformation = tdd.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
} catch (IOException | ClassNotFoundException e) {
throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
}
final JobID jobId = jobInformation.getJobId();
final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);
if (jobManagerConnection == null) {
final String message = "Could not submit task because there is no JobManager " + "associated for the job " + jobId + '.';
log.debug(message);
throw new TaskSubmissionException(message);
}
if (!jobManagerConnection.getLeaderId().equals(jobManagerLeaderId)) {
final String message = "Rejecting the task submission because the job manager leader id " + jobManagerLeaderId + " does not match the expected job manager leader id " + jobManagerConnection.getLeaderId() + '.';
log.debug(message);
throw new TaskSubmissionException(message);
}
if (!taskSlotTable.existsActiveSlot(jobId, tdd.getAllocationId())) {
final String message = "No task slot allocated for job ID " + jobId + " and allocation ID " + tdd.getAllocationId() + '.';
log.debug(message);
throw new TaskSubmissionException(message);
}
TaskMetricGroup taskMetricGroup = taskManagerMetricGroup.addTaskForJob(jobInformation.getJobId(), jobInformation.getJobName(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskInformation.getTaskName(), tdd.getSubtaskIndex(), tdd.getAttemptNumber());
InputSplitProvider inputSplitProvider = new RpcInputSplitProvider(jobManagerConnection.getLeaderId(), jobManagerConnection.getJobManagerGateway(), jobInformation.getJobId(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskManagerConfiguration.getTimeout());
TaskManagerActions taskManagerActions = jobManagerConnection.getTaskManagerActions();
CheckpointResponder checkpointResponder = jobManagerConnection.getCheckpointResponder();
LibraryCacheManager libraryCache = jobManagerConnection.getLibraryCacheManager();
ResultPartitionConsumableNotifier resultPartitionConsumableNotifier = jobManagerConnection.getResultPartitionConsumableNotifier();
PartitionProducerStateChecker partitionStateChecker = jobManagerConnection.getPartitionStateChecker();
Task task = new Task(jobInformation, taskInformation, tdd.getExecutionAttemptId(), tdd.getAllocationId(), tdd.getSubtaskIndex(), tdd.getAttemptNumber(), tdd.getProducedPartitions(), tdd.getInputGates(), tdd.getTargetSlotNumber(), tdd.getTaskStateHandles(), memoryManager, ioManager, networkEnvironment, broadcastVariableManager, taskManagerActions, inputSplitProvider, checkpointResponder, libraryCache, fileCache, taskManagerConfiguration, taskMetricGroup, resultPartitionConsumableNotifier, partitionStateChecker, getRpcService().getExecutor());
log.info("Received task {}.", task.getTaskInfo().getTaskNameWithSubtasks());
boolean taskAdded;
try {
taskAdded = taskSlotTable.addTask(task);
} catch (SlotNotFoundException | SlotNotActiveException e) {
throw new TaskSubmissionException("Could not submit task.", e);
}
if (taskAdded) {
task.startTaskThread();
return Acknowledge.get();
} else {
final String message = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
log.debug(message);
throw new TaskSubmissionException(message);
}
}
Aggregations