use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class SlotProtocolTest method testSlotAvailableRequest.
/**
 * Tests whether
 * 1) a SlotRequest is routed to the SlotManager
 * 2) a SlotRequest is confirmed
 * 3) a SlotRequest leads to an allocation of a registered slot
 * 4) a SlotRequest is routed to the TaskExecutor
 *
 * @throws Exception if any of the setup or request calls fails
 */
@Test
public void testSlotAvailableRequest() throws Exception {
    final String rmAddress = "/rm1";
    final String jmAddress = "/jm1";
    final String tmAddress = "/tm1";
    final JobID jobID = new JobID();

    testRpcService.registerGateway(jmAddress, mock(JobMasterGateway.class));

    final TestingHighAvailabilityServices testingHaServices = new TestingHighAvailabilityServices();
    final UUID rmLeaderID = UUID.randomUUID();
    final UUID jmLeaderID = UUID.randomUUID();
    TestingLeaderElectionService rmLeaderElectionService = configureHA(testingHaServices, jobID, rmAddress, rmLeaderID, jmAddress, jmLeaderID);

    // The mocked TaskExecutor answers every slot request with a future that this test never completes.
    TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
    Mockito.when(taskExecutorGateway.requestSlot(any(SlotID.class), any(JobID.class), any(AllocationID.class), any(String.class), any(UUID.class), any(Time.class))).thenReturn(new FlinkCompletableFuture<TMSlotRequestReply>());
    testRpcService.registerGateway(tmAddress, taskExecutorGateway);

    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.seconds(5L), Time.seconds(5L), Time.minutes(5L));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHaServices, testRpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    TestingSlotManagerFactory slotManagerFactory = new TestingSlotManagerFactory();
    ResourceManager<ResourceID> resourceManager = Mockito.spy(new StandaloneResourceManager(testRpcService, resourceManagerConfiguration, testingHaServices, slotManagerFactory, mock(MetricRegistry.class), jobLeaderIdService, mock(FatalErrorHandler.class)));
    resourceManager.start();
    rmLeaderElectionService.isLeader(rmLeaderID);

    // NOTE(review): presumably waits for the leadership grant to be processed; a fixed sleep is
    // timing-sensitive and should be replaced by an explicit signal if one is available.
    Thread.sleep(1000);

    Future<RegistrationResponse> registrationFuture = resourceManager.registerJobManager(rmLeaderID, jmLeaderID, jmAddress, jobID);
    try {
        registrationFuture.get(5L, TimeUnit.SECONDS);
    } catch (Exception e) {
        // Keep the original failure message but preserve the cause for diagnosis.
        throw new AssertionError("JobManager registration Future didn't become ready.", e);
    }

    final SlotManager slotManager = slotManagerFactory.slotManager;
    final ResourceID resourceID = ResourceID.generate();
    final AllocationID allocationID = new AllocationID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 100);
    final SlotID slotID = new SlotID(resourceID, 0);
    final SlotStatus slotStatus = new SlotStatus(slotID, resourceProfile);
    final SlotReport slotReport = new SlotReport(Collections.singletonList(slotStatus));

    // register slot at SlotManager
    slotManager.registerTaskExecutor(resourceID, new TaskExecutorRegistration(taskExecutorGateway), slotReport);

    SlotRequest slotRequest = new SlotRequest(jobID, allocationID, resourceProfile);
    RMSlotRequestReply slotRequestReply = resourceManager.requestSlot(jmLeaderID, rmLeaderID, slotRequest);

    // 1) a SlotRequest is routed to the SlotManager
    verify(slotManager).requestSlot(slotRequest);

    // 2) a SlotRequest is confirmed (expected value first, so failure messages read correctly)
    Assert.assertEquals(allocationID, slotRequestReply.getAllocationID());

    // 3) a SlotRequest leads to an allocation of a registered slot
    Assert.assertTrue(slotManager.isAllocated(slotID));
    Assert.assertTrue(slotManager.isAllocated(allocationID));

    // 4) a SlotRequest is routed to the TaskExecutor
    verify(taskExecutorGateway, timeout(5000)).requestSlot(eq(slotID), eq(jobID), eq(allocationID), any(String.class), any(UUID.class), any(Time.class));
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class SlotManagerTest method testMultipleSlotRequestsWithOneSlot.
/**
 * Tests multiple slot requests with one slot: two requests compete for a single registered
 * slot, and each release of the slot is followed by a check of the allocated, free and
 * pending counts.
 */
@Test
public void testMultipleSlotRequestsWithOneSlot() {
    TestingSlotManager slotManager = new TestingSlotManager();
    final AllocationID allocationID = new AllocationID();

    // first request is issued before any slot is registered
    SlotRequest request1 = new SlotRequest(new JobID(), allocationID, DEFAULT_TESTING_PROFILE);
    slotManager.requestSlot(request1);

    final ResourceID resourceID = ResourceID.generate();
    final SlotStatus slotStatus = new SlotStatus(new SlotID(resourceID, 0), DEFAULT_TESTING_PROFILE);
    final SlotReport slotReport = new SlotReport(slotStatus);
    slotManager.registerTaskExecutor(resourceID, taskExecutorRegistration, slotReport);

    // another request pending
    SlotRequest request2 = new SlotRequest(new JobID(), new AllocationID(), DEFAULT_TESTING_PROFILE);
    slotManager.requestSlot(request2);

    // the single slot is held by request1 while request2 is still waiting
    assertEquals(1, slotManager.getAllocatedSlotCount());
    assertEquals(0, slotManager.getFreeSlotCount());
    assertEquals(1, slotManager.getPendingRequestCount());
    assertTrue(slotManager.isAllocated(slotStatus.getSlotID()));
    assertTrue(slotManager.isAllocated(request1.getAllocationId()));

    // the slot is reported available: it is expected to be handed to the pending request2
    slotManager.notifySlotAvailable(resourceID, slotStatus.getSlotID());

    assertEquals(1, slotManager.getAllocatedSlotCount());
    assertEquals(0, slotManager.getFreeSlotCount());
    assertEquals(0, slotManager.getPendingRequestCount());
    assertTrue(slotManager.isAllocated(slotStatus.getSlotID()));
    assertTrue(slotManager.isAllocated(request2.getAllocationId()));

    // the slot is reported available again: with no request pending it is expected to become free
    slotManager.notifySlotAvailable(resourceID, slotStatus.getSlotID());

    assertEquals(0, slotManager.getAllocatedSlotCount());
    assertEquals(1, slotManager.getFreeSlotCount());
    assertEquals(0, slotManager.getPendingRequestCount());
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class ExecutionGraphMetricsTest method testExecutionGraphRestartTimeMetric.
/**
 * This test tests that the restarting time metric correctly displays restarting times.
 *
 * <p>The job is driven through RUNNING, RESTARTING, RUNNING, RESTARTING and finally FAILED by
 * feeding task state updates into the execution graph, and the restarting time gauge is
 * sampled after each transition.
 *
 * @throws JobException declared by the execution graph calls used below
 * @throws IOException declared by the calls used below
 * @throws InterruptedException if a {@code Thread.sleep} is interrupted
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);

        // register a TestingReporter so the test can look up the gauge by name
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        assertEquals(1, metricRegistry.getReporters().size());
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");

        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);

        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);

        // the scheduler always hands out the same, already completed slot future
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));

        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);

        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;

        // check that the restarting time is 0 since it's the initial start
        // (longValue() avoids the assertEquals(long, long) / (Object, Object) overload ambiguity)
        assertEquals(0L, restartingTime.getValue().longValue());

        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());

        // start execution
        executionGraph.scheduleForExecution();
        assertEquals(0L, restartingTime.getValue().longValue());

        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }

        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertEquals(0L, restartingTime.getValue().longValue());

        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);

        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();

        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }

        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);

        // restart job
        testingRestartStrategy.restartExecutionGraph();

        // the attempt ids are collected again after the restart before reporting RUNNING
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();

        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertEquals(previousRestartingTime, currentRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }

        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();

        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);

        // now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
        // for this to work, we have to use a SuppressRestartException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertEquals(previousRestartingTime, currentRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        // always stop the executor that was handed to the execution graph
        executor.shutdownNow();
    }
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class AvailableSlotsTest method testPollFreeSlot.
/**
 * Adds a single slot to an {@code AvailableSlots} instance, polls it with two different
 * profiles and checks the bookkeeping before and after the successful poll.
 */
@Test
public void testPollFreeSlot() {
    final SlotPool.AvailableSlots pool = new SlotPool.AvailableSlots();
    final ResourceID taskManagerId = new ResourceID("resource1");
    final AllocatedSlot slot = createAllocatedSlot(taskManagerId);

    pool.add(slot, 1L);

    // the freshly added slot is tracked by allocation id and by task manager
    assertEquals(1, pool.size());
    assertTrue(pool.contains(slot.getSlotAllocationId()));
    assertTrue(pool.containsTaskManager(taskManagerId));

    // polling with the big profile is expected to yield nothing
    final SlotAndLocality noMatch = pool.poll(DEFAULT_TESTING_BIG_PROFILE, null);
    assertNull(noMatch);

    // polling with the default profile is expected to return the slot added above
    final SlotAndLocality polled = pool.poll(DEFAULT_TESTING_PROFILE, null);
    assertEquals(slot, polled.slot());

    // after the successful poll nothing is left
    assertEquals(0, pool.size());
    assertFalse(pool.contains(slot.getSlotAllocationId()));
    assertFalse(pool.containsTaskManager(taskManagerId));
}
use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.
the class AvailableSlotsTest method testAddAndRemove.
/**
 * Adds three slots spread over two task managers to an {@code AvailableSlots} instance and
 * removes them one task manager at a time, checking the bookkeeping after each step.
 */
@Test
public void testAddAndRemove() throws Exception {
    final SlotPool.AvailableSlots pool = new SlotPool.AvailableSlots();

    final ResourceID firstTaskManager = new ResourceID("resource1");
    final ResourceID secondTaskManager = new ResourceID("resource2");

    // two slots on the first task manager, one on the second
    final AllocatedSlot slotA = createAllocatedSlot(firstTaskManager);
    final AllocatedSlot slotB = createAllocatedSlot(firstTaskManager);
    final AllocatedSlot slotC = createAllocatedSlot(secondTaskManager);

    pool.add(slotA, 1L);
    pool.add(slotB, 2L);
    pool.add(slotC, 3L);

    // everything that was added is visible
    assertEquals(3, pool.size());
    for (AllocatedSlot added : new AllocatedSlot[] {slotA, slotB, slotC}) {
        assertTrue(pool.contains(added.getSlotAllocationId()));
    }
    for (ResourceID taskManager : new ResourceID[] {firstTaskManager, secondTaskManager}) {
        assertTrue(pool.containsTaskManager(taskManager));
    }

    // dropping the first task manager removes both of its slots and leaves the third
    pool.removeAllForTaskManager(firstTaskManager);

    assertEquals(1, pool.size());
    for (AllocatedSlot removed : new AllocatedSlot[] {slotA, slotB}) {
        assertFalse(pool.contains(removed.getSlotAllocationId()));
    }
    assertTrue(pool.contains(slotC.getSlotAllocationId()));
    assertFalse(pool.containsTaskManager(firstTaskManager));
    assertTrue(pool.containsTaskManager(secondTaskManager));

    // dropping the second task manager leaves nothing behind
    pool.removeAllForTaskManager(secondTaskManager);

    assertEquals(0, pool.size());
    for (AllocatedSlot removed : new AllocatedSlot[] {slotA, slotB, slotC}) {
        assertFalse(pool.contains(removed.getSlotAllocationId()));
    }
    for (ResourceID taskManager : new ResourceID[] {firstTaskManager, secondTaskManager}) {
        assertFalse(pool.containsTaskManager(taskManager));
    }
}
Aggregations