use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.
the class TaskExecutorITCase method testSlotAllocation.
@Test
public void testSlotAllocation() throws Exception {
TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
TestingHighAvailabilityServices testingHAServices = new TestingHighAvailabilityServices();
final Configuration configuration = new Configuration();
final ScheduledExecutorService scheduledExecutorService = new ScheduledThreadPoolExecutor(1);
final ResourceID taskManagerResourceId = new ResourceID("foobar");
final UUID rmLeaderId = UUID.randomUUID();
final TestingLeaderElectionService rmLeaderElectionService = new TestingLeaderElectionService();
final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
final String rmAddress = "rm";
final String jmAddress = "jm";
final UUID jmLeaderId = UUID.randomUUID();
final JobID jobId = new JobID();
final ResourceProfile resourceProfile = new ResourceProfile(1.0, 1);
testingHAServices.setResourceManagerLeaderElectionService(rmLeaderElectionService);
testingHAServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
testingHAServices.setJobMasterLeaderRetriever(jobId, new TestingLeaderRetrievalService(jmAddress, jmLeaderId));
TestingSerialRpcService rpcService = new TestingSerialRpcService();
ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.milliseconds(500L), Time.milliseconds(500L), Time.minutes(5L));
SlotManagerFactory slotManagerFactory = new DefaultSlotManager.Factory();
JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHAServices, rpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
MetricRegistry metricRegistry = mock(MetricRegistry.class);
HeartbeatServices heartbeatServices = mock(HeartbeatServices.class, RETURNS_MOCKS);
final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(taskManagerResourceId, InetAddress.getLocalHost(), 1234);
final MemoryManager memoryManager = mock(MemoryManager.class);
final IOManager ioManager = mock(IOManager.class);
final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
final BroadcastVariableManager broadcastVariableManager = mock(BroadcastVariableManager.class);
final FileCache fileCache = mock(FileCache.class);
final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(resourceProfile), new TimerService<AllocationID>(scheduledExecutorService, 100L));
final JobManagerTable jobManagerTable = new JobManagerTable();
final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
ResourceManager<ResourceID> resourceManager = new StandaloneResourceManager(rpcService, resourceManagerConfiguration, testingHAServices, slotManagerFactory, metricRegistry, jobLeaderIdService, testingFatalErrorHandler);
TaskExecutor taskExecutor = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpcService, memoryManager, ioManager, networkEnvironment, testingHAServices, heartbeatServices, metricRegistry, taskManagerMetricGroup, broadcastVariableManager, fileCache, taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
JobMasterGateway jmGateway = mock(JobMasterGateway.class);
when(jmGateway.registerTaskManager(any(String.class), any(TaskManagerLocation.class), eq(jmLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(taskManagerResourceId, 1234)));
when(jmGateway.getHostname()).thenReturn(jmAddress);
rpcService.registerGateway(rmAddress, resourceManager.getSelf());
rpcService.registerGateway(jmAddress, jmGateway);
final AllocationID allocationId = new AllocationID();
final SlotRequest slotRequest = new SlotRequest(jobId, allocationId, resourceProfile);
final SlotOffer slotOffer = new SlotOffer(allocationId, 0, resourceProfile);
try {
resourceManager.start();
taskExecutor.start();
// notify the RM that it is the leader
rmLeaderElectionService.isLeader(rmLeaderId);
// notify the TM about the new RM leader
rmLeaderRetrievalService.notifyListener(rmAddress, rmLeaderId);
Future<RegistrationResponse> registrationResponseFuture = resourceManager.registerJobManager(rmLeaderId, jmLeaderId, jmAddress, jobId);
RegistrationResponse registrationResponse = registrationResponseFuture.get();
assertTrue(registrationResponse instanceof JobMasterRegistrationSuccess);
resourceManager.requestSlot(jmLeaderId, rmLeaderId, slotRequest);
verify(jmGateway).offerSlots(eq(taskManagerResourceId), (Iterable<SlotOffer>) argThat(Matchers.contains(slotOffer)), eq(jmLeaderId), any(Time.class));
} finally {
if (testingFatalErrorHandler.hasExceptionOccurred()) {
testingFatalErrorHandler.rethrowError();
}
}
}
use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.
the class TaskManagerComponentsStartupShutdownTest method testComponentsStartupShutdown.
/**
* Makes sure that all components are shut down when the TaskManager
* actor is shut down.
*/
@Test
public void testComponentsStartupShutdown() {
final String[] TMP_DIR = new String[] { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
final Time timeout = Time.seconds(100);
final int BUFFER_SIZE = 32 * 1024;
Configuration config = new Configuration();
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);
ActorSystem actorSystem = null;
try {
actorSystem = AkkaUtils.createLocalActorSystem(config);
final ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
FlinkResourceManager.startResourceManagerActors(config, actorSystem, LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager), StandaloneResourceManager.class);
final int numberOfSlots = 1;
// create the components for the TaskManager manually
final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(numberOfSlots, TMP_DIR, timeout, null, Time.milliseconds(500), Time.seconds(30), Time.seconds(10), // cleanup interval
1000000, config, // exit-jvm-on-fatal-error
false);
final NetworkEnvironmentConfiguration netConf = new NetworkEnvironmentConfiguration(32, BUFFER_SIZE, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);
ResourceID taskManagerId = ResourceID.generate();
final TaskManagerLocation connectionInfo = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);
final MemoryManager memManager = new MemoryManager(32 * BUFFER_SIZE, 1, BUFFER_SIZE, MemoryType.HEAP, false);
final IOManager ioManager = new IOManagerAsync(TMP_DIR);
final NetworkEnvironment network = new NetworkEnvironment(new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType()), new LocalConnectionManager(), new ResultPartitionManager(), new TaskEventDispatcher(), new KvStateRegistry(), null, netConf.ioMode(), netConf.partitionRequestInitialBackoff(), netConf.partitionRequestMaxBackoff(), netConf.networkBuffersPerChannel(), netConf.extraNetworkBuffersPerGate());
network.start();
LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
MetricRegistryConfiguration metricRegistryConfiguration = MetricRegistryConfiguration.fromConfiguration(config);
// create the task manager
final Props tmProps = Props.create(TaskManager.class, tmConfig, taskManagerId, connectionInfo, memManager, ioManager, network, numberOfSlots, leaderRetrievalService, new MetricRegistry(metricRegistryConfiguration));
final ActorRef taskManager = actorSystem.actorOf(tmProps);
new JavaTestKit(actorSystem) {
{
// wait for the TaskManager to be registered
new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {
@Override
protected void run() {
taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
}
};
}
};
// shut down all actors and the actor system
// Kill the Task down the JobManager
taskManager.tell(Kill.getInstance(), ActorRef.noSender());
jobManager.tell(Kill.getInstance(), ActorRef.noSender());
// shut down the actors and the actor system
actorSystem.shutdown();
actorSystem.awaitTermination();
actorSystem = null;
// now that the TaskManager is shut down, the components should be shut down as well
assertTrue(network.isShutdown());
assertTrue(ioManager.isProperlyShutDown());
assertTrue(memManager.isShutdown());
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
} finally {
if (actorSystem != null) {
actorSystem.shutdown();
}
}
}
use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.
the class JobManagerHARecoveryTest method testFailingJobRecovery.
/**
* Tests that a failing job recovery won't cause other job recoveries to fail.
*/
@Test
public void testFailingJobRecovery() throws Exception {
final FiniteDuration timeout = new FiniteDuration(10, TimeUnit.SECONDS);
final FiniteDuration jobRecoveryTimeout = new FiniteDuration(0, TimeUnit.SECONDS);
Deadline deadline = new FiniteDuration(1, TimeUnit.MINUTES).fromNow();
final Configuration flinkConfiguration = new Configuration();
UUID leaderSessionID = UUID.randomUUID();
ActorRef jobManager = null;
JobID jobId1 = new JobID();
JobID jobId2 = new JobID();
// set HA mode to zookeeper so that we try to recover jobs
flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
try {
final SubmittedJobGraphStore submittedJobGraphStore = mock(SubmittedJobGraphStore.class);
SubmittedJobGraph submittedJobGraph = mock(SubmittedJobGraph.class);
when(submittedJobGraph.getJobId()).thenReturn(jobId2);
when(submittedJobGraphStore.getJobIds()).thenReturn(Arrays.asList(jobId1, jobId2));
// fail the first job recovery
when(submittedJobGraphStore.recoverJobGraph(eq(jobId1))).thenThrow(new Exception("Test exception"));
// succeed the second job recovery
when(submittedJobGraphStore.recoverJobGraph(eq(jobId2))).thenReturn(submittedJobGraph);
final TestingLeaderElectionService myLeaderElectionService = new TestingLeaderElectionService();
final Collection<JobID> recoveredJobs = new ArrayList<>(2);
Props jobManagerProps = Props.create(TestingFailingHAJobManager.class, flinkConfiguration, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), mock(InstanceManager.class), mock(Scheduler.class), new BlobLibraryCacheManager(mock(BlobService.class), 1 << 20), ActorRef.noSender(), new FixedDelayRestartStrategy.FixedDelayRestartStrategyFactory(Int.MaxValue(), 100), timeout, myLeaderElectionService, submittedJobGraphStore, mock(CheckpointRecoveryFactory.class), jobRecoveryTimeout, Option.<MetricRegistry>apply(null), recoveredJobs).withDispatcher(CallingThreadDispatcher.Id());
jobManager = system.actorOf(jobManagerProps);
Future<Object> started = Patterns.ask(jobManager, new Identify(42), deadline.timeLeft().toMillis());
Await.ready(started, deadline.timeLeft());
// make the job manager the leader --> this triggers the recovery of all jobs
myLeaderElectionService.isLeader(leaderSessionID);
// check that we have successfully recovered the second job
assertThat(recoveredJobs, containsInAnyOrder(jobId2));
} finally {
TestingUtils.stopActor(jobManager);
}
}
Aggregations