use of org.apache.flink.api.common.time.Time in project flink by apache.
the class JobLeaderIdServiceTest method jobTimeoutAfterLostLeadership.
/**
 * Tests that a timeout gets cancelled once a job leader has been found. Furthermore, it tests
 * that a new timeout is registered after the JobManager has lost leadership.
 */
@Test(timeout = 10000)
public void jobTimeoutAfterLostLeadership() throws Exception {
    final JobID jobId = new JobID();
    final String address = "foobar";
    final UUID leaderId = UUID.randomUUID();

    TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);

    ScheduledFuture<?> timeout1 = mock(ScheduledFuture.class);
    ScheduledFuture<?> timeout2 = mock(ScheduledFuture.class);
    final Queue<ScheduledFuture<?>> timeoutQueue = new ArrayDeque<>(Arrays.asList(timeout1, timeout2));
    ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);

    final AtomicReference<Runnable> lastRunnable = new AtomicReference<>();
    doAnswer(new Answer() {
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            lastRunnable.set((Runnable) invocation.getArguments()[0]);
            return timeoutQueue.poll();
        }
    }).when(scheduledExecutor).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));

    Time timeout = Time.milliseconds(5000L);
    JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);

    final AtomicReference<UUID> lastTimeoutId = new AtomicReference<>();
    doAnswer(new Answer() {
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            lastTimeoutId.set((UUID) invocation.getArguments()[1]);
            return null;
        }
    }).when(jobLeaderIdActions).notifyJobTimeout(eq(jobId), any(UUID.class));

    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
    jobLeaderIdService.start(jobLeaderIdActions);
    jobLeaderIdService.addJob(jobId);

    Future<UUID> leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);

    // notify the leader id service about the new leader
    leaderRetrievalService.notifyListener(address, leaderId);

    assertEquals(leaderId, leaderIdFuture.get());
    assertTrue(jobLeaderIdService.containsJob(jobId));

    // check that the first timeout got cancelled
    verify(timeout1, times(1)).cancel(anyBoolean());
    verify(scheduledExecutor, times(1)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));

    // initial timeout runnable which should no longer have an effect
    Runnable runnable = lastRunnable.get();
    assertNotNull(runnable);
    runnable.run();
    verify(jobLeaderIdActions, times(1)).notifyJobTimeout(eq(jobId), any(UUID.class));

    // the timeout should no longer be valid
    assertFalse(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));

    // lose leadership
    leaderRetrievalService.notifyListener("", null);
    verify(scheduledExecutor, times(2)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));

    // the second runnable should be the new timeout
    runnable = lastRunnable.get();
    assertNotNull(runnable);
    runnable.run();
    verify(jobLeaderIdActions, times(2)).notifyJobTimeout(eq(jobId), any(UUID.class));

    // the new timeout should be valid
    assertTrue(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));
}
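The part of this test that does the heavy lifting is the stubbed scheduler: every call to schedule(...) hands the timeout Runnable back to the test, which can then fire it at a moment of its choosing instead of waiting for a real timer. Below is a minimal, self-contained sketch of that Mockito pattern, written against a plain java.util.concurrent.ScheduledExecutorService rather than Flink's ScheduledExecutor; the class name and printed messages are illustrative only.

import static org.mockito.Mockito.any;
import static org.mockito.Mockito.anyLong;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;

import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

public class CapturedTimeoutSketch {

    public static void main(String[] args) {
        // mock the scheduler so that nothing ever runs on a real timer thread
        ScheduledExecutorService scheduler = mock(ScheduledExecutorService.class);

        // remember the last Runnable handed to schedule(...)
        final AtomicReference<Runnable> capturedTimeout = new AtomicReference<>();
        doAnswer(invocation -> {
            capturedTimeout.set((Runnable) invocation.getArguments()[0]);
            return null; // callers only need the returned ScheduledFuture for cancellation
        }).when(scheduler).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));

        // the code under test registers its timeout as usual ...
        scheduler.schedule(() -> System.out.println("timeout fired"), 5000L, TimeUnit.MILLISECONDS);

        // ... and the test decides exactly when that timeout fires
        capturedTimeout.get().run();
    }
}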
use of org.apache.flink.api.common.time.Time in project flink by apache.
the class JobLeaderIdServiceTest method testInitialJobTimeout.
/**
* Tests that the initial job registration registers a timeout which will call
* {@link JobLeaderIdActions#notifyJobTimeout(JobID, UUID)} when executed.
*/
@Test
public void testInitialJobTimeout() throws Exception {
    final JobID jobId = new JobID();
    TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);

    ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
    Time timeout = Time.milliseconds(5000L);
    JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);

    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
    jobLeaderIdService.start(jobLeaderIdActions);
    jobLeaderIdService.addJob(jobId);

    assertTrue(jobLeaderIdService.containsJob(jobId));

    ArgumentCaptor<Runnable> runnableArgumentCaptor = ArgumentCaptor.forClass(Runnable.class);
    verify(scheduledExecutor).schedule(runnableArgumentCaptor.capture(), anyLong(), any(TimeUnit.class));

    Runnable timeoutRunnable = runnableArgumentCaptor.getValue();
    timeoutRunnable.run();

    ArgumentCaptor<UUID> timeoutIdArgumentCaptor = ArgumentCaptor.forClass(UUID.class);
    verify(jobLeaderIdActions, times(1)).notifyJobTimeout(eq(jobId), timeoutIdArgumentCaptor.capture());

    assertTrue(jobLeaderIdService.isValidTimeout(jobId, timeoutIdArgumentCaptor.getValue()));
}
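The same idea can be expressed with an ArgumentCaptor instead of a doAnswer stub: let the mock simply record the call, then pull the scheduled Runnable back out with verify(...) and run it by hand. A minimal sketch of that pattern, again written against a plain ScheduledExecutorService with illustrative names:

import static org.mockito.Mockito.any;
import static org.mockito.Mockito.anyLong;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;

import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.mockito.ArgumentCaptor;

public class CapturedTimeoutWithCaptorSketch {

    public static void main(String[] args) {
        ScheduledExecutorService scheduler = mock(ScheduledExecutorService.class);

        // the code under test schedules its timeout somewhere behind the scenes
        scheduler.schedule(() -> System.out.println("job timed out"), 5000L, TimeUnit.MILLISECONDS);

        // pull the scheduled Runnable back out of the mock ...
        ArgumentCaptor<Runnable> runnableCaptor = ArgumentCaptor.forClass(Runnable.class);
        verify(scheduler).schedule(runnableCaptor.capture(), anyLong(), any(TimeUnit.class));

        // ... and fire it deterministically, without waiting for a real clock
        runnableCaptor.getValue().run();
    }
}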
use of org.apache.flink.api.common.time.Time in project flink by apache.
the class JobLeaderIdServiceTest method testRemovingJob.
/**
 * Tests that removing a job completes the job leader id future exceptionally.
 */
@Test(timeout = 10000)
public void testRemovingJob() throws Exception {
    final JobID jobId = new JobID();
    TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);

    ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
    Time timeout = Time.milliseconds(5000L);
    JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);

    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
    jobLeaderIdService.start(jobLeaderIdActions);
    jobLeaderIdService.addJob(jobId);

    Future<UUID> leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);

    // remove the job before we could find a leader
    jobLeaderIdService.removeJob(jobId);

    assertFalse(jobLeaderIdService.containsJob(jobId));

    try {
        leaderIdFuture.get();
        fail("The leader id future should be completed exceptionally.");
    } catch (ExecutionException ignored) {
        // expected exception
    }
}
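The try/fail/catch block at the end is the standard JUnit 4 way of asserting that get() on an exceptionally completed future throws an ExecutionException. A minimal, self-contained sketch of the same assertion, using a plain java.util.concurrent.CompletableFuture as a stand-in for the leader id future (the class name and the wrapped exception are illustrative):

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;

import org.junit.Test;

public class ExceptionallyCompletedFutureSketch {

    @Test
    public void getOnExceptionallyCompletedFutureThrows() throws Exception {
        // stand-in for the leader id future that the removed job leaves behind
        CompletableFuture<String> future = new CompletableFuture<>();
        future.completeExceptionally(new IllegalStateException("job was removed"));

        try {
            future.get();
            fail("The future should have completed exceptionally.");
        } catch (ExecutionException expected) {
            // get() wraps the original failure; the cause carries the real error
            assertTrue(expected.getCause() instanceof IllegalStateException);
        }
    }
}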
use of org.apache.flink.api.common.time.Time in project flink by apache.
the class TaskManagerComponentsStartupShutdownTest method testComponentsStartupShutdown.
/**
* Makes sure that all components are shut down when the TaskManager
* actor is shut down.
*/
@Test
public void testComponentsStartupShutdown() {
    final String[] TMP_DIR = new String[] { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
    final Time timeout = Time.seconds(100);
    final int BUFFER_SIZE = 32 * 1024;

    Configuration config = new Configuration();
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);

    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(config);

        final ActorRef jobManager = JobManager.startJobManagerActors(
            config,
            actorSystem,
            TestingUtils.defaultExecutor(),
            TestingUtils.defaultExecutor(),
            JobManager.class,
            MemoryArchivist.class)._1();

        FlinkResourceManager.startResourceManagerActors(
            config,
            actorSystem,
            LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager),
            StandaloneResourceManager.class);

        final int numberOfSlots = 1;

        // create the components for the TaskManager manually
        final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(
            numberOfSlots,
            TMP_DIR,
            timeout,
            null,
            Time.milliseconds(500),
            Time.seconds(30),
            Time.seconds(10),
            1000000, // cleanup interval
            config,
            false); // exit-jvm-on-fatal-error

        final NetworkEnvironmentConfiguration netConf = new NetworkEnvironmentConfiguration(
            32, BUFFER_SIZE, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);

        ResourceID taskManagerId = ResourceID.generate();
        final TaskManagerLocation connectionInfo = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);

        final MemoryManager memManager = new MemoryManager(32 * BUFFER_SIZE, 1, BUFFER_SIZE, MemoryType.HEAP, false);
        final IOManager ioManager = new IOManagerAsync(TMP_DIR);
        final NetworkEnvironment network = new NetworkEnvironment(
            new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType()),
            new LocalConnectionManager(),
            new ResultPartitionManager(),
            new TaskEventDispatcher(),
            new KvStateRegistry(),
            null,
            netConf.ioMode(),
            netConf.partitionRequestInitialBackoff(),
            netConf.partitionRequestMaxBackoff(),
            netConf.networkBuffersPerChannel(),
            netConf.extraNetworkBuffersPerGate());

        network.start();

        LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
        MetricRegistryConfiguration metricRegistryConfiguration = MetricRegistryConfiguration.fromConfiguration(config);

        // create the task manager
        final Props tmProps = Props.create(
            TaskManager.class,
            tmConfig,
            taskManagerId,
            connectionInfo,
            memManager,
            ioManager,
            network,
            numberOfSlots,
            leaderRetrievalService,
            new MetricRegistry(metricRegistryConfiguration));

        final ActorRef taskManager = actorSystem.actorOf(tmProps);

        new JavaTestKit(actorSystem) {{
            // wait for the TaskManager to be registered
            new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {
                @Override
                protected void run() {
                    taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                    expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                }
            };
        }};

        // kill the TaskManager and the JobManager actors
        taskManager.tell(Kill.getInstance(), ActorRef.noSender());
        jobManager.tell(Kill.getInstance(), ActorRef.noSender());

        // shut down the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
        actorSystem = null;

        // now that the TaskManager is shut down, the components should be shut down as well
        assertTrue(network.isShutdown());
        assertTrue(ioManager.isProperlyShutDown());
        assertTrue(memManager.isShutdown());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
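All of the timeouts above (the RPC timeout, the registration pauses, the cleanup interval's companions) are plain org.apache.flink.api.common.time.Time instances built from its static factories. A short sketch of the small API surface these tests rely on, with arbitrary example values:

import java.util.concurrent.TimeUnit;

import org.apache.flink.api.common.time.Time;

public class TimeApiSketch {

    public static void main(String[] args) {
        // the factory methods used throughout the tests above
        Time rpcTimeout = Time.seconds(100);
        Time registrationPause = Time.milliseconds(500);
        Time custom = Time.of(10, TimeUnit.MINUTES);

        // a Time value is just a size plus a TimeUnit
        System.out.println(rpcTimeout.getSize() + " " + rpcTimeout.getUnit()); // 100 SECONDS

        // and it normalizes to milliseconds where a plain long is needed
        System.out.println(registrationPause.toMilliseconds()); // 500
        System.out.println(custom.toMilliseconds());            // 600000
    }
}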
use of org.apache.flink.api.common.time.Time in project flink by apache.
the class MiniCluster method start.
/**
* Starts the mini cluster, based on the configured properties.
*
* @throws Exception This method passes on any exception that occurs during the startup of
* the mini cluster.
*/
public void start() throws Exception {
    synchronized (lock) {
        checkState(!running, "FlinkMiniCluster is already running");

        LOG.info("Starting Flink Mini Cluster");
        LOG.debug("Using configuration {}", config);

        final Configuration configuration = new UnmodifiableConfiguration(config.generateConfiguration());
        final Time rpcTimeout = config.getRpcTimeout();
        final int numJobManagers = config.getNumJobManagers();
        final int numTaskManagers = config.getNumTaskManagers();
        final int numResourceManagers = config.getNumResourceManagers();
        final boolean singleRpc = config.getUseSingleRpcSystem();

        try {
            LOG.info("Starting Metrics Registry");
            metricRegistry = createMetricRegistry(configuration);

            RpcService[] jobManagerRpcServices = new RpcService[numJobManagers];
            RpcService[] taskManagerRpcServices = new RpcService[numTaskManagers];
            RpcService[] resourceManagerRpcServices = new RpcService[numResourceManagers];

            // bring up all the RPC services
            LOG.info("Starting RPC Service(s)");

            // we always need the 'commonRpcService' for auxiliary calls
            commonRpcService = createRpcService(configuration, rpcTimeout, false, null);

            if (singleRpc) {
                // set that same RPC service for all JobManagers and TaskManagers
                for (int i = 0; i < numJobManagers; i++) {
                    jobManagerRpcServices[i] = commonRpcService;
                }
                for (int i = 0; i < numTaskManagers; i++) {
                    taskManagerRpcServices[i] = commonRpcService;
                }
                for (int i = 0; i < numResourceManagers; i++) {
                    resourceManagerRpcServices[i] = commonRpcService;
                }

                this.resourceManagerRpcServices = null;
                this.jobManagerRpcServices = null;
                this.taskManagerRpcServices = null;
            } else {
                // start a new service per component, possibly with custom bind addresses
                final String jobManagerBindAddress = config.getJobManagerBindAddress();
                final String taskManagerBindAddress = config.getTaskManagerBindAddress();
                final String resourceManagerBindAddress = config.getResourceManagerBindAddress();

                for (int i = 0; i < numJobManagers; i++) {
                    jobManagerRpcServices[i] = createRpcService(configuration, rpcTimeout, true, jobManagerBindAddress);
                }
                for (int i = 0; i < numTaskManagers; i++) {
                    taskManagerRpcServices[i] = createRpcService(configuration, rpcTimeout, true, taskManagerBindAddress);
                }
                for (int i = 0; i < numResourceManagers; i++) {
                    resourceManagerRpcServices[i] = createRpcService(configuration, rpcTimeout, true, resourceManagerBindAddress);
                }

                this.jobManagerRpcServices = jobManagerRpcServices;
                this.taskManagerRpcServices = taskManagerRpcServices;
                this.resourceManagerRpcServices = resourceManagerRpcServices;
            }

            // create the high-availability services
            LOG.info("Starting high-availability services");
            haServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(configuration);
            heartbeatServices = HeartbeatServices.fromConfiguration(configuration);

            // bring up the ResourceManager(s)
            LOG.info("Starting {} ResourceManger(s)", numResourceManagers);
            resourceManagerRunners = startResourceManagers(configuration, haServices, metricRegistry, numResourceManagers, resourceManagerRpcServices);

            // bring up the TaskManager(s) for the mini cluster
            LOG.info("Starting {} TaskManger(s)", numTaskManagers);
            taskManagerRunners = startTaskManagers(configuration, haServices, metricRegistry, numTaskManagers, taskManagerRpcServices);

            // bring up the dispatcher that launches JobManagers when jobs submitted
            LOG.info("Starting job dispatcher(s) for {} JobManger(s)", numJobManagers);
            jobDispatcher = new MiniClusterJobDispatcher(configuration, haServices, heartbeatServices, metricRegistry, numJobManagers, jobManagerRpcServices);
        } catch (Exception e) {
            // cleanup everything
            try {
                shutdownInternally();
            } catch (Exception ee) {
                e.addSuppressed(ee);
            }
            throw e;
        }

        // now officially mark this as running
        running = true;
        LOG.info("Flink Mini Cluster started successfully");
    }
}
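The catch block shows the usual startup/cleanup idiom: if anything fails while the components are being brought up, a best-effort cleanup runs, and any failure during that cleanup is attached to the original exception as a suppressed exception rather than replacing it. A minimal sketch of the idiom; the bringUpComponents/shutDownComponents methods are hypothetical placeholders, not part of the Flink API:

public class StartupCleanupSketch {

    private boolean running;

    public void start() throws Exception {
        try {
            bringUpComponents(); // may throw partway through startup
        } catch (Exception e) {
            try {
                shutDownComponents(); // clean up whatever was already started
            } catch (Exception cleanupFailure) {
                e.addSuppressed(cleanupFailure); // keep the cleanup error without hiding the root cause
            }
            throw e;
        }
        running = true; // only mark as running once everything is up
    }

    private void bringUpComponents() throws Exception {
        throw new Exception("simulated startup failure");
    }

    private void shutDownComponents() throws Exception {
        // best-effort teardown of partially started components
    }
}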