Usage of org.apache.flink.api.common.time.Time in the Apache Flink project.
Class JobLeaderIdServiceTest, method testAddingJob.
/**
 * Verifies that a job added to the {@link JobLeaderIdService} is tracked by the service and
 * that its leader id future yields the id announced by the leader retrieval service.
 */
@Test(timeout = 10000)
public void testAddingJob() throws Exception {
    final JobID jobId = new JobID();
    final String leaderAddress = "foobar";
    final UUID expectedLeaderId = UUID.randomUUID();

    // HA services that hand out a retrieval service we can drive manually for this job
    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    final TestingLeaderRetrievalService retrievalService = new TestingLeaderRetrievalService();
    haServices.setJobMasterLeaderRetriever(jobId, retrievalService);

    final JobLeaderIdService service = new JobLeaderIdService(
        haServices,
        mock(ScheduledExecutor.class),
        Time.milliseconds(5000L));
    service.start(mock(JobLeaderIdActions.class));

    service.addJob(jobId);
    final Future<UUID> pendingLeaderId = service.getLeaderId(jobId);

    // announce the new leader to the leader id service
    retrievalService.notifyListener(leaderAddress, expectedLeaderId);

    assertEquals(expectedLeaderId, pendingLeaderId.get());
    assertTrue(service.containsJob(jobId));
}
Usage of org.apache.flink.api.common.time.Time in the Apache Flink project.
Class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric.
/**
* Tests that the restarting time metric correctly displays restarting times.
*
* <p>The test drives a single-vertex execution graph through
* RUNNING -> RESTARTING -> RUNNING -> RESTARTING -> FAILED and checks the gauge registered under
* {@code ExecutionGraph.RESTARTING_TIME_METRIC_NAME} at each step: it is 0 before the first
* failure, non-decreasing (and eventually positive) while the job is RESTARTING, and constant
* once the job is RUNNING again or has FAILED.
*/
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
try {
// setup execution graph with mocked scheduling logic
int parallelism = 1;
JobVertex jobVertex = new JobVertex("TestVertex");
jobVertex.setParallelism(parallelism);
jobVertex.setInvokableClass(NoOpInvokable.class);
JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
// configure a single metrics reporter ("test") backed by TestingReporter, so that the
// gauge can be looked up through the reporter further below
Configuration config = new Configuration();
config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
Configuration jobConfig = new Configuration();
Time timeout = Time.seconds(10L);
MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
// exactly the one configured reporter must have been instantiated
assertTrue(metricRegistry.getReporters().size() == 1);
MetricReporter reporter = metricRegistry.getReporters().get(0);
assertTrue(reporter instanceof TestingReporter);
TestingReporter testingReporter = (TestingReporter) reporter;
MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
Scheduler scheduler = mock(Scheduler.class);
// mocked task manager: location, instance and gateway all share the same resource id
ResourceID taskManagerId = ResourceID.generate();
TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
when(taskManagerLocation.getHostname()).thenReturn("localhost");
TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
Instance instance = mock(Instance.class);
when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(instance.getTaskManagerID()).thenReturn(taskManagerId);
when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
// mocked slot that reports itself as alive and accepts any execution
Slot rootSlot = mock(Slot.class);
AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
SimpleSlot simpleSlot = mock(SimpleSlot.class);
when(simpleSlot.isAlive()).thenReturn(true);
when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
when(simpleSlot.getRoot()).thenReturn(rootSlot);
when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
// the scheduler always answers slot requests with the same, already completed future
FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
future.complete(simpleSlot);
when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
when(rootSlot.getSlotNumber()).thenReturn(0);
// task submissions are acknowledged immediately
when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
// restart strategy whose restart is triggered explicitly by the test (see restartExecutionGraph() below)
TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
// get restarting time metric
Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
assertNotNull(metric);
assertTrue(metric instanceof Gauge);
@SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
// check that the restarting time is 0 since it's the initial start
assertTrue(0L == restartingTime.getValue());
executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
// start execution
executionGraph.scheduleForExecution();
assertTrue(0L == restartingTime.getValue());
// remember the current attempt ids so that state updates can be addressed to them
List<ExecutionAttemptID> executionIDs = new ArrayList<>();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
// tell execution graph that the tasks are in state running --> job status switches to state running
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
assertTrue(0L == restartingTime.getValue());
// fail the job so that it goes into state restarting
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
// wait some time so that the restarting time gauge shows a value different from 0
Thread.sleep(50);
long previousRestartingTime = restartingTime.getValue();
// check that the restarting time is monotonically increasing
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// check that we have measured some restarting time
assertTrue(previousRestartingTime > 0);
// restart job
testingRestartStrategy.restartExecutionGraph();
// collect the attempt ids again, now for the current attempts after the restart
executionIDs.clear();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
// sanity check: a RESTARTING timestamp was recorded for the first restart
assertTrue(firstRestartingTimestamp != 0);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time does not increase after we've reached the running state
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// fail job again
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
// the second restart must have recorded a RESTARTING timestamp different from the first one
long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
// wait again so that the gauge can show a value different from 0
Thread.sleep(50);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time is increasing again
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
assertTrue(previousRestartingTime > 0);
// now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
// for this to work, we have to use a SuppressRestartsException
executionGraph.fail(new SuppressRestartsException(new Exception()));
assertEquals(JobStatus.FAILED, executionGraph.getState());
previousRestartingTime = restartingTime.getValue();
// check that the restarting time stays constant in the FAILED state
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
} finally {
// always stop the executor that was handed to the execution graph
executor.shutdownNow();
}
}
Usage of org.apache.flink.api.common.time.Time in the Apache Flink project.
Class StackTraceSampleCoordinator, method triggerStackTraceSample.
/**
 * Triggers a stack trace sample on all given tasks.
 *
 * <p>If any of the tasks has no current execution attempt in state RUNNING, or if the
 * coordinator has been shut down, nothing is triggered and an exceptionally completed
 * future is returned instead.
 *
 * @param tasksToSample       Tasks to sample.
 * @param numSamples          Number of stack trace samples to collect.
 * @param delayBetweenSamples Delay between consecutive samples.
 * @param maxStackTraceDepth  Maximum depth of the stack trace. 0 indicates
 *                            no maximum and keeps the complete stack trace.
 * @return A future of the completed stack trace sample
 */
@SuppressWarnings("unchecked")
public Future<StackTraceSample> triggerStackTraceSample(ExecutionVertex[] tasksToSample, int numSamples, Time delayBetweenSamples, int maxStackTraceDepth) {
    checkNotNull(tasksToSample, "Tasks to sample");
    checkArgument(tasksToSample.length >= 1, "No tasks to sample");
    checkArgument(numSamples >= 1, "No number of samples");
    checkArgument(maxStackTraceDepth >= 0, "Negative maximum stack trace depth");

    final int numTasks = tasksToSample.length;

    // Execution attempts (and their IDs) of the running tasks
    final Execution[] executions = new Execution[numTasks];
    final ExecutionAttemptID[] triggerIds = new ExecutionAttemptID[numTasks];

    // Check that all tasks are RUNNING before triggering anything. The
    // triggering itself can still fail afterwards.
    for (int i = 0; i < numTasks; i++) {
        final Execution attempt = tasksToSample[i].getCurrentExecutionAttempt();

        if (attempt == null || attempt.getState() != ExecutionState.RUNNING) {
            return FlinkCompletableFuture.completedExceptionally(new IllegalStateException("Task " + tasksToSample[i].getTaskNameWithSubtaskIndex() + " is not running."));
        }

        executions[i] = attempt;
        triggerIds[i] = attempt.getAttemptId();
    }

    synchronized (lock) {
        if (isShutDown) {
            return FlinkCompletableFuture.completedExceptionally(new IllegalStateException("Shut down"));
        }

        final int sampleId = sampleIdCounter++;

        LOG.debug("Triggering stack trace sample {}", sampleId);

        final PendingStackTraceSample pendingSample = new PendingStackTraceSample(sampleId, triggerIds);

        // The request timeout covers the expected sampling duration plus the
        // configured sample timeout. No cancel messages are sent to the task
        // managers; late responses are simply ignored.
        final long samplingDuration = numSamples * delayBetweenSamples.toMilliseconds();
        final Time requestTimeout = Time.milliseconds(samplingDuration + sampleTimeout);

        // Register the pending sample before triggering anything to
        // prevent races with removing it again.
        pendingSamples.put(sampleId, pendingSample);

        // Trigger all samples
        for (Execution execution : executions) {
            final Future<StackTraceSampleResponse> responseFuture = execution.requestStackTraceSample(
                sampleId,
                numSamples,
                delayBetweenSamples,
                maxStackTraceDepth,
                requestTimeout);

            responseFuture.handleAsync(new BiFunction<StackTraceSampleResponse, Throwable, Void>() {
                @Override
                public Void apply(StackTraceSampleResponse response, Throwable failure) {
                    if (response == null) {
                        // no response available: cancel the whole sample with the reported failure
                        cancelStackTraceSample(sampleId, failure);
                    } else {
                        collectStackTraces(
                            response.getSampleId(),
                            response.getExecutionAttemptID(),
                            response.getSamples());
                    }

                    return null;
                }
            }, executor);
        }

        return pendingSample.getStackTraceSampleFuture();
    }
}
Usage of org.apache.flink.api.common.time.Time in the Apache Flink project.
Class BackPressureStatsTrackerTest, method testTriggerStackTraceSample.
/**
 * Tests simple statistics with fake stack traces: a sample is triggered once, a second trigger
 * is rejected while the first is pending, and completing the sample future makes the computed
 * back pressure ratios available.
 */
@Test
@SuppressWarnings("unchecked")
public void testTriggerStackTraceSample() throws Exception {
    final CompletableFuture<StackTraceSample> pendingSample = new FlinkCompletableFuture<>();

    final StackTraceSampleCoordinator coordinator = mock(StackTraceSampleCoordinator.class);
    when(coordinator.triggerStackTraceSample(
            any(ExecutionVertex[].class),
            anyInt(),
            any(Time.class),
            anyInt()))
        .thenReturn(pendingSample);

    // Executor that runs everything in the calling thread
    final Executor sameThreadExecutor = new Executor() {
        @Override
        public void execute(Runnable command) {
            command.run();
        }
    };

    final ExecutionGraph executionGraph = mock(ExecutionGraph.class);
    when(executionGraph.getState()).thenReturn(JobStatus.RUNNING);
    when(executionGraph.getFutureExecutor()).thenReturn(sameThreadExecutor);

    final ExecutionVertex[] taskVertices = new ExecutionVertex[4];

    final ExecutionJobVertex jobVertex = mock(ExecutionJobVertex.class);
    when(jobVertex.getJobId()).thenReturn(new JobID());
    when(jobVertex.getJobVertexId()).thenReturn(new JobVertexID());
    when(jobVertex.getGraph()).thenReturn(executionGraph);
    when(jobVertex.getTaskVertices()).thenReturn(taskVertices);

    for (int subtask = 0; subtask < taskVertices.length; subtask++) {
        taskVertices[subtask] = mockExecutionVertex(jobVertex, subtask);
    }

    final int numSamples = 100;
    final Time delayBetweenSamples = Time.milliseconds(100L);

    final BackPressureStatsTracker tracker = new BackPressureStatsTracker(
        coordinator, 9999, numSamples, delayBetweenSamples);

    // First trigger must go through to the coordinator
    assertTrue("Failed to trigger", tracker.triggerStackTraceSample(jobVertex));

    verify(coordinator).triggerStackTraceSample(
        eq(taskVertices),
        eq(numSamples),
        eq(delayBetweenSamples),
        eq(BackPressureStatsTracker.MAX_STACK_TRACE_DEPTH));

    // A second trigger while the request is pending must not fire
    assertFalse("Unexpected trigger", tracker.triggerStackTraceSample(jobVertex));

    assertTrue(tracker.getOperatorBackPressureStats(jobVertex).isEmpty());

    // Still exactly one invocation on the coordinator
    verify(coordinator).triggerStackTraceSample(
        eq(taskVertices),
        eq(numSamples),
        eq(delayBetweenSamples),
        eq(BackPressureStatsTracker.MAX_STACK_TRACE_DEPTH));

    assertTrue(tracker.getOperatorBackPressureStats(jobVertex).isEmpty());

    // Build the fake sample result
    final Map<ExecutionAttemptID, List<StackTraceElement[]>> tracesByAttempt = new HashMap<>();

    for (ExecutionVertex taskVertex : taskVertices) {
        final List<StackTraceElement[]> tracesOfTask = new ArrayList<>();

        for (int traceIndex = 0; traceIndex < taskVertices.length; traceIndex++) {
            // Traces until sub task index are back pressured
            tracesOfTask.add(createStackTrace(traceIndex <= taskVertex.getParallelSubtaskIndex()));
        }

        tracesByAttempt.put(taskVertex.getCurrentExecutionAttempt().getAttemptId(), tracesOfTask);
    }

    final int sampleId = 1231;
    final int endTime = 841;

    // Succeed the promise
    pendingSample.complete(new StackTraceSample(sampleId, 0, endTime, tracesByAttempt));

    assertTrue(tracker.getOperatorBackPressureStats(jobVertex).isDefined());

    final OperatorBackPressureStats backPressureStats = tracker.getOperatorBackPressureStats(jobVertex).get();

    // Verify the stats
    assertEquals(sampleId, backPressureStats.getSampleId());
    assertEquals(endTime, backPressureStats.getEndTimestamp());
    assertEquals(taskVertices.length, backPressureStats.getNumberOfSubTasks());

    for (int subtask = 0; subtask < taskVertices.length; subtask++) {
        // Traces until sub task index are back pressured
        assertEquals((subtask + 1) / 4.0, backPressureStats.getBackPressureRatio(subtask), 0.0);
    }
}
Usage of org.apache.flink.api.common.time.Time in the Apache Flink project.
Class ResourceManagerConfiguration, method fromConfiguration.
// --------------------------------------------------------------------------
// Static factory methods
// --------------------------------------------------------------------------

/**
 * Creates a {@link ResourceManagerConfiguration} from the given configuration.
 *
 * <p>Reads {@code AkkaOptions.AKKA_ASK_TIMEOUT}, {@code AkkaOptions.AKKA_WATCH_HEARTBEAT_INTERVAL}
 * and {@code ResourceManagerOptions.JOB_TIMEOUT} as duration strings and converts each of them
 * into a {@link Time} with millisecond resolution.
 *
 * @param configuration configuration to read the timeout, heartbeat interval and job timeout from
 * @return resource manager configuration holding the three parsed values
 * @throws ConfigurationException if one of the values cannot be converted into a finite duration
 */
public static ResourceManagerConfiguration fromConfiguration(Configuration configuration) throws ConfigurationException {
    final Time timeout = parseTime(
        configuration.getString(AkkaOptions.AKKA_ASK_TIMEOUT),
        "timeout",
        AkkaOptions.AKKA_ASK_TIMEOUT);

    final Time heartbeatInterval = parseTime(
        configuration.getString(AkkaOptions.AKKA_WATCH_HEARTBEAT_INTERVAL),
        "heartbeat interval",
        AkkaOptions.AKKA_WATCH_HEARTBEAT_INTERVAL);

    final Time jobTimeout = parseTime(
        configuration.getString(ResourceManagerOptions.JOB_TIMEOUT),
        "job timeout",
        ResourceManagerOptions.JOB_TIMEOUT);

    return new ResourceManagerConfiguration(timeout, heartbeatInterval, jobTimeout);
}

/**
 * Converts a duration string into a {@link Time} in milliseconds.
 *
 * @param durationString textual duration as read from the configuration
 * @param settingName human readable name of the setting, used in the error message
 * @param option configuration option the value was read from, appended to the error message
 * @return the parsed duration
 * @throws ConfigurationException if the string is malformed or denotes an infinite duration
 */
private static Time parseTime(String durationString, String settingName, Object option) throws ConfigurationException {
    try {
        return Time.milliseconds(Duration.apply(durationString).toMillis());
    } catch (IllegalArgumentException e) {
        // Malformed strings raise NumberFormatException (a subclass of IllegalArgumentException);
        // infinite durations raise IllegalArgumentException from toMillis(). Both are
        // configuration errors and must surface as the declared ConfigurationException.
        throw new ConfigurationException("Could not parse the resource manager's " + settingName + " value " + option + '.', e);
    }
}
Aggregations