use of org.apache.flink.runtime.io.network.partition.ResultPartitionID in project flink by apache.
the class CancelPartitionRequestTest method testDuplicateCancel.
@Test
public void testDuplicateCancel() throws Exception {
NettyServerAndClient serverAndClient = null;
try {
final TestPooledBufferProvider outboundBuffers = new TestPooledBufferProvider(16);
ResultPartitionManager partitions = mock(ResultPartitionManager.class);
ResultPartitionID pid = new ResultPartitionID();
final CountDownLatch sync = new CountDownLatch(1);
final ResultSubpartitionView view = spy(new InfiniteSubpartitionView(outboundBuffers, sync));
// Return infinite subpartition
when(partitions.createSubpartitionView(eq(pid), eq(0), any(BufferProvider.class), any(BufferAvailabilityListener.class))).thenAnswer(new Answer<ResultSubpartitionView>() {
@Override
public ResultSubpartitionView answer(InvocationOnMock invocationOnMock) throws Throwable {
BufferAvailabilityListener listener = (BufferAvailabilityListener) invocationOnMock.getArguments()[3];
listener.notifyBuffersAvailable(Long.MAX_VALUE);
return view;
}
});
PartitionRequestProtocol protocol = new PartitionRequestProtocol(partitions, mock(TaskEventDispatcher.class), mock(NetworkBufferPool.class));
serverAndClient = initServerAndClient(protocol);
Channel ch = connect(serverAndClient);
// Request for non-existing input channel => results in cancel request
InputChannelID inputChannelId = new InputChannelID();
ch.writeAndFlush(new PartitionRequest(pid, 0, inputChannelId)).await();
// Wait for the notification
if (!sync.await(TestingUtils.TESTING_DURATION().toMillis(), TimeUnit.MILLISECONDS)) {
fail("Timed out after waiting for " + TestingUtils.TESTING_DURATION().toMillis() + " ms to be notified about cancelled partition.");
}
ch.writeAndFlush(new CancelPartitionRequest(inputChannelId)).await();
ch.close();
NettyTestUtil.awaitClose(ch);
verify(view, times(1)).releaseAllResources();
verify(view, times(0)).notifySubpartitionConsumed();
} finally {
shutdown(serverAndClient);
}
}
use of org.apache.flink.runtime.io.network.partition.ResultPartitionID in project flink by apache.
the class ClientTransportErrorHandlingTest method testExceptionOnWrite.
/**
* Verifies that failed client requests via {@link PartitionRequestClient} are correctly
* attributed to the respective {@link RemoteInputChannel}.
*/
@Test
public void testExceptionOnWrite() throws Exception {
NettyProtocol protocol = new NettyProtocol() {
@Override
public ChannelHandler[] getServerChannelHandlers() {
return new ChannelHandler[0];
}
@Override
public ChannelHandler[] getClientChannelHandlers() {
return new PartitionRequestProtocol(mock(ResultPartitionProvider.class), mock(TaskEventDispatcher.class), mock(NetworkBufferPool.class)).getClientChannelHandlers();
}
};
// We need a real server and client in this test, because Netty's EmbeddedChannel is
// not failing the ChannelPromise of failed writes.
NettyServerAndClient serverAndClient = initServerAndClient(protocol, createConfig());
Channel ch = connect(serverAndClient);
PartitionRequestClientHandler handler = getClientHandler(ch);
// Last outbound handler throws Exception after 1st write
ch.pipeline().addFirst(new ChannelOutboundHandlerAdapter() {
int writeNum = 0;
@Override
public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise promise) throws Exception {
if (writeNum >= 1) {
throw new RuntimeException("Expected test exception.");
}
writeNum++;
ctx.write(msg, promise);
}
});
PartitionRequestClient requestClient = new PartitionRequestClient(ch, handler, mock(ConnectionID.class), mock(PartitionRequestClientFactory.class));
// Create input channels
RemoteInputChannel[] rich = new RemoteInputChannel[] { createRemoteInputChannel(), createRemoteInputChannel() };
final CountDownLatch sync = new CountDownLatch(1);
// Do this with explicit synchronization. Otherwise this is not robust against slow timings
// of the callback (e.g. we cannot just verify that it was called once, because there is
// a chance that we do this too early).
doAnswer(new Answer<Void>() {
@Override
public Void answer(InvocationOnMock invocation) throws Throwable {
sync.countDown();
return null;
}
}).when(rich[1]).onError(isA(LocalTransportException.class));
// First request is successful
ChannelFuture f = requestClient.requestSubpartition(new ResultPartitionID(), 0, rich[0], 0);
assertTrue(f.await().isSuccess());
// Second request is *not* successful
f = requestClient.requestSubpartition(new ResultPartitionID(), 0, rich[1], 0);
assertFalse(f.await().isSuccess());
// Only the second channel should be notified about the error
verify(rich[0], times(0)).onError(any(LocalTransportException.class));
// Wait for the notification
if (!sync.await(TestingUtils.TESTING_DURATION().toMillis(), TimeUnit.MILLISECONDS)) {
fail("Timed out after waiting for " + TestingUtils.TESTING_DURATION().toMillis() + " ms to be notified about the channel error.");
}
shutdown(serverAndClient);
}
use of org.apache.flink.runtime.io.network.partition.ResultPartitionID in project flink by apache.
the class TaskManagerTest method testCancellingDependentAndStateUpdateFails.
@Test
public void testCancellingDependentAndStateUpdateFails() {
// this tests creates two tasks. the sender sends data, and fails to send the
// state update back to the job manager
// the second one blocks to be canceled
new JavaTestKit(system) {
{
ActorGateway jobManager = null;
ActorGateway taskManager = null;
final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
try {
final JobID jid = new JobID();
JobVertexID vid1 = new JobVertexID();
JobVertexID vid2 = new JobVertexID();
final ExecutionAttemptID eid1 = new ExecutionAttemptID();
final ExecutionAttemptID eid2 = new ExecutionAttemptID();
ActorRef jm = system.actorOf(Props.create(new SimpleLookupFailingUpdateJobManagerCreator(leaderSessionID, eid2)));
jobManager = new AkkaActorGateway(jm, leaderSessionID);
taskManager = TestingUtils.createTaskManager(system, jobManager, new Configuration(), true, true);
final ActorGateway tm = taskManager;
IntermediateResultPartitionID partitionId = new IntermediateResultPartitionID();
List<ResultPartitionDeploymentDescriptor> irpdd = new ArrayList<ResultPartitionDeploymentDescriptor>();
irpdd.add(new ResultPartitionDeploymentDescriptor(new IntermediateDataSetID(), partitionId, ResultPartitionType.PIPELINED, 1, 1, true));
InputGateDeploymentDescriptor ircdd = new InputGateDeploymentDescriptor(new IntermediateDataSetID(), ResultPartitionType.PIPELINED, 0, new InputChannelDeploymentDescriptor[] { new InputChannelDeploymentDescriptor(new ResultPartitionID(partitionId, eid1), ResultPartitionLocation.createLocal()) });
final TaskDeploymentDescriptor tdd1 = createTaskDeploymentDescriptor(jid, "TestJob", vid1, eid1, new SerializedValue<>(new ExecutionConfig()), "Sender", 1, 0, 1, 0, new Configuration(), new Configuration(), Tasks.Sender.class.getName(), irpdd, Collections.<InputGateDeploymentDescriptor>emptyList(), new ArrayList<BlobKey>(), Collections.<URL>emptyList(), 0);
final TaskDeploymentDescriptor tdd2 = createTaskDeploymentDescriptor(jid, "TestJob", vid2, eid2, new SerializedValue<>(new ExecutionConfig()), "Receiver", 7, 2, 7, 0, new Configuration(), new Configuration(), Tasks.BlockingReceiver.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.singletonList(ircdd), new ArrayList<BlobKey>(), Collections.<URL>emptyList(), 0);
new Within(d) {
@Override
protected void run() {
try {
Future<Object> t1Running = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(eid1), timeout);
Future<Object> t2Running = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskIsRunning(eid2), timeout);
tm.tell(new SubmitTask(tdd2), testActorGateway);
tm.tell(new SubmitTask(tdd1), testActorGateway);
expectMsgEquals(Acknowledge.get());
expectMsgEquals(Acknowledge.get());
Await.ready(t1Running, d);
Await.ready(t2Running, d);
tm.tell(TestingTaskManagerMessages.getRequestRunningTasksMessage(), testActorGateway);
Map<ExecutionAttemptID, Task> tasks = expectMsgClass(TestingTaskManagerMessages.ResponseRunningTasks.class).asJava();
Task t1 = tasks.get(eid1);
Task t2 = tasks.get(eid2);
tm.tell(new CancelTask(eid2), testActorGateway);
expectMsgEquals(Acknowledge.get());
if (t2 != null) {
Future<Object> response = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskRemoved(eid2), timeout);
Await.ready(response, d);
}
if (t1 != null) {
if (t1.getExecutionState() == ExecutionState.RUNNING) {
tm.tell(new CancelTask(eid1), testActorGateway);
expectMsgEquals(Acknowledge.get());
}
Future<Object> response = tm.ask(new TestingTaskManagerMessages.NotifyWhenTaskRemoved(eid1), timeout);
Await.ready(response, d);
}
tm.tell(TestingTaskManagerMessages.getRequestRunningTasksMessage(), testActorGateway);
tasks = expectMsgClass(TestingTaskManagerMessages.ResponseRunningTasks.class).asJava();
assertEquals(0, tasks.size());
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}
};
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
} finally {
// shut down the actors
TestingUtils.stopActor(taskManager);
TestingUtils.stopActor(jobManager);
}
}
};
}
use of org.apache.flink.runtime.io.network.partition.ResultPartitionID in project flink by apache.
the class TaskManagerTest method testRemotePartitionNotFound.
/**
* Tests that repeated remote {@link PartitionNotFoundException}s ultimately fail the receiver.
*/
@Test
public void testRemotePartitionNotFound() throws Exception {
new JavaTestKit(system) {
{
ActorGateway jobManager = null;
ActorGateway taskManager = null;
final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
try {
final IntermediateDataSetID resultId = new IntermediateDataSetID();
// Create the JM
ActorRef jm = system.actorOf(Props.create(new SimplePartitionStateLookupJobManagerCreator(leaderSessionID, getTestActor())));
jobManager = new AkkaActorGateway(jm, leaderSessionID);
final int dataPort = NetUtils.getAvailablePort();
Configuration config = new Configuration();
config.setInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, dataPort);
config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL, 100);
config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX, 200);
taskManager = TestingUtils.createTaskManager(system, jobManager, config, false, true);
// ---------------------------------------------------------------------------------
final ActorGateway tm = taskManager;
final JobID jid = new JobID();
final JobVertexID vid = new JobVertexID();
final ExecutionAttemptID eid = new ExecutionAttemptID();
final ResultPartitionID partitionId = new ResultPartitionID();
// Remote location (on the same TM though) for the partition
final ResultPartitionLocation loc = ResultPartitionLocation.createRemote(new ConnectionID(new InetSocketAddress("localhost", dataPort), 0));
final InputChannelDeploymentDescriptor[] icdd = new InputChannelDeploymentDescriptor[] { new InputChannelDeploymentDescriptor(partitionId, loc) };
final InputGateDeploymentDescriptor igdd = new InputGateDeploymentDescriptor(resultId, ResultPartitionType.PIPELINED, 0, icdd);
final TaskDeploymentDescriptor tdd = createTaskDeploymentDescriptor(jid, "TestJob", vid, eid, new SerializedValue<>(new ExecutionConfig()), "Receiver", 1, 0, 1, 0, new Configuration(), new Configuration(), Tasks.AgnosticReceiver.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.singletonList(igdd), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), 0);
new Within(d) {
@Override
protected void run() {
// Submit the task
tm.tell(new SubmitTask(tdd), testActorGateway);
expectMsgClass(Acknowledge.get().getClass());
// Wait to be notified about the final execution state by the mock JM
TaskExecutionState msg = expectMsgClass(TaskExecutionState.class);
// The task should fail after repeated requests
assertEquals(ExecutionState.FAILED, msg.getExecutionState());
Throwable t = msg.getError(ClassLoader.getSystemClassLoader());
assertEquals("Thrown exception was not a PartitionNotFoundException: " + t.getMessage(), PartitionNotFoundException.class, t.getClass());
}
};
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
} finally {
TestingUtils.stopActor(taskManager);
TestingUtils.stopActor(jobManager);
}
}
};
}
use of org.apache.flink.runtime.io.network.partition.ResultPartitionID in project flink by apache.
the class TaskTest method testTriggerPartitionStateUpdate.
/**
* Tests the trigger partition state update future completions.
*/
@Test
public void testTriggerPartitionStateUpdate() throws Exception {
IntermediateDataSetID resultId = new IntermediateDataSetID();
ResultPartitionID partitionId = new ResultPartitionID();
LibraryCacheManager libCache = mock(LibraryCacheManager.class);
when(libCache.getClassLoader(any(JobID.class))).thenReturn(getClass().getClassLoader());
PartitionProducerStateChecker partitionChecker = mock(PartitionProducerStateChecker.class);
ResultPartitionConsumableNotifier consumableNotifier = mock(ResultPartitionConsumableNotifier.class);
NetworkEnvironment network = mock(NetworkEnvironment.class);
when(network.getResultPartitionManager()).thenReturn(mock(ResultPartitionManager.class));
when(network.getDefaultIOMode()).thenReturn(IOManager.IOMode.SYNC);
when(network.createKvStateTaskRegistry(any(JobID.class), any(JobVertexID.class))).thenReturn(mock(TaskKvStateRegistry.class));
createTask(InvokableBlockingInInvoke.class, libCache, network, consumableNotifier, partitionChecker, Executors.directExecutor());
// Test all branches of trigger partition state check
{
// Reset latches
createQueuesAndActors();
// PartitionProducerDisposedException
Task task = createTask(InvokableBlockingInInvoke.class, libCache, network, consumableNotifier, partitionChecker, Executors.directExecutor());
FlinkCompletableFuture<ExecutionState> promise = new FlinkCompletableFuture<>();
when(partitionChecker.requestPartitionProducerState(eq(task.getJobID()), eq(resultId), eq(partitionId))).thenReturn(promise);
task.triggerPartitionProducerStateCheck(task.getJobID(), resultId, partitionId);
promise.completeExceptionally(new PartitionProducerDisposedException(partitionId));
assertEquals(ExecutionState.CANCELING, task.getExecutionState());
}
{
// Reset latches
createQueuesAndActors();
// Any other exception
Task task = createTask(InvokableBlockingInInvoke.class, libCache, network, consumableNotifier, partitionChecker, Executors.directExecutor());
FlinkCompletableFuture<ExecutionState> promise = new FlinkCompletableFuture<>();
when(partitionChecker.requestPartitionProducerState(eq(task.getJobID()), eq(resultId), eq(partitionId))).thenReturn(promise);
task.triggerPartitionProducerStateCheck(task.getJobID(), resultId, partitionId);
promise.completeExceptionally(new RuntimeException("Any other exception"));
assertEquals(ExecutionState.FAILED, task.getExecutionState());
}
{
// Reset latches
createQueuesAndActors();
// TimeoutException handled special => retry
Task task = createTask(InvokableBlockingInInvoke.class, libCache, network, consumableNotifier, partitionChecker, Executors.directExecutor());
SingleInputGate inputGate = mock(SingleInputGate.class);
when(inputGate.getConsumedResultId()).thenReturn(resultId);
try {
task.startTaskThread();
awaitLatch.await();
setInputGate(task, inputGate);
FlinkCompletableFuture<ExecutionState> promise = new FlinkCompletableFuture<>();
when(partitionChecker.requestPartitionProducerState(eq(task.getJobID()), eq(resultId), eq(partitionId))).thenReturn(promise);
task.triggerPartitionProducerStateCheck(task.getJobID(), resultId, partitionId);
promise.completeExceptionally(new TimeoutException());
assertEquals(ExecutionState.RUNNING, task.getExecutionState());
verify(inputGate, times(1)).retriggerPartitionRequest(eq(partitionId.getPartitionId()));
} finally {
task.getExecutingThread().interrupt();
task.getExecutingThread().join();
}
}
{
// Reset latches
createQueuesAndActors();
// Success
Task task = createTask(InvokableBlockingInInvoke.class, libCache, network, consumableNotifier, partitionChecker, Executors.directExecutor());
SingleInputGate inputGate = mock(SingleInputGate.class);
when(inputGate.getConsumedResultId()).thenReturn(resultId);
try {
task.startTaskThread();
awaitLatch.await();
setInputGate(task, inputGate);
FlinkCompletableFuture<ExecutionState> promise = new FlinkCompletableFuture<>();
when(partitionChecker.requestPartitionProducerState(eq(task.getJobID()), eq(resultId), eq(partitionId))).thenReturn(promise);
task.triggerPartitionProducerStateCheck(task.getJobID(), resultId, partitionId);
promise.complete(ExecutionState.RUNNING);
assertEquals(ExecutionState.RUNNING, task.getExecutionState());
verify(inputGate, times(1)).retriggerPartitionRequest(eq(partitionId.getPartitionId()));
} finally {
task.getExecutingThread().interrupt();
task.getExecutingThread().join();
}
}
}
Aggregations