use of org.apache.flink.runtime.io.network.partition.PartitionNotFoundException in project flink by apache.
the class LocalInputChannelTest method testConcurrentReleaseAndRetriggerPartitionRequest.
/**
 * Verifies that releasing all channels through the SingleInputGate while a
 * partition request is being re-triggered concurrently does not deadlock.
 *
 * <p>The problematic interleaving (reported in FLINK-5228) was:
 * <ul>
 * <li>The SingleInputGate acquires its request lock and tries to release all
 * registered channels. Releasing a channel required the channel's shared
 * request-release lock.</li>
 * <li>A LocalInputChannel concurrently re-triggers a partition request via a
 * Timer thread: it acquires the channel's request-release lock and calls the
 * retrigger callback on the SingleInputGate, which in turn tries to acquire
 * the gate's request lock.</li>
 * </ul>
 *
 * <p>For certain timings this leads to a deadlock, and this test reliably
 * reproduced such a timing. It mostly exercises the formerly buggy
 * implementation and has little general value beyond that; if it becomes
 * obsolete at some point, feel free to remove it.
 *
 * <p>The fix was to not acquire the channel's lock when releasing it and to
 * not perform any input gate callbacks while holding the channel's lock
 * (both were done).
 */
@Test
public void testConcurrentReleaseAndRetriggerPartitionRequest() throws Exception {
    final SingleInputGate gate = new SingleInputGate(
        "test task name",
        new JobID(),
        new IntermediateDataSetID(),
        ResultPartitionType.PIPELINED,
        0,
        1,
        mock(TaskActions.class),
        new UnregisteredTaskMetricsGroup.DummyTaskIOMetricGroup());

    // Every view request first stalls a little, giving the releasing thread
    // time to grab the input gate lock, and then fails with a
    // PartitionNotFoundException so that the request gets re-triggered.
    final Answer<ResultSubpartitionView> delayedPartitionNotFound = new Answer<ResultSubpartitionView>() {
        @Override
        public ResultSubpartitionView answer(InvocationOnMock invocation) throws Throwable {
            Thread.sleep(100);
            throw new PartitionNotFoundException(new ResultPartitionID());
        }
    };

    final ResultPartitionManager partitionManager = mock(ResultPartitionManager.class);
    when(partitionManager.createSubpartitionView(
            any(ResultPartitionID.class),
            anyInt(),
            any(BufferProvider.class),
            any(BufferAvailabilityListener.class)))
        .thenAnswer(delayedPartitionNotFound);

    final LocalInputChannel channel = new LocalInputChannel(
        gate,
        0,
        new ResultPartitionID(),
        partitionManager,
        new TaskEventDispatcher(),
        1,
        1,
        new UnregisteredTaskMetricsGroup.DummyTaskIOMetricGroup());

    gate.setInputChannel(new IntermediateResultPartitionID(), channel);

    // Releases every channel through the gate.
    final Thread releaseThread = new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                gate.releaseAllResources();
            } catch (IOException ignored) {
                // Only the absence of a deadlock matters for this test.
            }
        }
    });

    // Issues the partition request that ends up being re-triggered.
    final Thread requestThread = new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                channel.requestSubpartition(0);
            } catch (IOException | InterruptedException ignored) {
                // Only the absence of a deadlock matters for this test.
            }
        }
    });

    requestThread.start();
    releaseThread.start();

    // Both threads have to terminate; with the deadlock these joins never return.
    releaseThread.join();
    requestThread.join();
}
use of org.apache.flink.runtime.io.network.partition.PartitionNotFoundException in project flink by apache.
the class LocalInputChannelTest method testPartitionRequestExponentialBackoff.
/**
 * Checks the backoff behaviour of re-triggered partition requests on a
 * LocalInputChannel: each re-trigger must be scheduled on the timer with the
 * next expected delay (initial value, doubling, capped at the maximum), and a
 * further re-trigger after the maximum must surface an exception.
 */
@Test
public void testPartitionRequestExponentialBackoff() throws Exception {
    // Backoff configuration as (initial, max).
    final Tuple2<Integer, Integer> backoffConfig = new Tuple2<>(500, 3000);

    // Initial backoff first, then doubling, finally capped at the maximum.
    final int[] expectedDelays = { backoffConfig._1(), 1000, 2000, backoffConfig._2() };

    // Mocked environment of the channel.
    final SingleInputGate inputGate = mock(SingleInputGate.class);
    final BufferProvider bufferProvider = mock(BufferProvider.class);
    when(inputGate.getBufferProvider()).thenReturn(bufferProvider);

    final ResultPartitionManager partitionManager = mock(ResultPartitionManager.class);
    final LocalInputChannel channel = createLocalInputChannel(inputGate, partitionManager, backoffConfig);

    // The partition is never found, so every request fails.
    when(partitionManager.createSubpartitionView(
            eq(channel.partitionId),
            eq(0),
            eq(bufferProvider),
            any(BufferAvailabilityListener.class)))
        .thenThrow(new PartitionNotFoundException(channel.partitionId));

    // A timer that runs each scheduled task right away on the calling thread.
    final Answer<Void> runTaskImmediately = new Answer<Void>() {
        @Override
        public Void answer(InvocationOnMock invocation) throws Throwable {
            final TimerTask scheduledTask = (TimerTask) invocation.getArguments()[0];
            scheduledTask.run();
            return null;
        }
    };
    final Timer timer = mock(Timer.class);
    doAnswer(runTaskImmediately).when(timer).schedule(any(TimerTask.class), anyLong());

    // The initial request goes straight to the partition manager.
    channel.requestSubpartition(0);
    verify(partitionManager).createSubpartitionView(
        eq(channel.partitionId),
        eq(0),
        eq(bufferProvider),
        any(BufferAvailabilityListener.class));

    // Each re-trigger has to be scheduled with the next expected delay.
    for (int attempt = 0; attempt < expectedDelays.length; attempt++) {
        final long expectedDelay = expectedDelays[attempt];
        channel.retriggerSubpartitionRequest(timer, 0);
        verify(timer).schedule(any(TimerTask.class), eq(expectedDelay));
    }

    // Once the backoff exceeds the maximum, an exception is expected.
    try {
        channel.retriggerSubpartitionRequest(timer, 0);
        channel.getNextBuffer();
        fail("Did not throw expected exception.");
    } catch (Exception expected) {
        // This is the expected outcome.
    }
}
use of org.apache.flink.runtime.io.network.partition.PartitionNotFoundException in project flink by apache.
the class TaskManagerTest method testRemotePartitionNotFound.
/**
 * Tests that repeated remote {@link PartitionNotFoundException}s ultimately fail the receiver.
 *
 * <p>A receiver task is submitted whose only input channel points to a remote
 * location on the TaskManager's own data port, using a freshly created
 * partition ID. The test then waits for the final execution state reported to
 * the test actor and checks that the task failed with a
 * {@link PartitionNotFoundException}.
 */
@Test
public void testRemotePartitionNotFound() throws Exception {
    new JavaTestKit(system) {
        {
            ActorGateway jobManagerGateway = null;
            ActorGateway taskManagerGateway = null;

            final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);

            try {
                final IntermediateDataSetID resultId = new IntermediateDataSetID();

                // Create the JM
                final ActorRef jobManagerActor = system.actorOf(
                    Props.create(new SimplePartitionStateLookupJobManagerCreator(leaderSessionID, getTestActor())));
                jobManagerGateway = new AkkaActorGateway(jobManagerActor, leaderSessionID);

                // TaskManager with a known data port and a small request backoff window.
                final int dataPort = NetUtils.getAvailablePort();
                final Configuration taskManagerConfig = new Configuration();
                taskManagerConfig.setInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, dataPort);
                taskManagerConfig.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL, 100);
                taskManagerConfig.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX, 200);

                taskManagerGateway = TestingUtils.createTaskManager(
                    system, jobManagerGateway, taskManagerConfig, false, true);

                // ---------------------------------------------------------------------------------

                // Final alias so that the anonymous Within block below can use the gateway.
                final ActorGateway submissionTarget = taskManagerGateway;

                final JobID jobId = new JobID();
                final JobVertexID vertexId = new JobVertexID();
                final ExecutionAttemptID attemptId = new ExecutionAttemptID();
                final ResultPartitionID partitionId = new ResultPartitionID();

                // Remote location (on the same TM though) for the partition
                final ResultPartitionLocation remoteLocation = ResultPartitionLocation.createRemote(
                    new ConnectionID(new InetSocketAddress("localhost", dataPort), 0));

                final InputChannelDeploymentDescriptor[] channelDescriptors =
                    new InputChannelDeploymentDescriptor[] {
                        new InputChannelDeploymentDescriptor(partitionId, remoteLocation)
                    };

                final InputGateDeploymentDescriptor gateDescriptor = new InputGateDeploymentDescriptor(
                    resultId, ResultPartitionType.PIPELINED, 0, channelDescriptors);

                final TaskDeploymentDescriptor taskDescriptor = createTaskDeploymentDescriptor(
                    jobId,
                    "TestJob",
                    vertexId,
                    attemptId,
                    new SerializedValue<>(new ExecutionConfig()),
                    "Receiver",
                    1,
                    0,
                    1,
                    0,
                    new Configuration(),
                    new Configuration(),
                    Tasks.AgnosticReceiver.class.getName(),
                    Collections.<ResultPartitionDeploymentDescriptor>emptyList(),
                    Collections.singletonList(gateDescriptor),
                    Collections.<BlobKey>emptyList(),
                    Collections.<URL>emptyList(),
                    0);

                new Within(d) {
                    @Override
                    protected void run() {
                        // Submit the task
                        submissionTarget.tell(new SubmitTask(taskDescriptor), testActorGateway);
                        expectMsgClass(Acknowledge.get().getClass());

                        // Wait to be notified about the final execution state by the mock JM
                        final TaskExecutionState finalState = expectMsgClass(TaskExecutionState.class);

                        // The task should fail after repeated requests
                        assertEquals(ExecutionState.FAILED, finalState.getExecutionState());

                        final Throwable failureCause = finalState.getError(ClassLoader.getSystemClassLoader());
                        assertEquals(
                            "Thrown exception was not a PartitionNotFoundException: " + failureCause.getMessage(),
                            PartitionNotFoundException.class,
                            failureCause.getClass());
                    }
                };
            } catch (Exception e) {
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                // Stop both actors, whether or not the test body succeeded.
                TestingUtils.stopActor(taskManagerGateway);
                TestingUtils.stopActor(jobManagerGateway);
            }
        }
    };
}
use of org.apache.flink.runtime.io.network.partition.PartitionNotFoundException in project flink by apache.
the class PartitionRequestServerHandler method channelRead0.
/**
 * Dispatches an incoming client message according to its exact class.
 *
 * <ul>
 * <li>{@code PartitionRequest}: creates a {@code SequenceNumberingViewReader} and
 * requests the subpartition view; a {@code PartitionNotFoundException} is sent
 * back to the requesting receiver.</li>
 * <li>{@code TaskEventRequest}: publishes the event via the task event
 * dispatcher and responds with an error if publishing reports failure.</li>
 * <li>{@code CancelPartitionRequest}: cancels the receiver on the outbound queue.</li>
 * <li>{@code CloseRequest}: closes the outbound queue.</li>
 * <li>Anything else is logged as a warning.</li>
 * </ul>
 *
 * <p>Any {@code Throwable} raised while handling the message is reported via
 * {@code respondWithError(ctx, t)}.
 *
 * @param ctx the channel handler context the message was read on
 * @param msg the received message
 */
@Override
protected void channelRead0(ChannelHandlerContext ctx, NettyMessage msg) throws Exception {
    try {
        // Exact class comparison: subclasses of the message types are not matched.
        final Class<?> messageType = msg.getClass();

        if (messageType == PartitionRequest.class) {
            final PartitionRequest partitionRequest = (PartitionRequest) msg;

            LOG.debug("Read channel on {}: {}.", ctx.channel().localAddress(), partitionRequest);

            try {
                final SequenceNumberingViewReader viewReader =
                    new SequenceNumberingViewReader(partitionRequest.receiverId, outboundQueue);

                viewReader.requestSubpartitionView(
                    partitionProvider,
                    partitionRequest.partitionId,
                    partitionRequest.queueIndex,
                    bufferPool);
            } catch (PartitionNotFoundException notFound) {
                // Report the missing partition to the specific receiver that asked for it.
                respondWithError(ctx, notFound, partitionRequest.receiverId);
            }
            return;
        }

        if (messageType == TaskEventRequest.class) {
            final TaskEventRequest eventRequest = (TaskEventRequest) msg;

            final boolean published =
                taskEventDispatcher.publish(eventRequest.partitionId, eventRequest.event);
            if (!published) {
                respondWithError(
                    ctx,
                    new IllegalArgumentException("Task event receiver not found."),
                    eventRequest.receiverId);
            }
            return;
        }

        if (messageType == CancelPartitionRequest.class) {
            final CancelPartitionRequest cancelRequest = (CancelPartitionRequest) msg;
            outboundQueue.cancel(cancelRequest.receiverId);
            return;
        }

        if (messageType == CloseRequest.class) {
            outboundQueue.close();
            return;
        }

        LOG.warn("Received unexpected client request: {}", msg);
    } catch (Throwable t) {
        respondWithError(ctx, t);
    }
}
use of org.apache.flink.runtime.io.network.partition.PartitionNotFoundException in project flink by apache.
the class LocalInputChannel method requestSubpartition.
// ------------------------------------------------------------------------
// Consume
// ------------------------------------------------------------------------
/**
 * Requests the local subpartition view from the partition manager, unless a
 * view has already been set on this channel.
 *
 * <p>If the partition cannot be found and the backoff can still be increased,
 * the request is re-triggered through the input gate; otherwise the
 * {@code PartitionNotFoundException} is rethrown.
 *
 * @param subpartitionIndex index of the subpartition to request
 * @throws IOException if the partition manager returns no view, or if the
 *         partition was not found and the backoff could not be increased
 * @throws InterruptedException declared by the overridden method signature
 * @throws IllegalStateException (via {@code checkState}, presumably) if the
 *         channel has already been released
 */
@Override
void requestSubpartition(int subpartitionIndex) throws IOException, InterruptedException {
    boolean shouldRetrigger = false;

    // The lock is required to request only once in the presence of retriggered requests.
    synchronized (requestLock) {
        checkState(!isReleased, "LocalInputChannel has been released already");

        if (this.subpartitionView == null) {
            LOG.debug("{}: Requesting LOCAL subpartition {} of partition {}.",
                this, subpartitionIndex, partitionId);

            try {
                final ResultSubpartitionView requestedView = partitionManager.createSubpartitionView(
                    partitionId, subpartitionIndex, inputGate.getBufferProvider(), this);

                if (requestedView == null) {
                    throw new IOException("Error requesting subpartition.");
                }

                // make the subpartition view visible
                this.subpartitionView = requestedView;

                // check if the channel was released in the meantime
                if (isReleased) {
                    requestedView.releaseAllResources();
                    this.subpartitionView = null;
                }
            } catch (PartitionNotFoundException notFound) {
                if (!increaseBackoff()) {
                    throw notFound;
                }
                shouldRetrigger = true;
            }
        }
    }

    // Do the input gate callback only after the request lock has been left:
    // calling back while holding the channel's lock can deadlock with a
    // concurrent release of the channel via the input gate.
    if (shouldRetrigger) {
        inputGate.retriggerPartitionRequest(partitionId.getPartitionId());
    }
}
Aggregations