use of org.apache.tez.runtime.library.common.InputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method testReducerHealth_4.
/**
 * Scenario
 *    - reducer has progressed enough
 *    - failures have happened randomly in nodes, but tasks are completed
 *    - failures start happening after that in last fetch
 *    - no of attempts failing does not exceed maxFailedUniqueFetches (5)
 *    - Stalled
 * Expected result
 *    - reducer is stalled. But since errors are not seen across multiple
 *      nodes, it is left to the AM to restart producer. Do not kill consumer.
 *    - once lastProgressTime is pushed back further (300s instead of 100s),
 *      the next failure is expected to report exactly one exception.
 */
@Test(timeout = 60000)
public void testReducerHealth_4() throws IOException {
  long startTime = System.currentTimeMillis() - 500000;
  Shuffle shuffle = mock(Shuffle.class);
  final ShuffleSchedulerForTest scheduler = createScheduler(startTime, 320, shuffle);
  int totalProducerNodes = 20;

  // Generate 320 events, spread round-robin over the producer hosts
  for (int i = 0; i < 320; i++) {
    CompositeInputAttemptIdentifier inputAttemptIdentifier =
        new CompositeInputAttemptIdentifier(i, 0, "attempt_", 1);
    scheduler.addKnownMapOutput("host" + (i % totalProducerNodes), 10000, i,
        inputAttemptIdentifier);
  }

  // The first 64 inputs (20% of 320) each fail 3 times, but are able to proceed further
  for (int i = 0; i < 64; i++) {
    InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(i, 0, "attempt_");
    for (int failure = 0; failure < 3; failure++) {
      scheduler.copyFailed(inputAttemptIdentifier,
          new MapHost("host" + (i % totalProducerNodes), 10000, i, 1), false, true, false);
    }
    MapOutput mapOutput = MapOutput.createMemoryMapOutput(inputAttemptIdentifier,
        mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
    scheduler.copySucceeded(inputAttemptIdentifier,
        new MapHost("host" + (i % totalProducerNodes), 10000, i, 1), 100, 200,
        startTime + (i * 100), mapOutput, false);
  }

  // Inputs 64..318 succeed, so 319 of the 320 inputs are now done
  for (int i = 64; i < 319; i++) {
    InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(i, 0, "attempt_");
    MapOutput mapOutput = MapOutput.createMemoryMapOutput(inputAttemptIdentifier,
        mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
    scheduler.copySucceeded(inputAttemptIdentifier,
        new MapHost("host" + (i % totalProducerNodes), 10000, i, 1), 100, 200,
        startTime + (i * 100), mapOutput, false);
  }

  // 1 fails (last fetch)
  InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(319, 0, "attempt_");
  scheduler.copyFailed(inputAttemptIdentifier,
      new MapHost("host" + (319 % totalProducerNodes), 10000, 319, 1), false, true, false);

  // stall the shuffle (but within limits)
  scheduler.lastProgressTime = System.currentTimeMillis() - 100000;

  // JUnit's contract is assertEquals(expected, actual); keeping that order makes the
  // failure message read correctly.
  assertEquals(1, scheduler.remainingMaps.get());

  // Retry for 3 more times
  for (int retry = 0; retry < 3; retry++) {
    scheduler.copyFailed(inputAttemptIdentifier,
        new MapHost("host" + (319 % totalProducerNodes), 10000, 319, 1), false, true, false);
  }

  // failedShufflesSinceLastCompletion has crossed the limits. 20% of other nodes had failures as
  // well. However, it has failed only in one host. So this should proceed
  // until AM decides to restart the producer.
  verify(shuffle, times(0)).reportException(any(Throwable.class));

  // Stall the shuffle for longer: 300s since last progress instead of 100s.
  // NOTE(review): presumably this is now beyond the scheduler's stall limit, since the
  // next failure is expected to report an exception - confirm against ShuffleScheduler.
  scheduler.lastProgressTime = System.currentTimeMillis() - 300000;
  scheduler.copyFailed(inputAttemptIdentifier,
      new MapHost("host" + (319 % totalProducerNodes), 10000, 319, 1), false, true, false);
  verify(shuffle, times(1)).reportException(any(Throwable.class));
}
use of org.apache.tez.runtime.library.common.InputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method testShutdown.
/**
 * Runs {@code scheduler.start()} on a single background thread, registers
 * {@code numInputs} inputs (one per host), reports a successful copy for all
 * but the last one, and then closes the scheduler. The test passes when the
 * future wrapping {@code start()} completes without throwing.
 */
@Test(timeout = 5000)
public void testShutdown() throws Exception {
  final int numInputs = 10;
  InputContext inputContext = createTezInputContext();
  Configuration conf = new TezConfiguration();
  Shuffle shuffle = mock(Shuffle.class);
  MergeManager mergeManager = mock(MergeManager.class);
  final ShuffleSchedulerForTest scheduler = new ShuffleSchedulerForTest(inputContext, conf,
      numInputs, shuffle, mergeManager, mergeManager, System.currentTimeMillis(), null, false,
      0, "srcName");

  ExecutorService executor = Executors.newFixedThreadPool(1);
  try {
    // start() is driven from the executor thread; the returned future is awaited below.
    Callable<Void> startScheduler = new Callable<Void>() {
      @Override
      public Void call() throws Exception {
        scheduler.start();
        return null;
      }
    };
    Future<Void> startResult = executor.submit(startScheduler);

    // Register one known map output per host.
    InputAttemptIdentifier[] attempts = new InputAttemptIdentifier[numInputs];
    for (int idx = 0; idx < numInputs; idx++) {
      CompositeInputAttemptIdentifier attempt =
          new CompositeInputAttemptIdentifier(idx, 0, "attempt_", 1);
      scheduler.addKnownMapOutput("host" + idx, 10000, 1, attempt);
      attempts[idx] = attempt;
    }

    // Snapshot the hosts the scheduler knows about, in the map's iteration order.
    MapHost[] hosts = new MapHost[numInputs];
    int next = 0;
    for (MapHost host : scheduler.mapLocations.values()) {
      hosts[next] = host;
      next++;
    }

    // Copy succeeded for 1 less host
    final int completed = numInputs - 1;
    for (int idx = 0; idx < completed; idx++) {
      MapOutput fetched = MapOutput.createMemoryMapOutput(attempts[idx],
          mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
      scheduler.copySucceeded(attempts[idx], hosts[idx], 20, 25, 100, fetched, false);
      scheduler.freeHost(hosts[idx]);
    }

    scheduler.close();

    // Ensure the executor exits, and without an error.
    startResult.get();
  } finally {
    // close() is invoked again here so the scheduler is also closed when the try body throws.
    scheduler.close();
    executor.shutdownNow();
  }
}
use of org.apache.tez.runtime.library.common.InputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method testReducerHealth_2.
/**
 * Scenario
 *    - reducer has progressed enough
 *    - failures start happening after that
 *    - no of attempts failing exceeds maxFailedUniqueFetches (5)
 *    - Has not stalled
 * Expected result
 *    - Since reducer is not stalled, it should continue without error
 *
 * When reducer stalls, wait until enough retries are done and throw exception
 */
@Test(timeout = 60000)
public void testReducerHealth_2() throws IOException, InterruptedException {
  final int totalProducerNodes = 20;
  long startTime = System.currentTimeMillis() - 500000;
  Shuffle shuffle = mock(Shuffle.class);
  final ShuffleSchedulerForTest scheduler = createScheduler(startTime, 320, shuffle);

  // Generate 0-200 events
  for (int task = 0; task < 200; task++) {
    String hostName = "host" + (task % totalProducerNodes);
    CompositeInputAttemptIdentifier known =
        new CompositeInputAttemptIdentifier(task, 0, "attempt_", 1);
    scheduler.addKnownMapOutput(hostName, 10000, task, known);
  }
  assertEquals(320, scheduler.remainingMaps.get());

  // Generate 200-320 events with empty partitions
  for (int task = 200; task < 320; task++) {
    InputAttemptIdentifier empty = new InputAttemptIdentifier(task, 0, "attempt_");
    scheduler.copySucceeded(empty, null, 0, 0, 0, null, true);
  }

  // 120 are successful. so remaining is 200
  assertEquals(200, scheduler.remainingMaps.get());

  // 200 pending to be downloaded. Download 190.
  for (int task = 0; task < 190; task++) {
    InputAttemptIdentifier done = new InputAttemptIdentifier(task, 0, "attempt_");
    MapOutput fetched = MapOutput.createMemoryMapOutput(done,
        mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
    MapHost source = new MapHost("host" + (task % totalProducerNodes), 10000, task, 1);
    scheduler.copySucceeded(done, source, 100, 200, startTime + (task * 100), fetched, false);
  }
  assertEquals(10, scheduler.remainingMaps.get());

  // 10 fails
  for (int task = 190; task < 200; task++) {
    InputAttemptIdentifier failed = new InputAttemptIdentifier(task, 0, "attempt_");
    scheduler.copyFailed(failed,
        new MapHost("host" + (task % totalProducerNodes), 10000, task, 1), false, true, false);
  }

  // Shuffle has not stalled. so no issues.
  verify(scheduler.reporter, times(0)).reportException(any(Throwable.class));

  // stall shuffle
  scheduler.lastProgressTime = System.currentTimeMillis() - 250000;

  InputAttemptIdentifier current = new InputAttemptIdentifier(190, 0, "attempt_");
  scheduler.copyFailed(current,
      new MapHost("host" + (190 % totalProducerNodes), 10000, 190, 1), false, true, false);

  // Even when it is stalled, need (320 - 300 = 20) * 3 = 60 failures
  verify(scheduler.reporter, times(0)).reportException(any(Throwable.class));
  assertEquals(11, scheduler.failedShufflesSinceLastCompletion);

  // fail to download 50 more times across attempts (10 inputs x 5 failures each)
  for (int task = 190; task < 200; task++) {
    current = new InputAttemptIdentifier(task, 0, "attempt_");
    for (int failure = 0; failure < 5; failure++) {
      scheduler.copyFailed(current,
          new MapHost("host" + (task % totalProducerNodes), 10000, task, 1), false, true, false);
    }
  }

  assertEquals(61, scheduler.failedShufflesSinceLastCompletion);
  assertEquals(10, scheduler.remainingMaps.get());

  // NOTE(review): atLeast(0) can never fail, so this verification asserts nothing;
  // confirm whether a stricter count was intended here.
  verify(shuffle, atLeast(0)).reportException(any(Throwable.class));

  // fail another 30 (10 inputs x 3 failures each)
  for (int task = 110; task < 120; task++) {
    current = new InputAttemptIdentifier(task, 0, "attempt_");
    for (int failure = 0; failure < 3; failure++) {
      scheduler.copyFailed(current,
          new MapHost("host" + (task % totalProducerNodes), 10000, task, 1), false, true, false);
    }
  }

  // Should fail now due to fetcherHealthy. (stall has already happened and
  // these are the only pending tasks)
  verify(shuffle, atLeast(1)).reportException(any(Throwable.class));
}
use of org.apache.tez.runtime.library.common.InputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method testReducerHealth_7.
/**
 * Scenario
 *    - reducer has not progressed enough
 *    - fetch fails >
 *      TEZ_RUNTIME_SHUFFLE_ACCEPTABLE_HOST_FETCH_FAILURE_FRACTION
 * Expected result
 *    - fail the reducer
 */
@Test(timeout = 60000)
public void testReducerHealth_7() throws IOException {
  final int totalProducerNodes = 20;
  long startTime = System.currentTimeMillis() - 500000;
  Shuffle shuffle = mock(Shuffle.class);
  final ShuffleSchedulerForTest scheduler = createScheduler(startTime, 320, shuffle);

  // Generate 320 events
  for (int task = 0; task < 320; task++) {
    String hostName = "host" + (task % totalProducerNodes);
    CompositeInputAttemptIdentifier known =
        new CompositeInputAttemptIdentifier(task, 0, "attempt_", 1);
    scheduler.addKnownMapOutput(hostName, 10000, task, known);
  }

  // 100 succeeds
  for (int task = 0; task < 100; task++) {
    InputAttemptIdentifier done = new InputAttemptIdentifier(task, 0, "attempt_");
    MapOutput fetched = MapOutput.createMemoryMapOutput(done,
        mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
    MapHost source = new MapHost("host" + (task % totalProducerNodes), 10000, task, 1);
    scheduler.copySucceeded(done, source, 100, 200, startTime + (task * 100), fetched, false);
  }

  // 99 fails, each of them 4 times in a row
  for (int task = 100; task < 199; task++) {
    InputAttemptIdentifier failed = new InputAttemptIdentifier(task, 0, "attempt_");
    for (int failure = 0; failure < 4; failure++) {
      scheduler.copyFailed(failed,
          new MapHost("host" + (task % totalProducerNodes), 10000, task, 1), false, true, false);
    }
  }

  // At least one exception must have been reported to the Shuffle mock by now.
  verify(shuffle, atLeast(1)).reportException(any(Throwable.class));
}
use of org.apache.tez.runtime.library.common.InputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method testNumParallelScheduledFetchers.
/**
 * Configures the scheduler for 10 parallel copies, registers 50 inputs on 50
 * distinct hosts while {@code scheduler.start()} runs on a background thread,
 * and then checks that exactly 10 fetchers were created.
 *
 * NOTE(review): the check relies on a fixed 2 second sleep being long enough
 * for scheduling to happen; the final {@code true} constructor argument
 * presumably keeps fetchers from completing - confirm against
 * ShuffleSchedulerForTest.
 */
@Test(timeout = 10000)
public void testNumParallelScheduledFetchers() throws IOException, InterruptedException {
  InputContext inputContext = createTezInputContext();
  Configuration conf = new TezConfiguration();
  // Allow 10 parallel copies at once.
  conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, 10);
  int numInputs = 50;
  Shuffle shuffle = mock(Shuffle.class);
  MergeManager mergeManager = mock(MergeManager.class);

  final ShuffleSchedulerForTest scheduler = new ShuffleSchedulerForTest(inputContext, conf,
      numInputs, shuffle, mergeManager, mergeManager, System.currentTimeMillis(), null, false,
      0, "srcName", true);

  Future<Void> executorFuture = null;
  ExecutorService executor = Executors.newFixedThreadPool(1);
  try {
    executorFuture = executor.submit(new Callable<Void>() {
      @Override
      public Void call() throws Exception {
        scheduler.start();
        return null;
      }
    });

    // Schedule all copies. The identifiers are only handed to the scheduler; nothing in
    // this test reads them back, so no local array is kept.
    for (int i = 0; i < numInputs; i++) {
      CompositeInputAttemptIdentifier inputAttemptIdentifier =
          new CompositeInputAttemptIdentifier(i, 0, "attempt_", 1);
      scheduler.addKnownMapOutput("host" + i, 10000, 1, inputAttemptIdentifier);
    }

    // Sleep for a bit to allow the copies to be scheduled.
    // Upper-case 'L' suffix: a lower-case 'l' is easily misread as the digit 1.
    Thread.sleep(2000L);
    assertEquals(10, scheduler.numFetchersCreated.get());
  } finally {
    scheduler.close();
    if (executorFuture != null) {
      executorFuture.cancel(true);
    }
    executor.shutdownNow();
  }
}
Aggregations