use of org.apache.tez.runtime.library.common.CompositeInputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method testReducerHealth_3.
/**
 * Scenario
 * - reducer has progressed enough
 * - failures start happening after that in last fetch
 * - no of attempts failing does not exceed maxFailedUniqueFetches (5)
 * - Stalled
 * Expected result
 * - Since reducer is stalled and if failures haven't happened across nodes,
 * it should be fine to proceed. AM would restart source task eventually.
 */
@Test(timeout = 60000)
public void testReducerHealth_3() throws IOException {
  long startTime = System.currentTimeMillis() - 500000;
  Shuffle shuffle = mock(Shuffle.class);
  final ShuffleSchedulerForTest scheduler = createScheduler(startTime, 320, shuffle);
  int totalProducerNodes = 20;

  // Generate 320 events, spread round-robin over the producer hosts
  for (int i = 0; i < 320; i++) {
    CompositeInputAttemptIdentifier inputAttemptIdentifier =
        new CompositeInputAttemptIdentifier(i, 0, "attempt_", 1);
    scheduler.addKnownMapOutput("host" + (i % totalProducerNodes), 10000, i, inputAttemptIdentifier);
  }

  // 319 succeeds
  for (int i = 0; i < 319; i++) {
    InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(i, 0, "attempt_");
    MapOutput mapOutput = MapOutput.createMemoryMapOutput(inputAttemptIdentifier,
        mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
    scheduler.copySucceeded(inputAttemptIdentifier,
        new MapHost("host" + (i % totalProducerNodes), 10000, i, 1),
        100, 200, startTime + (i * 100), mapOutput, false);
  }

  // 1 fails (last fetch)
  InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(319, 0, "attempt_");
  scheduler.copyFailed(inputAttemptIdentifier,
      new MapHost("host" + (319 % totalProducerNodes), 10000, 319, 1), false, true, false);

  // stall the shuffle
  scheduler.lastProgressTime = System.currentTimeMillis() - 1000000;

  // JUnit convention: expected value first, actual value second.
  assertEquals(1, scheduler.remainingMaps.get());

  // Retry for 3 more times
  // NOTE(review): the third MapHost argument is 319 in the first retry but 310 in the
  // next two - confirm whether the difference is intentional.
  scheduler.copyFailed(inputAttemptIdentifier,
      new MapHost("host" + (319 % totalProducerNodes), 10000, 319, 1), false, true, false);
  scheduler.copyFailed(inputAttemptIdentifier,
      new MapHost("host" + (319 % totalProducerNodes), 10000, 310, 1), false, true, false);
  scheduler.copyFailed(inputAttemptIdentifier,
      new MapHost("host" + (319 % totalProducerNodes), 10000, 310, 1), false, true, false);

  // The shuffle is stalled, but every failure belongs to the same source attempt on the
  // same host, so per the scenario above no exception must have been reported.
  verify(shuffle, times(0)).reportException(any(Throwable.class));
}
use of org.apache.tez.runtime.library.common.CompositeInputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method testShutdownWithInterrupt.
/**
 * Starts the scheduler on a background executor, completes the copy for all but one
 * input, then closes the scheduler from a separate thread and asserts that the fetcher
 * executor has stopped.
 */
@Test(timeout = 30000)
public void testShutdownWithInterrupt() throws Exception {
  InputContext inputContext = createTezInputContext();
  Configuration conf = new TezConfiguration();
  int numInputs = 10;
  Shuffle shuffle = mock(Shuffle.class);
  MergeManager mergeManager = mock(MergeManager.class);
  final ShuffleSchedulerForTest scheduler = new ShuffleSchedulerForTest(
      inputContext, conf, numInputs, shuffle, mergeManager, mergeManager,
      System.currentTimeMillis(), null, false, 0, "srcName");

  // Run the scheduler on its own single-threaded executor.
  ExecutorService executor = Executors.newFixedThreadPool(1);
  Callable<Void> startScheduler = new Callable<Void>() {
    @Override
    public Void call() throws Exception {
      scheduler.start();
      return null;
    }
  };
  Future<Void> executorFuture = executor.submit(startScheduler);

  // Register one known map output per input, each on a distinct host name.
  InputAttemptIdentifier[] identifiers = new InputAttemptIdentifier[numInputs];
  for (int input = 0; input < numInputs; input++) {
    CompositeInputAttemptIdentifier attemptId =
        new CompositeInputAttemptIdentifier(input, 0, "attempt_", 1);
    scheduler.addKnownMapOutput("host" + input, 10000, 1, attemptId);
    identifiers[input] = attemptId;
  }

  // Snapshot the hosts the scheduler currently knows about.
  MapHost[] mapHosts = new MapHost[numInputs];
  int nextSlot = 0;
  for (MapHost knownHost : scheduler.mapLocations.values()) {
    mapHosts[nextSlot] = knownHost;
    nextSlot++;
  }

  // Copy succeeded for 1 less host
  int completedInputs = numInputs - 1;
  for (int input = 0; input < completedInputs; input++) {
    MapOutput memoryOutput = MapOutput.createMemoryMapOutput(
        identifiers[input], mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
    scheduler.copySucceeded(identifiers[input], mapHosts[input], 20, 25, 100, memoryOutput, false);
    scheduler.freeHost(mapHosts[input]);
  }

  try {
    // Close the scheduler on different thread to trigger interrupt
    Runnable closeScheduler = new Runnable() {
      @Override
      public void run() {
        scheduler.close();
      }
    };
    Thread closer = new Thread(closeScheduler);
    closer.start();
    closer.join();
  } finally {
    assertTrue("Fetcher executor should be shutdown, but still running",
        scheduler.hasFetcherExecutorStopped());
    executor.shutdownNow();
  }
}
use of org.apache.tez.runtime.library.common.CompositeInputAttemptIdentifier in project tez by apache.
the class TestShuffleScheduler method _testReducerHealth_6.
/**
 * Shared body for the reducer-health scenario in which 10 fetches succeed and 5 other
 * fetches fail repeatedly. The final expectation depends on the value of
 * {@code TEZ_RUNTIME_SHUFFLE_FAILED_CHECK_SINCE_LAST_COMPLETION} read from {@code conf}.
 *
 * @param conf configuration used to create the scheduler and to pick the expected outcome
 * @throws IOException declared by the scheduler calls used in this scenario
 */
public void _testReducerHealth_6(Configuration conf) throws IOException {
  long startTime = System.currentTimeMillis() - 500000;
  Shuffle shuffle = mock(Shuffle.class);
  final ShuffleSchedulerForTest scheduler = createScheduler(startTime, 320, shuffle, conf);
  int totalProducerNodes = 20;

  // Register 320 known map outputs, spread round-robin over the producer hosts
  for (int i = 0; i < 320; i++) {
    CompositeInputAttemptIdentifier inputAttemptIdentifier =
        new CompositeInputAttemptIdentifier(i, 0, "attempt_", 1);
    scheduler.addKnownMapOutput("host" + (i % totalProducerNodes), 10000, i, inputAttemptIdentifier);
  }

  // 10 succeeds
  for (int i = 0; i < 10; i++) {
    InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(i, 0, "attempt_");
    MapOutput mapOutput = MapOutput.createMemoryMapOutput(inputAttemptIdentifier,
        mock(FetchedInputAllocatorOrderedGrouped.class), 100, false);
    scheduler.copySucceeded(inputAttemptIdentifier,
        new MapHost("host" + (i % totalProducerNodes), 10000, i, 1),
        100, 200, startTime + (i * 100), mapOutput, false);
  }

  // 5 fetches fail once
  for (int i = 10; i < 15; i++) {
    InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(i, 0, "attempt_");
    scheduler.copyFailed(inputAttemptIdentifier,
        new MapHost("host" + (i % totalProducerNodes), 10000, i, 1), false, true, false);
  }
  assertTrue(scheduler.failureCounts.size() >= 5);
  // JUnit convention: expected value first, actual value second.
  assertEquals(310, scheduler.remainingMaps.get());

  // Do not bail out (number of failures is just 5)
  verify(scheduler.reporter, times(0)).reportException(any(Throwable.class));

  // 5 fetches fail repeatedly
  for (int i = 10; i < 15; i++) {
    InputAttemptIdentifier inputAttemptIdentifier = new InputAttemptIdentifier(i, 0, "attempt_");
    scheduler.copyFailed(inputAttemptIdentifier,
        new MapHost("host" + (i % totalProducerNodes), 10000, i, 1), false, true, false);
    scheduler.copyFailed(inputAttemptIdentifier,
        new MapHost("host" + (i % totalProducerNodes), 10000, i, 1), false, true, false);
  }

  boolean checkFailedFetchSinceLastCompletion = conf.getBoolean(
      TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FAILED_CHECK_SINCE_LAST_COMPLETION,
      TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FAILED_CHECK_SINCE_LAST_COMPLETION_DEFAULT);
  if (checkFailedFetchSinceLastCompletion) {
    // Now bail out: Shuffle has crossed the failedShufflesSinceLastCompletion limits,
    // so at least one exception must have been reported.
    verify(shuffle, atLeast(1)).reportException(any(Throwable.class));
  } else {
    // Do not bail out yet.
    // NOTE(review): atLeast(0) is satisfied by any number of calls, so this branch
    // currently asserts nothing. If the intent is "no exception reported", times(0)
    // would express it - confirm against the scheduler's behaviour before changing.
    verify(shuffle, atLeast(0)).reportException(any(Throwable.class));
  }
}
use of org.apache.tez.runtime.library.common.CompositeInputAttemptIdentifier in project tez by apache.
the class TestFetcher method testSetupLocalDiskFetch.
/**
 * Runs a spied {@code Fetcher} built with local fetch optimisation enabled against five
 * source attempts. The stubbed index-record lookup throws for the attempts whose path
 * component ends in 2 or 4, so the test expects three successful fetches, two failed
 * fetches, and exactly those two attempts left in the result's pending inputs.
 */
@Test(timeout = 3000)
public void testSetupLocalDiskFetch() throws Exception {
  // Attempt k has input index k, attempt number k + 1 and a path component ending in k.
  CompositeInputAttemptIdentifier[] srcAttempts = new CompositeInputAttemptIdentifier[5];
  for (int k = 0; k < srcAttempts.length; k++) {
    srcAttempts[k] = new CompositeInputAttemptIdentifier(
        k, k + 1, InputAttemptIdentifier.PATH_PREFIX + "pathComponent_" + k, 1);
  }
  final int FIRST_FAILED_ATTEMPT_IDX = 2;
  final int SECOND_FAILED_ATTEMPT_IDX = 4;
  final int[] successfulAttempts = { 0, 1, 3 };

  TezConfiguration conf = new TezConfiguration();
  conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_OPTIMIZE_LOCAL_FETCH, "true");
  int partition = 42;
  FetcherCallback callback = mock(FetcherCallback.class);
  Fetcher.FetcherBuilder builder = new Fetcher.FetcherBuilder(callback, null, null,
      ApplicationId.newInstance(0, 1), 1, null, "fetcherTest", conf, true, HOST, PORT,
      false, true, true);

  List<InputAttemptIdentifier> list =
      new ArrayList<InputAttemptIdentifier>(Arrays.asList(srcAttempts));
  builder.assignWork(HOST, PORT, partition, 1, list);
  Fetcher fetcher = spy(builder.build());

  // Register every expanded identifier under its (path component, partition) key.
  for (CompositeInputAttemptIdentifier compositeInputAttemptIdentifier : srcAttempts) {
    for (int i = 0; i < compositeInputAttemptIdentifier.getInputIdentifierCount(); i++) {
      Fetcher.PathPartition pathPartition = new Fetcher.PathPartition(
          compositeInputAttemptIdentifier.getPathComponent(), partition + i);
      fetcher.getPathToAttemptMap().put(pathPartition, compositeInputAttemptIdentifier.expand(i));
    }
  }

  // The input file name is derived from the first argument only.
  doAnswer(new Answer<Path>() {
    @Override
    public Path answer(InvocationOnMock invocation) throws Throwable {
      Object[] args = invocation.getArguments();
      return new Path(SHUFFLE_INPUT_FILE_PREFIX + args[0]);
    }
  }).when(fetcher).getShuffleInputFileName(anyString(), anyString());

  doAnswer(new Answer<TezIndexRecord>() {
    @Override
    public TezIndexRecord answer(InvocationOnMock invocation) throws Throwable {
      Object[] args = invocation.getArguments();
      InputAttemptIdentifier srcAttemptId = (InputAttemptIdentifier) args[0];
      String pathComponent = srcAttemptId.getPathComponent();
      int len = pathComponent.length();
      // The last character of the path component is the attempt's index in srcAttempts.
      long p = Long.parseLong(pathComponent.substring(len - 1, len));
      // Fail the 3rd one and 5th one.
      if (p == FIRST_FAILED_ATTEMPT_IDX || p == SECOND_FAILED_ATTEMPT_IDX) {
        throw new IOException("failing on 3/5th input to simulate failure case");
      }
      // match with params for copySucceeded below.
      return new TezIndexRecord(p * 10, p * 1000, p * 100);
    }
  }).when(fetcher).getTezIndexRecord(any(InputAttemptIdentifier.class), anyInt());

  doNothing().when(fetcher).shutdown();
  doNothing().when(callback).fetchSucceeded(anyString(), any(InputAttemptIdentifier.class),
      any(FetchedInput.class), anyLong(), anyLong(), anyLong());
  doNothing().when(callback).fetchFailed(anyString(), any(InputAttemptIdentifier.class), eq(false));

  FetchResult fetchResult = fetcher.call();
  verify(fetcher).setupLocalDiskFetch();

  // expect 3 successes and 2 failures
  for (int i : successfulAttempts) {
    verifyFetchSucceeded(callback, srcAttempts[i], conf);
  }
  verify(callback).fetchFailed(eq(HOST), eq(srcAttempts[FIRST_FAILED_ATTEMPT_IDX]), eq(false));
  verify(callback).fetchFailed(eq(HOST), eq(srcAttempts[SECOND_FAILED_ATTEMPT_IDX]), eq(false));

  // JUnit convention: message, expected value, actual value.
  Assert.assertEquals("fetchResult host", HOST, fetchResult.getHost());
  Assert.assertEquals("fetchResult partition", partition, fetchResult.getPartition());
  Assert.assertEquals("fetchResult port", PORT, fetchResult.getPort());

  // 3rd and 5th attempt failed
  List<InputAttemptIdentifier> pendingInputs = Lists.newArrayList(fetchResult.getPendingInputs());
  Assert.assertEquals("fetchResult pendingInput size", 2, pendingInputs.size());
  Assert.assertEquals("fetchResult failed attempt",
      srcAttempts[FIRST_FAILED_ATTEMPT_IDX], pendingInputs.get(0));
  Assert.assertEquals("fetchResult failed attempt",
      srcAttempts[SECOND_FAILED_ATTEMPT_IDX], pendingInputs.get(1));
}
use of org.apache.tez.runtime.library.common.CompositeInputAttemptIdentifier in project tez by apache.
the class TestShuffleInputEventHandlerOrderedGrouped method testPiplinedShuffleEvents_WithOutofOrderAttempts.
/**
 * Delivers a pipelined data-movement event for attempt #1 of an input, starts scheduling
 * it, and then delivers attempt #0 of the same input. The scheduler is expected to have
 * registered the first attempt and to issue exactly one kill request after the second.
 */
@Test(timeout = 5000)
public void testPiplinedShuffleEvents_WithOutofOrderAttempts() throws IOException, InterruptedException {
  final int inputIndex = 1;

  // Process attempt #1 first
  final int laterAttempt = 1;
  Event laterAttemptEvent =
      createDataMovementEvent(laterAttempt, inputIndex, null, false, true, true, 0, laterAttempt);
  handler.handleEvents(Collections.singletonList(laterAttemptEvent));

  CompositeInputAttemptIdentifier expectedId = new CompositeInputAttemptIdentifier(
      inputIndex, laterAttempt, PATH_COMPONENT, false,
      InputAttemptIdentifier.SPILL_INFO.INCREMENTAL_UPDATE, 0, 1);
  verify(scheduler, times(1)).addKnownMapOutput(eq(HOST), eq(PORT), eq(1), eq(expectedId));
  assertTrue("Shuffle info events should not be empty for pipelined shuffle",
      !scheduler.pipelinedShuffleInfoEventsMap.isEmpty());

  int knownLocationCount = scheduler.mapLocations.values().size();
  assertTrue("Maplocations should have values. current size: " + knownLocationCount,
      knownLocationCount > 0);

  // start scheduling for download
  scheduler.getMapsForHost(scheduler.mapLocations.values().iterator().next());

  // Attempt #0 comes up. When processing this, it should report exception
  final int earlierAttempt = 0;
  Event earlierAttemptEvent =
      createDataMovementEvent(earlierAttempt, inputIndex, null, false, true, true, 0, earlierAttempt);
  handler.handleEvents(Collections.singletonList(earlierAttemptEvent));

  // task should issue kill request
  verify(scheduler, times(1)).killSelf(any(IOException.class), any(String.class));
}
Aggregations