Search in sources :

Example 6 with ControlledClock

use of org.apache.hadoop.yarn.util.ControlledClock in project hadoop by apache.

the class TestAMLivelinessMonitor method testResetTimer.

@Test(timeout = 10000)
public void testResetTimer() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    UserGroupInformation.setConfiguration(conf);
    conf.set(YarnConfiguration.RECOVERY_ENABLED, "true");
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
    conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 6000);
    final ControlledClock clock = new ControlledClock();
    clock.setTime(0);
    MemoryRMStateStore memStore = new MemoryRMStateStore() {

        @Override
        public synchronized RMState loadState() throws Exception {
            clock.setTime(8000);
            return super.loadState();
        }
    };
    memStore.init(conf);
    final ApplicationAttemptId attemptId = mock(ApplicationAttemptId.class);
    final Dispatcher dispatcher = mock(Dispatcher.class);
    final boolean[] expired = new boolean[] { false };
    final AMLivelinessMonitor monitor = new AMLivelinessMonitor(dispatcher, clock) {

        @Override
        protected void expire(ApplicationAttemptId id) {
            Assert.assertEquals(id, attemptId);
            expired[0] = true;
        }
    };
    monitor.register(attemptId);
    MockRM rm = new MockRM(conf, memStore) {

        @Override
        protected AMLivelinessMonitor createAMLivelinessMonitor() {
            return monitor;
        }
    };
    rm.start();
    // make sure that monitor has started
    while (monitor.getServiceState() != Service.STATE.STARTED) {
        Thread.sleep(100);
    }
    // expired[0] would be set to true without resetTimer
    Assert.assertFalse(expired[0]);
    rm.stop();
}
Also used : MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) Dispatcher(org.apache.hadoop.yarn.event.Dispatcher) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Test(org.junit.Test)

Example 7 with ControlledClock

use of org.apache.hadoop.yarn.util.ControlledClock in project hadoop by apache.

the class TestMaxRunningAppsEnforcer method setup.

@Before
public void setup() throws Exception {
    Configuration conf = new Configuration();
    clock = new ControlledClock();
    scheduler = mock(FairScheduler.class);
    when(scheduler.getConf()).thenReturn(new FairSchedulerConfiguration(conf));
    when(scheduler.getClock()).thenReturn(clock);
    AllocationConfiguration allocConf = new AllocationConfiguration(conf);
    when(scheduler.getAllocationConfiguration()).thenReturn(allocConf);
    queueManager = new QueueManager(scheduler);
    queueManager.initialize(conf);
    userMaxApps = allocConf.userMaxApps;
    maxAppsEnforcer = new MaxRunningAppsEnforcer(scheduler);
    appNum = 0;
    rmContext = mock(RMContext.class);
    when(rmContext.getEpoch()).thenReturn(0L);
}
Also used : RMContext(org.apache.hadoop.yarn.server.resourcemanager.RMContext) Configuration(org.apache.hadoop.conf.Configuration) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Before(org.junit.Before)

Example 8 with ControlledClock

use of org.apache.hadoop.yarn.util.ControlledClock in project hadoop by apache.

the class TestAMRestart method testRMAppAttemptFailuresValidityInterval.

@Test(timeout = 120000)
public void testRMAppAttemptFailuresValidityInterval() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    // explicitly set max-am-retry count as 2.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
    nm1.registerNode();
    // set window size to a larger number : 60s
    // we will verify the app should be failed if
    // two continuous attempts failed in 60s.
    RMApp app = rm1.submitApp(200, 60000, false);
    MockAM am = MockRM.launchAM(app, rm1, nm1);
    // Fail current attempt normally
    nm1.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // launch the second attempt
    rm1.waitForState(app.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(2, app.getAppAttempts().size());
    MockAM am_2 = MockRM.launchAndRegisterAM(app, rm1, nm1);
    rm1.waitForState(am_2.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    nm1.nodeHeartbeat(am_2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am_2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // current app should be failed.
    rm1.waitForState(app.getApplicationId(), RMAppState.FAILED);
    ControlledClock clock = new ControlledClock();
    // set window size to 10s
    RMAppImpl app1 = (RMAppImpl) rm1.submitApp(200, 10000, false);
    app1.setSystemClock(clock);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    // Fail attempt1 normally
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    //Wait to make sure attempt1 be removed in State Store
    //TODO explore a better way than sleeping for a while (YARN-4929)
    Thread.sleep(15 * 1000);
    // launch the second attempt
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(2, app1.getAppAttempts().size());
    RMAppAttempt attempt2 = app1.getCurrentAppAttempt();
    MockAM am2 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    // wait for 10 seconds
    clock.setTime(System.currentTimeMillis() + 10 * 1000);
    // Fail attempt2 normally
    nm1.nodeHeartbeat(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // can launch the third attempt successfully
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(3, app1.getAppAttempts().size());
    RMAppAttempt attempt3 = app1.getCurrentAppAttempt();
    clock.reset();
    MockAM am3 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    rm1.waitForState(am3.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    // Restart rm.
    @SuppressWarnings("resource") MockRM rm2 = new MockRM(conf, memStore);
    rm2.start();
    ApplicationStateData app1State = memStore.getState().getApplicationState().get(app1.getApplicationId());
    Assert.assertEquals(1, app1State.getFirstAttemptId());
    // re-register the NM
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
    status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
    status.setContainerId(attempt3.getMasterContainer().getId());
    status.setContainerState(ContainerState.COMPLETE);
    status.setDiagnostics("");
    nm1.registerNode(Collections.singletonList(status), null);
    rm2.waitForState(attempt3.getAppAttemptId(), RMAppAttemptState.FAILED);
    //Wait to make sure attempt3 be removed in State Store
    //TODO explore a better way than sleeping for a while (YARN-4929)
    Thread.sleep(15 * 1000);
    Assert.assertEquals(2, app1State.getAttemptCount());
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    // Lauch Attempt 4
    MockAM am4 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 4, nm1);
    // wait for 10 seconds
    clock.setTime(System.currentTimeMillis() + 10 * 1000);
    // Fail attempt4 normally
    nm1.nodeHeartbeat(am4.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm2.waitForState(am4.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    Assert.assertEquals(2, app1State.getAttemptCount());
    // can launch the 5th attempt successfully
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am5 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 5, nm1);
    clock.reset();
    rm2.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    // Fail attempt5 normally
    nm1.nodeHeartbeat(am5.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm2.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    Assert.assertEquals(2, app1State.getAttemptCount());
    rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
    rm1.stop();
    rm2.stop();
}
Also used : RMAppImpl(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Test(org.junit.Test)

Example 9 with ControlledClock

use of org.apache.hadoop.yarn.util.ControlledClock in project hadoop by apache.

the class TestHistoryFileManager method testCreateDirsWithFileSystemNotBecomingAvailBeforeTimeout.

@Test(expected = YarnRuntimeException.class)
public void testCreateDirsWithFileSystemNotBecomingAvailBeforeTimeout() throws Exception {
    dfsCluster.getFileSystem().setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
    Assert.assertTrue(dfsCluster.getFileSystem().isInSafeMode());
    final ControlledClock clock = new ControlledClock();
    clock.setTime(1);
    new Thread() {

        @Override
        public void run() {
            try {
                Thread.sleep(500);
                clock.setTime(3000);
            } catch (Exception ex) {
                Assert.fail(ex.toString());
            }
        }
    }.start();
    testCreateHistoryDirs(dfsCluster.getConfiguration(0), clock);
}
Also used : ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) FileNotFoundException(java.io.FileNotFoundException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) Test(org.junit.Test)

Example 10 with ControlledClock

use of org.apache.hadoop.yarn.util.ControlledClock in project hadoop by apache.

the class TestRuntimeEstimators method coreTestEstimator.

private void coreTestEstimator(TaskRuntimeEstimator testedEstimator, int expectedSpeculations) {
    estimator = testedEstimator;
    clock = new ControlledClock();
    dispatcher = new AsyncDispatcher();
    myJob = null;
    slotsInUse.set(0);
    completedMaps.set(0);
    completedReduces.set(0);
    successfulSpeculations.set(0);
    taskTimeSavedBySpeculation.set(0);
    clock.tickMsec(1000);
    Configuration conf = new Configuration();
    myAppContext = new MyAppContext(MAP_TASKS, REDUCE_TASKS);
    myJob = myAppContext.getAllJobs().values().iterator().next();
    estimator.contextualize(conf, myAppContext);
    conf.setLong(MRJobConfig.SPECULATIVE_RETRY_AFTER_NO_SPECULATE, 500L);
    conf.setLong(MRJobConfig.SPECULATIVE_RETRY_AFTER_SPECULATE, 5000L);
    conf.setDouble(MRJobConfig.SPECULATIVECAP_RUNNING_TASKS, 0.1);
    conf.setDouble(MRJobConfig.SPECULATIVECAP_TOTAL_TASKS, 0.001);
    conf.setInt(MRJobConfig.SPECULATIVE_MINIMUM_ALLOWED_TASKS, 5);
    speculator = new DefaultSpeculator(conf, myAppContext, estimator, clock);
    Assert.assertEquals("wrong SPECULATIVE_RETRY_AFTER_NO_SPECULATE value", 500L, speculator.getSoonestRetryAfterNoSpeculate());
    Assert.assertEquals("wrong SPECULATIVE_RETRY_AFTER_SPECULATE value", 5000L, speculator.getSoonestRetryAfterSpeculate());
    Assert.assertEquals(speculator.getProportionRunningTasksSpeculatable(), 0.1, 0.00001);
    Assert.assertEquals(speculator.getProportionTotalTasksSpeculatable(), 0.001, 0.00001);
    Assert.assertEquals("wrong SPECULATIVE_MINIMUM_ALLOWED_TASKS value", 5, speculator.getMinimumAllowedSpeculativeTasks());
    dispatcher.register(Speculator.EventType.class, speculator);
    dispatcher.register(TaskEventType.class, new SpeculationRequestEventHandler());
    dispatcher.init(conf);
    dispatcher.start();
    speculator.init(conf);
    speculator.start();
    // Now that the plumbing is hooked up, we do the following:
    //  do until all tasks are finished, ...
    //  1: If we have spare capacity, assign as many map tasks as we can, then
    //     assign as many reduce tasks as we can.  Note that an odd reduce
    //     task might be started while there are still map tasks, because
    //     map tasks take 3 slots and reduce tasks 2 slots.
    //  2: Send a speculation event for every task attempt that's running
    //  note that new attempts might get started by the speculator
    // discover undone tasks
    int undoneMaps = MAP_TASKS;
    int undoneReduces = REDUCE_TASKS;
    // build a task sequence where all the maps precede any of the reduces
    List<Task> allTasksSequence = new LinkedList<Task>();
    allTasksSequence.addAll(myJob.getTasks(TaskType.MAP).values());
    allTasksSequence.addAll(myJob.getTasks(TaskType.REDUCE).values());
    while (undoneMaps + undoneReduces > 0) {
        undoneMaps = 0;
        undoneReduces = 0;
        // start all attempts which are new but for which there is enough slots
        for (Task task : allTasksSequence) {
            if (!task.isFinished()) {
                if (task.getType() == TaskType.MAP) {
                    ++undoneMaps;
                } else {
                    ++undoneReduces;
                }
            }
            for (TaskAttempt attempt : task.getAttempts().values()) {
                if (attempt.getState() == TaskAttemptState.NEW && INITIAL_NUMBER_FREE_SLOTS - slotsInUse.get() >= taskTypeSlots(task.getType())) {
                    MyTaskAttemptImpl attemptImpl = (MyTaskAttemptImpl) attempt;
                    SpeculatorEvent event = new SpeculatorEvent(attempt.getID(), false, clock.getTime());
                    speculator.handle(event);
                    attemptImpl.startUp();
                } else {
                    // If a task attempt is in progress we should send the news to
                    // the Speculator.
                    TaskAttemptStatus status = new TaskAttemptStatus();
                    status.id = attempt.getID();
                    status.progress = attempt.getProgress();
                    status.stateString = attempt.getState().name();
                    status.taskState = attempt.getState();
                    SpeculatorEvent event = new SpeculatorEvent(status, clock.getTime());
                    speculator.handle(event);
                }
            }
        }
        long startTime = System.currentTimeMillis();
        // drain the speculator event queue
        while (!speculator.eventQueueEmpty()) {
            Thread.yield();
            if (System.currentTimeMillis() > startTime + 130000) {
                return;
            }
        }
        clock.tickMsec(1000L);
        if (clock.getTime() % 10000L == 0L) {
            speculator.scanForSpeculations();
        }
    }
    Assert.assertEquals("We got the wrong number of successful speculations.", expectedSpeculations, successfulSpeculations.get());
}
Also used : Task(org.apache.hadoop.mapreduce.v2.app.job.Task) Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptStatus(org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptStatusUpdateEvent.TaskAttemptStatus) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Speculator(org.apache.hadoop.mapreduce.v2.app.speculate.Speculator) DefaultSpeculator(org.apache.hadoop.mapreduce.v2.app.speculate.DefaultSpeculator) LinkedList(java.util.LinkedList) AsyncDispatcher(org.apache.hadoop.yarn.event.AsyncDispatcher) DefaultSpeculator(org.apache.hadoop.mapreduce.v2.app.speculate.DefaultSpeculator) SpeculatorEvent(org.apache.hadoop.mapreduce.v2.app.speculate.SpeculatorEvent) TaskAttempt(org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt)

Aggregations

ControlledClock (org.apache.hadoop.yarn.util.ControlledClock)22 Test (org.junit.Test)18 Configuration (org.apache.hadoop.conf.Configuration)11 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)11 Job (org.apache.hadoop.mapreduce.v2.app.job.Job)7 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)7 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)6 TaskAttemptId (org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId)5 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)5 JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId)4 Task (org.apache.hadoop.mapreduce.v2.app.job.Task)4 TaskAttempt (org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt)4 TaskAttemptEvent (org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent)4 DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher)4 FileWriter (java.io.FileWriter)3 PrintWriter (java.io.PrintWriter)3 TaskId (org.apache.hadoop.mapreduce.v2.api.records.TaskId)3 MRApp (org.apache.hadoop.mapreduce.v2.app.MRApp)3 TaskAttemptStatus (org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptStatusUpdateEvent.TaskAttemptStatus)3 Container (org.apache.hadoop.yarn.api.records.Container)3