Example 21 with ScheduledExecutorService

Use of java.util.concurrent.ScheduledExecutorService in project flink by apache.

The class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric.

/**
 * Tests that the restarting time metric correctly reports restarting times.
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        assertTrue(metricRegistry.getReporters().size() == 1);
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        assertTrue(0L == restartingTime.getValue());
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now let's fail the job while it is restarting and see whether the restarting time then stops increasing
        // for this to work, we have to use a SuppressRestartsException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        executor.shutdownNow();
    }
}
Also used : JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) MetricGroup(org.apache.flink.metrics.MetricGroup) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Gauge(org.apache.flink.metrics.Gauge) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ScheduledUnit(org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit) MetricReporter(org.apache.flink.metrics.reporter.MetricReporter) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) JobException(org.apache.flink.runtime.JobException) IOException(java.io.IOException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) Slot(org.apache.flink.runtime.instance.Slot) Metric(org.apache.flink.metrics.Metric) Test(org.junit.Test)
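
For reference, the gauge this test reads reduces to a stopwatch that is frozen outside the RESTARTING state. A minimal sketch of such a restarting-time gauge, assuming Flink's Gauge interface; field and method names here are hypothetical, not Flink's actual implementation:

import org.apache.flink.metrics.Gauge;

// Sketch of a restarting-time gauge (hypothetical names, not Flink's code):
// reports milliseconds spent in RESTARTING, live while restarting and
// frozen once the job is running again.
class RestartTimeGauge implements Gauge<Long> {
    private volatile long restartingTimestamp; // when the job entered RESTARTING, 0 if never
    private volatile long runningTimestamp;    // when it left RESTARTING again, 0 while restarting

    void notifyRestarting(long nowMillis) {
        restartingTimestamp = nowMillis;
        runningTimestamp = 0L;
    }

    void notifyRunning(long nowMillis) {
        runningTimestamp = nowMillis;
    }

    @Override
    public Long getValue() {
        if (restartingTimestamp == 0L) {
            return 0L; // never restarted, as asserted at the start of the test
        } else if (runningTimestamp == 0L) {
            return System.currentTimeMillis() - restartingTimestamp; // still restarting
        } else {
            return runningTimestamp - restartingTimestamp; // frozen duration
        }
    }
}

This matches the behavior the test asserts: 0 initially, monotonically increasing while RESTARTING, and constant once the job has left that state.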

Example 22 with ScheduledExecutorService

Use of java.util.concurrent.ScheduledExecutorService in project flink by apache.

The class StackTraceSampleCoordinatorTest, method testTriggerStackTraceSampleTimeout.

/** Tests that samples time out if they don't finish in time. */
@Test(timeout = 1000L)
public void testTriggerStackTraceSampleTimeout() throws Exception {
    int timeout = 100;
    coord = new StackTraceSampleCoordinator(system.dispatcher(), timeout);
    final ScheduledExecutorService scheduledExecutorService = new ScheduledThreadPoolExecutor(1);
    try {
        ExecutionVertex[] vertices = new ExecutionVertex[] { mockExecutionVertexWithTimeout(new ExecutionAttemptID(), ExecutionState.RUNNING, scheduledExecutorService, timeout) };
        Future<StackTraceSample> sampleFuture = coord.triggerStackTraceSample(vertices, 1, Time.milliseconds(100L), 0);
        // Wait for the timeout
        Thread.sleep(timeout * 2);
        boolean success = false;
        for (int i = 0; i < 10; i++) {
            if (sampleFuture.isDone()) {
                success = true;
                break;
            }
            Thread.sleep(timeout);
        }
        assertTrue("Sample did not time out", success);
        try {
            sampleFuture.get();
            fail("Expected exception.");
        } catch (ExecutionException e) {
            assertTrue(e.getCause().getCause().getMessage().contains("Timeout"));
        }
        // Collect after the timeout (should be ignored)
        ExecutionAttemptID executionId = vertices[0].getCurrentExecutionAttempt().getAttemptId();
        coord.collectStackTraces(0, executionId, new ArrayList<StackTraceElement[]>());
    } finally {
        scheduledExecutorService.shutdownNow();
    }
}
Also used : ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) TriggerStackTraceSample(org.apache.flink.runtime.messages.StackTraceSampleMessages.TriggerStackTraceSample) ExecutionException(java.util.concurrent.ExecutionException) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Test(org.junit.Test)
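
The timeout path that mockExecutionVertexWithTimeout simulates boils down to scheduling a task that fails the pending future once the deadline passes. A self-contained sketch of that pattern using only java.util.concurrent types (all names are illustrative, not the coordinator's actual code):

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class FutureTimeoutSketch {
    public static void main(String[] args) throws Exception {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        CompletableFuture<String> sample = new CompletableFuture<>();

        // Fail the future if nobody completes it within 100 ms.
        scheduler.schedule(
                () -> sample.completeExceptionally(new TimeoutException("Timeout")),
                100, TimeUnit.MILLISECONDS);

        try {
            sample.get(); // blocks until completed or failed
        } catch (ExecutionException e) {
            // prints: java.util.concurrent.TimeoutException: Timeout
            System.out.println(e.getCause());
        } finally {
            scheduler.shutdownNow();
        }
    }
}

As in the test, the caller sees an ExecutionException whose cause chain carries the timeout message.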

Example 23 with ScheduledExecutorService

Use of java.util.concurrent.ScheduledExecutorService in project hadoop by apache.

The class YarnChild, method main.

public static void main(String[] args) throws Throwable {
    Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
    LOG.debug("Child starting");
    final JobConf job = new JobConf(MRJobConfig.JOB_CONF_FILE);
    // Initializing with our JobConf allows us to avoid loading confs twice
    Limits.init(job);
    UserGroupInformation.setConfiguration(job);
    // MAPREDUCE-6565: need to set configuration for SecurityUtil.
    SecurityUtil.setConfiguration(job);
    String host = args[0];
    int port = Integer.parseInt(args[1]);
    final InetSocketAddress address = NetUtils.createSocketAddrForHost(host, port);
    final TaskAttemptID firstTaskid = TaskAttemptID.forName(args[2]);
    long jvmIdLong = Long.parseLong(args[3]);
    JVMId jvmId = new JVMId(firstTaskid.getJobID(), firstTaskid.getTaskType() == TaskType.MAP, jvmIdLong);
    CallerContext.setCurrent(new CallerContext.Builder("mr_" + firstTaskid.toString()).build());
    // initialize metrics
    DefaultMetricsSystem.initialize(StringUtils.camelize(firstTaskid.getTaskType().name()) + "Task");
    // Security framework already loaded the tokens into current ugi
    Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
    LOG.info("Executing with tokens:");
    for (Token<?> token : credentials.getAllTokens()) {
        LOG.info(token);
    }
    // Create TaskUmbilicalProtocol as actual task owner.
    UserGroupInformation taskOwner = UserGroupInformation.createRemoteUser(firstTaskid.getJobID().toString());
    Token<JobTokenIdentifier> jt = TokenCache.getJobToken(credentials);
    SecurityUtil.setTokenService(jt, address);
    taskOwner.addToken(jt);
    final TaskUmbilicalProtocol umbilical = taskOwner.doAs(new PrivilegedExceptionAction<TaskUmbilicalProtocol>() {

        @Override
        public TaskUmbilicalProtocol run() throws Exception {
            return (TaskUmbilicalProtocol) RPC.getProxy(TaskUmbilicalProtocol.class, TaskUmbilicalProtocol.versionID, address, job);
        }
    });
    // report non-pid to application master
    JvmContext context = new JvmContext(jvmId, "-1000");
    LOG.debug("PID: " + System.getenv().get("JVM_PID"));
    Task task = null;
    UserGroupInformation childUGI = null;
    ScheduledExecutorService logSyncer = null;
    try {
        int idleLoopCount = 0;
        JvmTask myTask = null;
        // poll for new task
        for (int idle = 0; null == myTask; ++idle) {
            long sleepTimeMilliSecs = Math.min(idle * 500, 1500);
            LOG.info("Sleeping for " + sleepTimeMilliSecs + "ms before retrying again. Got null now.");
            MILLISECONDS.sleep(sleepTimeMilliSecs);
            myTask = umbilical.getTask(context);
        }
        if (myTask.shouldDie()) {
            return;
        }
        task = myTask.getTask();
        YarnChild.taskid = task.getTaskID();
        // Create the job-conf and set credentials
        configureTask(job, task, credentials, jt);
        // log the system properties
        String systemPropsToLog = MRApps.getSystemPropertiesToLog(job);
        if (systemPropsToLog != null) {
            LOG.info(systemPropsToLog);
        }
        // Initiate Java VM metrics
        JvmMetrics.initSingleton(jvmId.toString(), job.getSessionId());
        childUGI = UserGroupInformation.createRemoteUser(System.getenv(ApplicationConstants.Environment.USER.toString()));
        // Add tokens to new user so that it may execute its task correctly.
        childUGI.addCredentials(credentials);
        // set job classloader if configured before invoking the task
        MRApps.setJobClassLoader(job);
        logSyncer = TaskLog.createLogSyncer();
        // Create a final reference to the task for the doAs block
        final Task taskFinal = task;
        childUGI.doAs(new PrivilegedExceptionAction<Object>() {

            @Override
            public Object run() throws Exception {
                // use job-specified working directory
                setEncryptedSpillKeyIfRequired(taskFinal);
                FileSystem.get(job).setWorkingDirectory(job.getWorkingDirectory());
                // run the task
                taskFinal.run(job, umbilical);
                return null;
            }
        });
    } catch (FSError e) {
        LOG.fatal("FSError from child", e);
        if (!ShutdownHookManager.get().isShutdownInProgress()) {
            umbilical.fsError(taskid, e.getMessage());
        }
    } catch (Exception exception) {
        LOG.warn("Exception running child : " + StringUtils.stringifyException(exception));
        try {
            if (task != null) {
                // do cleanup for the task
                if (childUGI == null) {
                    // no need to go into doAs block
                    task.taskCleanup(umbilical);
                } else {
                    final Task taskFinal = task;
                    childUGI.doAs(new PrivilegedExceptionAction<Object>() {

                        @Override
                        public Object run() throws Exception {
                            taskFinal.taskCleanup(umbilical);
                            return null;
                        }
                    });
                }
            }
        } catch (Exception e) {
            LOG.info("Exception cleaning up: " + StringUtils.stringifyException(e));
        }
        // Report back any failures, for diagnostic purposes
        if (taskid != null) {
            if (!ShutdownHookManager.get().isShutdownInProgress()) {
                umbilical.fatalError(taskid, StringUtils.stringifyException(exception));
            }
        }
    } catch (Throwable throwable) {
        LOG.fatal("Error running child : " + StringUtils.stringifyException(throwable));
        if (taskid != null) {
            if (!ShutdownHookManager.get().isShutdownInProgress()) {
                Throwable tCause = throwable.getCause();
                String cause = tCause == null ? throwable.getMessage() : StringUtils.stringifyException(tCause);
                umbilical.fatalError(taskid, cause);
            }
        }
    } finally {
        RPC.stopProxy(umbilical);
        DefaultMetricsSystem.shutdown();
        TaskLog.syncLogsShutdown(logSyncer);
    }
}
Also used : YarnUncaughtExceptionHandler(org.apache.hadoop.yarn.YarnUncaughtExceptionHandler) InetSocketAddress(java.net.InetSocketAddress) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) FSError(org.apache.hadoop.fs.FSError) JobTokenIdentifier(org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) DiskErrorException(org.apache.hadoop.util.DiskChecker.DiskErrorException) Credentials(org.apache.hadoop.security.Credentials)
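
The task-polling loop in the try block uses a capped linear backoff: the sleep grows by 500 ms per idle iteration up to a 1.5 s ceiling. The same idea in isolation, with a Supplier standing in for umbilical.getTask(context) (a sketch, not Hadoop code):

import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

// Capped linear backoff: sleep 0, 500, 1000, then 1500 ms between attempts
// until the supplier returns a non-null result.
final class BackoffPoller {
    static <T> T poll(Supplier<T> fetch) throws InterruptedException {
        T result = null;
        for (int idle = 0; result == null; ++idle) {
            TimeUnit.MILLISECONDS.sleep(Math.min(idle * 500L, 1500L));
            result = fetch.get();
        }
        return result;
    }
}

The cap keeps a slow-to-arrive task from pushing the retry interval arbitrarily high, while the first attempt still happens immediately.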

Example 24 with ScheduledExecutorService

Use of java.util.concurrent.ScheduledExecutorService in project hadoop by apache.

The class TestDFSInotifyEventInputStream, method testReadEventsWithTimeout.

@Test(timeout = 120000)
public void testReadEventsWithTimeout() throws IOException, InterruptedException, MissingEventsException {
    Configuration conf = new HdfsConfiguration();
    MiniQJMHACluster cluster = new MiniQJMHACluster.Builder(conf).build();
    try {
        cluster.getDfsCluster().waitActive();
        cluster.getDfsCluster().transitionToActive(0);
        final DFSClient client = new DFSClient(cluster.getDfsCluster().getNameNode(0).getNameNodeAddress(), conf);
        DFSInotifyEventInputStream eis = client.getInotifyEventStream();
        ScheduledExecutorService ex = Executors.newSingleThreadScheduledExecutor();
        ex.schedule(new Runnable() {

            @Override
            public void run() {
                try {
                    client.mkdirs("/dir", null, false);
                } catch (IOException e) {
                    // test will fail
                    LOG.error("Unable to create /dir", e);
                }
            }
        }, 1, TimeUnit.SECONDS);
        // a very generous wait period -- the edit will definitely have been
        // processed by the time this is up
        EventBatch batch = eis.poll(5, TimeUnit.SECONDS);
        Assert.assertNotNull(batch);
        Assert.assertEquals(1, batch.getEvents().length);
        Assert.assertTrue(batch.getEvents()[0].getEventType() == Event.EventType.CREATE);
        Assert.assertEquals("/dir", ((Event.CreateEvent) batch.getEvents()[0]).getPath());
    } finally {
        cluster.shutdown();
    }
}
Also used : ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Configuration(org.apache.hadoop.conf.Configuration) Event(org.apache.hadoop.hdfs.inotify.Event) MiniQJMHACluster(org.apache.hadoop.hdfs.qjournal.MiniQJMHACluster) IOException(java.io.IOException) EventBatch(org.apache.hadoop.hdfs.inotify.EventBatch) Test(org.junit.Test)
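
The shape of this test, scheduling a mutation for later and then blocking on a poll with a generous timeout, is a common way to exercise event streams. A minimal sketch with a BlockingQueue standing in for the inotify stream (illustrative names only):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class DelayedEventSketch {
    public static void main(String[] args) throws Exception {
        ScheduledExecutorService ex = Executors.newSingleThreadScheduledExecutor();
        BlockingQueue<String> events = new LinkedBlockingQueue<>();

        // Produce the event after 1 second, mirroring the scheduled mkdirs call.
        ex.schedule(() -> events.add("CREATE /dir"), 1, TimeUnit.SECONDS);

        // Generous 5-second wait, like eis.poll(5, TimeUnit.SECONDS) above.
        String event = events.poll(5, TimeUnit.SECONDS);
        System.out.println(event); // CREATE /dir
        ex.shutdownNow();
    }
}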

Example 25 with ScheduledExecutorService

Use of java.util.concurrent.ScheduledExecutorService in project hadoop by apache.

The class TestDirectoryScanner, method testThrottling.

/**
   * Test that the timeslice throttle limits the report compiler thread's
   * execution time correctly.  We test by scanning a large block pool and
   * comparing the time spent waiting to the time spent running.  With a
   * limit of 100 ms of run time per second, the compiler runs roughly
   * 100 ms and waits roughly 900 ms out of every second, so the
   * waiting/running ratio should be about 9.
   *
   * The block pool has to be large, or the ratio will be off.  The throttle
   * allows the report compiler thread to finish its current cycle when
   * blocking it, so the ratio will always be a little lower than expected.
   * The smaller the block pool, the further off the ratio will be.
   *
   * @throws Exception thrown on unexpected failure
   */
@Test(timeout = 600000)
public void testThrottling() throws Exception {
    Configuration conf = new Configuration(CONF);
    // We need lots of blocks so the report compiler threads have enough to
    // keep them busy while we watch them.
    int blocks = 20000;
    int maxRetries = 3;
    cluster = new MiniDFSCluster.Builder(conf).build();
    try {
        cluster.waitActive();
        bpid = cluster.getNamesystem().getBlockPoolId();
        fds = DataNodeTestUtils.getFSDataset(cluster.getDataNodes().get(0));
        client = cluster.getFileSystem().getClient();
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THROTTLE_LIMIT_MS_PER_SEC_KEY, 100);
        DataNode dataNode = cluster.getDataNodes().get(0);
        final int maxBlocksPerFile = (int) DFSConfigKeys.DFS_NAMENODE_MAX_BLOCKS_PER_FILE_DEFAULT;
        int numBlocksToCreate = blocks;
        while (numBlocksToCreate > 0) {
            final int toCreate = Math.min(maxBlocksPerFile, numBlocksToCreate);
            createFile(GenericTestUtils.getMethodName() + numBlocksToCreate, BLOCK_LENGTH * toCreate, false);
            numBlocksToCreate -= toCreate;
        }
        float ratio = 0.0f;
        int retries = maxRetries;
        while ((retries > 0) && ((ratio < 7f) || (ratio > 10f))) {
            scanner = new DirectoryScanner(dataNode, fds, conf);
            ratio = runThrottleTest(blocks);
            retries -= 1;
        }
        // Waiting should be about 9x running.
        LOG.info("RATIO: " + ratio);
        assertTrue("Throttle is too restrictive", ratio <= 10f);
        assertTrue("Throttle is too permissive", ratio >= 7f);
        // Test with a different limit
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THROTTLE_LIMIT_MS_PER_SEC_KEY, 200);
        ratio = 0.0f;
        retries = maxRetries;
        while ((retries > 0) && ((ratio < 2.75f) || (ratio > 4.5f))) {
            scanner = new DirectoryScanner(dataNode, fds, conf);
            ratio = runThrottleTest(blocks);
            retries -= 1;
        }
        // Waiting should be about 4x running.
        LOG.info("RATIO: " + ratio);
        assertTrue("Throttle is too restrictive", ratio <= 4.5f);
        assertTrue("Throttle is too permissive", ratio >= 2.75f);
        // Test with more than 1 thread
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THREADS_KEY, 3);
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THROTTLE_LIMIT_MS_PER_SEC_KEY, 100);
        ratio = 0.0f;
        retries = maxRetries;
        while ((retries > 0) && ((ratio < 7f) || (ratio > 10f))) {
            scanner = new DirectoryScanner(dataNode, fds, conf);
            ratio = runThrottleTest(blocks);
            retries -= 1;
        }
        // Waiting should be about 9x running.
        LOG.info("RATIO: " + ratio);
        assertTrue("Throttle is too restrictive", ratio <= 10f);
        assertTrue("Throttle is too permissive", ratio >= 7f);
        // Test with no limit
        scanner = new DirectoryScanner(dataNode, fds, CONF);
        scanner.setRetainDiffs(true);
        scan(blocks, 0, 0, 0, 0, 0);
        scanner.shutdown();
        assertFalse(scanner.getRunStatus());
        assertTrue("Throttle appears to be engaged", scanner.timeWaitingMs.get() < 10L);
        assertTrue("Report complier threads logged no execution time", scanner.timeRunningMs.get() > 0L);
        // Test with a 1ms limit.  This also tests whether the scanner can be
        // shutdown cleanly in mid stride.
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THROTTLE_LIMIT_MS_PER_SEC_KEY, 1);
        ratio = 0.0f;
        retries = maxRetries;
        ScheduledExecutorService interruptor = Executors.newScheduledThreadPool(maxRetries);
        try {
            while ((retries > 0) && (ratio < 10)) {
                scanner = new DirectoryScanner(dataNode, fds, conf);
                scanner.setRetainDiffs(true);
                final AtomicLong nowMs = new AtomicLong();
                // Stop the scanner after 2 seconds because otherwise it will take an
                // eternity to complete its run
                interruptor.schedule(new Runnable() {

                    @Override
                    public void run() {
                        nowMs.set(Time.monotonicNow());
                        scanner.shutdown();
                    }
                }, 2L, TimeUnit.SECONDS);
                scanner.reconcile();
                assertFalse(scanner.getRunStatus());
                long finalMs = nowMs.get();
                // if the scanner was stopped mid-run, check that the shutdown was timely
                if (finalMs > 0) {
                    LOG.info("Scanner took " + (Time.monotonicNow() - finalMs) + "ms to shutdown");
                    assertTrue("Scanner took too long to shutdown", Time.monotonicNow() - finalMs < 1000L);
                }
                ratio = (float) scanner.timeWaitingMs.get() / scanner.timeRunningMs.get();
                retries -= 1;
            }
        } finally {
            interruptor.shutdown();
        }
        // We just want to test that it waits a lot, but it also runs some
        LOG.info("RATIO: " + ratio);
        assertTrue("Throttle is too permissive", ratio > 10);
        assertTrue("Report complier threads logged no execution time", scanner.timeRunningMs.get() > 0L);
        // Test with a 0 limit, i.e. disabled
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THROTTLE_LIMIT_MS_PER_SEC_KEY, 0);
        scanner = new DirectoryScanner(dataNode, fds, conf);
        scanner.setRetainDiffs(true);
        scan(blocks, 0, 0, 0, 0, 0);
        scanner.shutdown();
        assertFalse(scanner.getRunStatus());
        assertTrue("Throttle appears to be engaged", scanner.timeWaitingMs.get() < 10L);
        assertTrue("Report complier threads logged no execution time", scanner.timeRunningMs.get() > 0L);
        // Test with a 1000 limit, i.e. disabled
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THROTTLE_LIMIT_MS_PER_SEC_KEY, 1000);
        scanner = new DirectoryScanner(dataNode, fds, conf);
        scanner.setRetainDiffs(true);
        scan(blocks, 0, 0, 0, 0, 0);
        scanner.shutdown();
        assertFalse(scanner.getRunStatus());
        assertTrue("Throttle appears to be engaged", scanner.timeWaitingMs.get() < 10L);
        assertTrue("Report complier threads logged no execution time", scanner.timeRunningMs.get() > 0L);
        // Test that throttle works from regular start
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THREADS_KEY, 1);
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_THROTTLE_LIMIT_MS_PER_SEC_KEY, 10);
        conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_INTERVAL_KEY, 1);
        scanner = new DirectoryScanner(dataNode, fds, conf);
        scanner.setRetainDiffs(true);
        scanner.start();
        int count = 50;
        while ((count > 0) && (scanner.timeWaitingMs.get() < 500L)) {
            Thread.sleep(100L);
            count -= 1;
        }
        scanner.shutdown();
        assertFalse(scanner.getRunStatus());
        assertTrue("Throttle does not appear to be engaged", count > 0);
    } finally {
        cluster.shutdown();
    }
}
Also used : ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AtomicLong(java.util.concurrent.atomic.AtomicLong) Configuration(org.apache.hadoop.conf.Configuration) HdfsConfiguration(org.apache.hadoop.hdfs.HdfsConfiguration) Test(org.junit.Test)
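
The 'interruptor' used for the 1 ms limit is itself a reusable pattern: schedule the stop before starting a potentially unbounded run, and let the worker observe it cooperatively. A stripped-down sketch, with an AtomicBoolean standing in for the scanner's internal stop flag (hypothetical names):

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

public class BoundedRunSketch {
    public static void main(String[] args) throws Exception {
        ScheduledExecutorService interruptor = Executors.newSingleThreadScheduledExecutor();
        AtomicBoolean stopped = new AtomicBoolean(false);

        // Arrange the stop before starting the work, as the test does with
        // scanner.shutdown() scheduled two seconds ahead of scanner.reconcile().
        interruptor.schedule(() -> stopped.set(true), 2, TimeUnit.SECONDS);

        long iterations = 0;
        while (!stopped.get()) { // stand-in for the scanner's run loop
            Thread.sleep(10);    // simulated unit of work
            iterations++;
        }
        System.out.println("stopped after " + iterations + " iterations");
        interruptor.shutdown();
    }
}

Scheduling the stop up front bounds the run even if the worker would otherwise take, as the comment in the test puts it, an eternity.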

Aggregations

ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService): 821 usages
Test (org.junit.Test): 267 usages
CountDownLatch (java.util.concurrent.CountDownLatch): 79 usages
ArrayList (java.util.ArrayList): 72 usages
Test (org.testng.annotations.Test): 72 usages
IOException (java.io.IOException): 71 usages
ExecutorService (java.util.concurrent.ExecutorService): 70 usages
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 65 usages
HashMap (java.util.HashMap): 57 usages
ScheduledThreadPoolExecutor (java.util.concurrent.ScheduledThreadPoolExecutor): 53 usages
List (java.util.List): 51 usages
Map (java.util.Map): 51 usages
TimeUnit (java.util.concurrent.TimeUnit): 44 usages
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 43 usages
ThreadFactory (java.util.concurrent.ThreadFactory): 40 usages
CompletableFuture (java.util.concurrent.CompletableFuture): 35 usages
UUID (java.util.UUID): 34 usages
Cleanup (lombok.Cleanup): 31 usages
ExecutionException (java.util.concurrent.ExecutionException): 30 usages
HashSet (java.util.HashSet): 25 usages