Example 1 with JobStats

Use of org.apache.hadoop.mapred.gridmix.Statistics.JobStats in project hadoop by apache.

The class StressJobFactory, method checkLoadAndGetSlotsToBackfill:

/**
   * Uses a lightweight mechanism to estimate the current cluster load and
   * decide how many map and reduce slots can still be backfilled.
   *
   * @throws java.io.IOException if cluster statistics cannot be read
   * @throws InterruptedException if the load check is interrupted
   */
protected void checkLoadAndGetSlotsToBackfill() throws IOException, InterruptedException {
    if (loadStatus.getJobLoad() <= 0) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(System.currentTimeMillis() + " [JobLoad] Overloaded is " + Boolean.TRUE.toString() + " NumJobsBackfill is " + loadStatus.getJobLoad());
        }
        // stop calculation because we know it is overloaded.
        return;
    }
    int mapCapacity = loadStatus.getMapCapacity();
    int reduceCapacity = loadStatus.getReduceCapacity();
    // return if the cluster status is not set
    if (mapCapacity < 0 || reduceCapacity < 0) {
        // a missing cluster status would block job submission
        return;
    }
    // Determine the max permissible map & reduce task load
    int maxMapLoad = (int) (overloadMapTaskMapSlotRatio * mapCapacity);
    int maxReduceLoad = (int) (overloadReduceTaskReduceSlotRatio * reduceCapacity);
    // compute the total number of map & reduce tasks submitted
    int totalMapTasks = ClusterStats.getSubmittedMapTasks();
    int totalReduceTasks = ClusterStats.getSubmittedReduceTasks();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total submitted map tasks: " + totalMapTasks);
        LOG.debug("Total submitted reduce tasks: " + totalReduceTasks);
        LOG.debug("Max map load: " + maxMapLoad);
        LOG.debug("Max reduce load: " + maxReduceLoad);
    }
    // generate a pessimistic bound on the max running+pending map tasks
    // this check is to avoid the heavy-duty actual map load calculation
    int mapSlotsBackFill = maxMapLoad - totalMapTasks;
    // generate a pessimistic bound on the max running+pending reduce tasks
    // this check is to avoid the heavy-duty actual reduce load calculation
    int reduceSlotsBackFill = maxReduceLoad - totalReduceTasks;
    // maintain a list of seen job ids
    Set<JobID> seenJobIDs = new HashSet<JobID>();
    // permissible limit
    if (totalMapTasks > maxMapLoad || totalReduceTasks > maxReduceLoad) {
        // if yes, calculate the real load
        // include pending & running map tasks.
        float incompleteMapTasks = 0;
        // include pending & running reduce tasks
        float incompleteReduceTasks = 0;
        for (JobStats job : ClusterStats.getRunningJobStats()) {
            JobID id = job.getJob().getJobID();
            seenJobIDs.add(id);
            // the blacklist takes care of jobs already seen to be complete
            if (blacklistedJobs.contains(id)) {
                LOG.warn("Ignoring blacklisted job: " + id);
                continue;
            }
            int noOfMaps = job.getNoOfMaps();
            int noOfReduces = job.getNoOfReds();
            // only jobs with at least one task contribute to the load;
            // empty jobs are blacklisted below
            if (noOfMaps > 0 || noOfReduces > 0) {
                // get the job's status
                JobStatus status = job.getJobStatus();
                // blacklist completed jobs and continue
                if (status != null && status.isJobComplete()) {
                    LOG.warn("Blacklisting completed job: " + id);
                    blacklistedJobs.add(id);
                    continue;
                }
                // get the map and reduce tasks' progress
                float mapProgress = 0f;
                float reduceProgress = 0f;
                // check if the status is missing (this can happen for unpolled jobs)
                if (status != null) {
                    mapProgress = status.getMapProgress();
                    reduceProgress = status.getReduceProgress();
                }
                incompleteMapTasks += calcEffectiveIncompleteMapTasks(mapCapacity, noOfMaps, mapProgress);
                // bail out early
                int currentMapSlotsBackFill = (int) (maxMapLoad - incompleteMapTasks);
                if (currentMapSlotsBackFill <= 0) {
                    // reset the reduce task load since we are bailing out
                    incompleteReduceTasks = totalReduceTasks;
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Terminating overload check due to high map load.");
                    }
                    break;
                }
                // compute the real reduce load
                if (noOfReduces > 0) {
                    incompleteReduceTasks += calcEffectiveIncompleteReduceTasks(reduceCapacity, noOfReduces, reduceProgress);
                }
                // bail out early
                int currentReduceSlotsBackFill = (int) (maxReduceLoad - incompleteReduceTasks);
                if (currentReduceSlotsBackFill <= 0) {
                    // reset the map task load since we are bailing out
                    incompleteMapTasks = totalMapTasks;
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Terminating overload check due to high reduce load.");
                    }
                    break;
                }
            } else {
                LOG.warn("Blacklisting empty job: " + id);
                blacklistedJobs.add(id);
            }
        }
        // calculate the real map load on the cluster
        mapSlotsBackFill = (int) (maxMapLoad - incompleteMapTasks);
        // calculate the real reduce load on the cluster
        reduceSlotsBackFill = (int) (maxReduceLoad - incompleteReduceTasks);
        // clean up the blacklisted set to keep the memory footprint minimal
        // retain only the jobs that are seen in this cycle
        blacklistedJobs.retainAll(seenJobIDs);
        if (LOG.isDebugEnabled() && !blacklistedJobs.isEmpty()) {
            LOG.debug("Blacklisted jobs count: " + blacklistedJobs.size());
        }
    }
    // update the load status with the computed backfill slots
    loadStatus.updateMapLoad(mapSlotsBackFill);
    loadStatus.updateReduceLoad(reduceSlotsBackFill);
    if (loadStatus.getMapLoad() <= 0) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(System.currentTimeMillis() + " [MAP-LOAD] Overloaded is " + Boolean.TRUE.toString() + " MapSlotsBackfill is " + loadStatus.getMapLoad());
        }
        // stop calculation because we know it is overloaded.
        return;
    }
    if (loadStatus.getReduceLoad() <= 0) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(System.currentTimeMillis() + " [REDUCE-LOAD] Overloaded is " + Boolean.TRUE.toString() + " ReduceSlotsBackfill is " + loadStatus.getReduceLoad());
        }
        // stop calculation because we know it is overloaded.
        return;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(System.currentTimeMillis() + " [OVERALL] Overloaded is " + Boolean.FALSE.toString() + " Current load status is " + loadStatus);
    }
}
Also used: JobStatus (org.apache.hadoop.mapreduce.JobStatus), JobID (org.apache.hadoop.mapreduce.JobID), HashSet (java.util.HashSet), JobStats (org.apache.hadoop.mapred.gridmix.Statistics.JobStats)
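
The overload check above delegates the per-job estimate to calcEffectiveIncompleteMapTasks and calcEffectiveIncompleteReduceTasks, whose bodies are not shown in this example. Below is a minimal, runnable sketch of what such an estimate could look like, assuming a simple remaining-work model capped by slot capacity; the name effectiveIncompleteTasks and the capping rule are illustrative assumptions, not the actual Hadoop code.

public class EffectiveLoadSketch {

    // Estimates how many of a job's tasks still need slots. The raw
    // remainder numTasks * (1 - progress) is capped at the slot capacity,
    // since a job cannot occupy more slots than the cluster has.
    // (Assumed model for illustration; not Hadoop's implementation.)
    static float effectiveIncompleteTasks(int slotCapacity, int numTasks, float progress) {
        // clamp progress into [0, 1] to guard against stale or unpolled values,
        // mirroring the method above, which leaves progress at 0f when the
        // JobStatus is missing
        float p = Math.max(0.0f, Math.min(1.0f, progress));
        float remaining = numTasks * (1.0f - p);
        return Math.min(remaining, slotCapacity);
    }

    public static void main(String[] args) {
        // a job with 100 maps at 40% progress on a 50-slot cluster: capped at 50.0
        System.out.println(effectiveIncompleteTasks(50, 100, 0.4f));
        // the same job at 80% progress: roughly 20 incomplete tasks remain
        System.out.println(effectiveIncompleteTasks(50, 100, 0.8f));
    }
}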

Example 2 with JobStats

Use of org.apache.hadoop.mapred.gridmix.Statistics.JobStats in project hadoop by apache.

The class TestGridmixSummary, method testExecutionSummarizer:

/**
   * Test {@link ExecutionSummarizer}.
   */
@Test
@SuppressWarnings({ "unchecked", "rawtypes" })
public void testExecutionSummarizer() throws IOException {
    Configuration conf = new Configuration();
    ExecutionSummarizer es = new ExecutionSummarizer();
    assertEquals("ExecutionSummarizer init failed", Summarizer.NA, es.getCommandLineArgsString());
    long startTime = System.currentTimeMillis();
    // test configuration parameters
    String[] initArgs = new String[] { "-Xmx20m", "-Dtest.args='test'" };
    es = new ExecutionSummarizer(initArgs);
    assertEquals("ExecutionSummarizer init failed", "-Xmx20m -Dtest.args='test'", es.getCommandLineArgsString());
    // test start time
    assertTrue("Start time mismatch", es.getStartTime() >= startTime);
    assertTrue("Start time mismatch", es.getStartTime() <= System.currentTimeMillis());
    // test start() of ExecutionSummarizer
    es.update(null);
    assertEquals("ExecutionSummarizer init failed", 0, es.getSimulationStartTime());
    testExecutionSummarizer(0, 0, 0, 0, 0, 0, 0, es);
    long simStartTime = System.currentTimeMillis();
    es.start(null);
    assertTrue("Simulation start time mismatch", es.getSimulationStartTime() >= simStartTime);
    assertTrue("Simulation start time mismatch", es.getSimulationStartTime() <= System.currentTimeMillis());
    // test with job stats
    JobStats stats = generateFakeJobStats(1, 10, true, false);
    es.update(stats);
    testExecutionSummarizer(1, 10, 0, 1, 1, 0, 0, es);
    // test with failed job 
    stats = generateFakeJobStats(5, 1, false, false);
    es.update(stats);
    testExecutionSummarizer(6, 11, 0, 2, 1, 1, 0, es);
    // test with successful but lost job 
    stats = generateFakeJobStats(1, 1, true, true);
    es.update(stats);
    testExecutionSummarizer(7, 12, 0, 3, 1, 1, 1, es);
    // test with failed but lost job 
    stats = generateFakeJobStats(2, 2, false, true);
    es.update(stats);
    testExecutionSummarizer(9, 14, 0, 4, 1, 1, 2, es);
    // test finalize
    //  define a fake job factory
    JobFactory factory = new FakeJobFactory(conf);
    // fake the num jobs in trace
    factory.numJobsInTrace = 3;
    Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"));
    Path testDir = new Path(rootTempDir, "testGridmixSummary");
    Path testTraceFile = new Path(testDir, "test-trace.json");
    FileSystem fs = FileSystem.getLocal(conf);
    fs.create(testTraceFile).close();
    // finalize the summarizer
    UserResolver resolver = new RoundRobinUserResolver();
    DataStatistics dataStats = new DataStatistics(100, 2, true);
    String policy = GridmixJobSubmissionPolicy.REPLAY.name();
    conf.set(GridmixJobSubmissionPolicy.JOB_SUBMISSION_POLICY, policy);
    es.finalize(factory, testTraceFile.toString(), 1024L, resolver, dataStats, conf);
    // test num jobs in trace
    assertEquals("Mismtach in num jobs in trace", 3, es.getNumJobsInTrace());
    // test trace signature
    String tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    // test trace location
    Path qPath = fs.makeQualified(testTraceFile);
    assertEquals("Mismatch in trace filename", qPath.toString(), es.getInputTraceLocation());
    // test expected data size
    assertEquals("Mismatch in expected data size", "1 K", es.getExpectedDataSize());
    // test input data statistics
    assertEquals("Mismatch in input data statistics", ExecutionSummarizer.stringifyDataStatistics(dataStats), es.getInputDataStatistics());
    // test user resolver
    assertEquals("Mismatch in user resolver", resolver.getClass().getName(), es.getUserResolver());
    // test policy
    assertEquals("Mismatch in policy", policy, es.getJobSubmissionPolicy());
    // test data stringification using large data
    es.finalize(factory, testTraceFile.toString(), 1024 * 1024 * 1024 * 10L, resolver, dataStats, conf);
    assertEquals("Mismatch in expected data size", "10 G", es.getExpectedDataSize());
    // test trace signature uniqueness
    //  recreate the trace file so its signature changes
    fs.delete(testTraceFile, false);
    //  sleep for 1 sec to guarantee a new modification time
    try {
        Thread.sleep(1000);
    } catch (InterruptedException ie) {
        // ignore; the delay is best-effort
    }
    fs.create(testTraceFile).close();
    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, conf);
    // test missing expected data size
    assertEquals("Mismatch in trace data size", Summarizer.NA, es.getExpectedDataSize());
    assertFalse("Mismatch in trace signature", tid.equals(es.getInputTraceSignature()));
    // get the new identifier
    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    testTraceFile = new Path(testDir, "test-trace2.json");
    fs.create(testTraceFile).close();
    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, conf);
    assertFalse("Mismatch in trace signature", tid.equals(es.getInputTraceSignature()));
    // get the new identifier
    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
    assertEquals("Mismatch in trace signature", tid, es.getInputTraceSignature());
    // finalize trace identifier '-' input
    es.finalize(factory, "-", 0L, resolver, dataStats, conf);
    assertEquals("Mismatch in trace signature", Summarizer.NA, es.getInputTraceSignature());
    assertEquals("Mismatch in trace file location", Summarizer.NA, es.getInputTraceLocation());
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), DataStatistics (org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics), JobStats (org.apache.hadoop.mapred.gridmix.Statistics.JobStats), FileSystem (org.apache.hadoop.fs.FileSystem), Test (org.junit.Test)
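
This test leans on two helpers not shown in the example: generateFakeJobStats, which fabricates a JobStats with the given map/reduce counts and outcome, and a seven-argument testExecutionSummarizer overload that asserts the summarizer's counters. Working backwards from the expected values, the bookkeeping appears to be cumulative: map and reduce counts are summed, every update counts one job, and a lost job increments only the lost counter whether or not it succeeded. A sketch of that accounting under those assumptions follows; SummarySketch and its field names are illustrative, not the real ExecutionSummarizer.

// Bookkeeping inferred from the assertion sequence above; the field names
// and the lost-vs-failed rule are assumptions, not Hadoop's code.
class SummarySketch {
    long totalMaps, totalReduces, totalJobs, succeeded, failed, lost;

    void update(int maps, int reduces, boolean success, boolean isLost) {
        totalMaps += maps;       // 1 -> 6 -> 7 -> 9 across the four updates
        totalReduces += reduces; // 10 -> 11 -> 12 -> 14
        totalJobs++;             // 1 -> 2 -> 3 -> 4
        if (isLost) {
            lost++;              // 0 -> 0 -> 1 -> 2; outcome is ignored for lost jobs
        } else if (success) {
            succeeded++;         // 1 after the first update, then unchanged
        } else {
            failed++;            // 1 after the second update, then unchanged
        }
    }
}

The third value asserted (always 0 here) is presumably a quantity the fake stats never populate, such as simulated data size, so it is omitted from the sketch.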

Aggregations

JobStats (org.apache.hadoop.mapred.gridmix.Statistics.JobStats): 2 examples
HashSet (java.util.HashSet): 1 example
Configuration (org.apache.hadoop.conf.Configuration): 1 example
FileSystem (org.apache.hadoop.fs.FileSystem): 1 example
Path (org.apache.hadoop.fs.Path): 1 example
DataStatistics (org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics): 1 example
JobID (org.apache.hadoop.mapreduce.JobID): 1 example
JobStatus (org.apache.hadoop.mapreduce.JobStatus): 1 example
Test (org.junit.Test): 1 example