Example 6 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.

the class Fetcher method copyMapOutput.

private TaskAttemptID[] copyMapOutput(MapHost host, DataInputStream input, Set<TaskAttemptID> remaining, boolean canRetry) throws IOException {
    MapOutput<K, V> mapOutput = null;
    TaskAttemptID mapId = null;
    long decompressedLength = -1;
    long compressedLength = -1;
    try {
        long startTime = Time.monotonicNow();
        int forReduce = -1;
        //Read the shuffle header
        try {
            ShuffleHeader header = new ShuffleHeader();
            header.readFields(input);
            mapId = TaskAttemptID.forName(header.mapId);
            compressedLength = header.compressedLength;
            decompressedLength = header.uncompressedLength;
            forReduce = header.forReduce;
        } catch (IllegalArgumentException e) {
            badIdErrs.increment(1);
            LOG.warn("Invalid map id ", e);
            //Don't know which one was bad, so consider all of them as bad
            return remaining.toArray(new TaskAttemptID[remaining.size()]);
        }
        InputStream is = input;
        is = CryptoUtils.wrapIfNecessary(jobConf, is, compressedLength);
        compressedLength -= CryptoUtils.cryptoPadding(jobConf);
        decompressedLength -= CryptoUtils.cryptoPadding(jobConf);
        // Do some basic sanity verification
        if (!verifySanity(compressedLength, decompressedLength, forReduce, remaining, mapId)) {
            return new TaskAttemptID[] { mapId };
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("header: " + mapId + ", len: " + compressedLength + ", decomp len: " + decompressedLength);
        }
        // Get the location for the map output - either in-memory or on-disk
        try {
            mapOutput = merger.reserve(mapId, decompressedLength, id);
        } catch (IOException ioe) {
            // kill this reduce attempt
            ioErrs.increment(1);
            scheduler.reportLocalError(ioe);
            return EMPTY_ATTEMPT_ID_ARRAY;
        }
        // Check if we can shuffle *now* ...
        if (mapOutput == null) {
            LOG.info("fetcher#" + id + " - MergeManager returned status WAIT ...");
            //Not an error but wait to process data.
            return EMPTY_ATTEMPT_ID_ARRAY;
        }
        // Re-throw decompression errors (including java.lang.InternalError from
        // native codecs) as IOException to allow fetch failure logic to be processed
        try {
            // Go!
            LOG.info("fetcher#" + id + " about to shuffle output of map " + mapOutput.getMapId() + " decomp: " + decompressedLength + " len: " + compressedLength + " to " + mapOutput.getDescription());
            mapOutput.shuffle(host, is, compressedLength, decompressedLength, metrics, reporter);
        } catch (java.lang.InternalError | Exception e) {
            LOG.warn("Failed to shuffle for fetcher#" + id, e);
            throw new IOException(e);
        }
        // Inform the shuffle scheduler
        long endTime = Time.monotonicNow();
        // Reset retryStartTime as map task make progress if retried before.
        retryStartTime = 0;
        scheduler.copySucceeded(mapId, host, compressedLength, startTime, endTime, mapOutput);
        // Note successful shuffle
        remaining.remove(mapId);
        metrics.successFetch();
        return null;
    } catch (IOException ioe) {
        if (mapOutput != null) {
            mapOutput.abort();
        }
        if (canRetry) {
            checkTimeoutOrRetry(host, ioe);
        }
        ioErrs.increment(1);
        if (mapId == null || mapOutput == null) {
            LOG.warn("fetcher#" + id + " failed to read map header" + mapId + " decomp: " + decompressedLength + ", " + compressedLength, ioe);
            if (mapId == null) {
                return remaining.toArray(new TaskAttemptID[remaining.size()]);
            } else {
                return new TaskAttemptID[] { mapId };
            }
        }
        LOG.warn("Failed to shuffle output of " + mapId + " from " + host.getHostName(), ioe);
        // Inform the shuffle-scheduler
        metrics.failedFetch();
        return new TaskAttemptID[] { mapId };
    }
}
Also used : TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) DataInputStream(java.io.DataInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) GeneralSecurityException(java.security.GeneralSecurityException) ConnectException(java.net.ConnectException) MalformedURLException(java.net.MalformedURLException)
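
The mapId arrives in the shuffle header as a plain string, and TaskAttemptID.forName is what turns it back into a typed id, throwing IllegalArgumentException on malformed input, which is exactly the case the badIdErrs branch above guards against. A minimal, self-contained round-trip sketch (the attempt id string is an example value, not taken from the code above):

import org.apache.hadoop.mapreduce.TaskAttemptID;

public class ForNameDemo {
    public static void main(String[] args) {
        // A well-formed attempt id of the usual attempt_<jt>_<job>_<type>_<task>_<attempt> shape.
        TaskAttemptID mapId = TaskAttemptID.forName("attempt_201103121733_0001_m_000005_0");
        System.out.println(mapId.getJobID() + " / " + mapId.getTaskID() + " / " + mapId.getId());
        // Malformed ids fail the same way the Fetcher guards against.
        try {
            TaskAttemptID.forName("not_an_attempt_id");
        } catch (IllegalArgumentException e) {
            System.out.println("bad id: " + e.getMessage());
        }
    }
}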

Example 7 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.

the class Fetcher method copyFromHost.

/**
   * The crux of the matter...
   * 
   * @param host {@link MapHost} from which we need to  
   *              shuffle available map-outputs.
   */
@VisibleForTesting
protected void copyFromHost(MapHost host) throws IOException {
    // reset retryStartTime for a new host
    retryStartTime = 0;
    // Get completed maps on 'host'
    List<TaskAttemptID> maps = scheduler.getMapsForHost(host);
    // Sanity check to catch hosts with only 'OBSOLETE' maps,
    // especially at the tail of large jobs
    if (maps.size() == 0) {
        return;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Fetcher " + id + " going to fetch from " + host + " for: " + maps);
    }
    // List of maps to be fetched yet
    Set<TaskAttemptID> remaining = new HashSet<TaskAttemptID>(maps);
    // Construct the url and connect
    URL url = getMapOutputURL(host, maps);
    DataInputStream input = openShuffleUrl(host, remaining, url);
    if (input == null) {
        return;
    }
    try {
        // Loop through available map-outputs and fetch them
        // On any error, failedTasks is not null and we exit
        // after putting back the remaining maps to the 
        // yet_to_be_fetched list and marking the failed tasks.
        TaskAttemptID[] failedTasks = null;
        while (!remaining.isEmpty() && failedTasks == null) {
            try {
                failedTasks = copyMapOutput(host, input, remaining, fetchRetryEnabled);
            } catch (IOException e) {
                IOUtils.cleanup(LOG, input);
                //
                // Setup connection again if disconnected by NM
                connection.disconnect();
                // Get map output from remaining tasks only.
                url = getMapOutputURL(host, remaining);
                input = openShuffleUrl(host, remaining, url);
                if (input == null) {
                    return;
                }
            }
        }
        if (failedTasks != null && failedTasks.length > 0) {
            LOG.warn("copyMapOutput failed for tasks " + Arrays.toString(failedTasks));
            scheduler.hostFailed(host.getHostName());
            for (TaskAttemptID left : failedTasks) {
                scheduler.copyFailed(left, host, true, false);
            }
        }
        // Sanity check
        if (failedTasks == null && !remaining.isEmpty()) {
            throw new IOException("server didn't return all expected map outputs: " + remaining.size() + " left.");
        }
        input.close();
        input = null;
    } finally {
        if (input != null) {
            IOUtils.cleanup(LOG, input);
            input = null;
        }
        for (TaskAttemptID left : remaining) {
            scheduler.putBackKnownMapOutput(host, left);
        }
    }
}
Also used : TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) IOException(java.io.IOException) DataInputStream(java.io.DataInputStream) URL(java.net.URL) HashSet(java.util.HashSet) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
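
The important property of the loop above is that, after any IOException, the fetcher reconnects with a URL rebuilt from the shrinking remaining set, so outputs that were already copied are never requested again. A simplified, hypothetical model of that narrowing-retry pattern (fetchOne and the simulated failure are stand-ins, not Hadoop APIs):

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class NarrowingRetryDemo {
    // Fail the first copy once so the retry path is exercised.
    private static int failuresLeft = 1;

    private static String fetchOne(Set<String> remaining) throws IOException {
        if (failuresLeft-- > 0) {
            throw new IOException("simulated disconnect");
        }
        return remaining.iterator().next(); // pretend this output was copied
    }

    public static void main(String[] args) {
        Set<String> remaining = new HashSet<>(List.of("map_0", "map_1", "map_2"));
        while (!remaining.isEmpty()) {
            try {
                String copied = fetchOne(remaining);
                remaining.remove(copied); // never re-request a finished copy
                System.out.println("copied " + copied + ", remaining " + remaining);
            } catch (IOException e) {
                // In the real Fetcher this is where the URL is rebuilt from 'remaining'.
                System.out.println("reconnect, asking only for " + remaining);
            }
        }
    }
}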

Example 8 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.

the class Fetcher method getMapOutputURL.

/**
   * Create the map-output-url. This will contain all the map ids
   * separated by commas
   * @param host the map host whose base URL is used
   * @param maps the map attempt ids whose outputs should be fetched
   * @return the map-output URL for the given host and maps
   * @throws MalformedURLException if the constructed URL is not a valid URL
   */
private URL getMapOutputURL(MapHost host, Collection<TaskAttemptID> maps) throws MalformedURLException {
    // Get the base url
    StringBuffer url = new StringBuffer(host.getBaseUrl());
    boolean first = true;
    for (TaskAttemptID mapId : maps) {
        if (!first) {
            url.append(",");
        }
        url.append(mapId);
        first = false;
    }
    LOG.debug("MapOutput URL for " + host + " -> " + url.toString());
    return new URL(url.toString());
}
Also used : TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) URL(java.net.URL)
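
The method only has to append the comma-separated attempt ids to host.getBaseUrl(). A rough illustration of the resulting URL; the base-url shape below is an assumption for the example, not quoted from the Hadoop source:

import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

public class MapOutputUrlDemo {
    public static void main(String[] args) throws MalformedURLException {
        // Assumed base-url shape, for illustration only.
        String baseUrl = "http://node1:13562/mapOutput?job=job_201103121733_0001&reduce=0&map=";
        List<String> maps = List.of(
                "attempt_201103121733_0001_m_000000_0",
                "attempt_201103121733_0001_m_000001_0");
        // Same comma-joining that getMapOutputURL does with its StringBuffer loop.
        URL url = new URL(baseUrl + String.join(",", maps));
        System.out.println(url);
    }
}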

Example 9 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.

the class LocalFetcher method doCopy.

/**
   * The crux of the matter...
   */
private void doCopy(Set<TaskAttemptID> maps) throws IOException {
    Iterator<TaskAttemptID> iter = maps.iterator();
    while (iter.hasNext()) {
        TaskAttemptID map = iter.next();
        LOG.debug("LocalFetcher " + id + " going to fetch: " + map);
        if (copyMapOutput(map)) {
            // Successful copy. Remove this from our worklist.
            iter.remove();
        } else {
            // We got back a WAIT command; go back to the outer loop
            // and block for InMemoryMerge.
            break;
        }
    }
}
Also used : TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID)
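
doCopy drops finished maps from the worklist through the Iterator itself; calling maps.remove(map) inside the loop would risk a ConcurrentModificationException on the next iteration. A tiny stand-alone illustration of the idiom (the worklist contents are made up):

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class WorklistDemo {
    public static void main(String[] args) {
        Set<String> worklist = new HashSet<>(Set.of("map_0", "map_1", "map_2"));
        Iterator<String> iter = worklist.iterator();
        while (iter.hasNext()) {
            String map = iter.next();
            boolean copied = !map.equals("map_2"); // pretend map_2 has to wait
            if (copied) {
                iter.remove(); // safe removal while iterating
            } else {
                break;         // leave the rest on the worklist
            }
        }
        System.out.println("left on worklist: " + worklist);
    }
}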

Example 10 with TaskAttemptID

use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.

the class TestClientRedirect method testRedirect.

@Test
public void testRedirect() throws Exception {
    Configuration conf = new YarnConfiguration();
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME);
    conf.set(YarnConfiguration.RM_ADDRESS, RMADDRESS);
    conf.set(JHAdminConfig.MR_HISTORY_ADDRESS, HSHOSTADDRESS);
    // Start the RM.
    RMService rmService = new RMService("test");
    rmService.init(conf);
    rmService.start();
    // Start the AM.
    AMService amService = new AMService();
    amService.init(conf);
    amService.start(conf);
    // Start the HS.
    HistoryService historyService = new HistoryService();
    historyService.init(conf);
    historyService.start(conf);
    LOG.info("services started");
    Cluster cluster = new Cluster(conf);
    org.apache.hadoop.mapreduce.JobID jobID = new org.apache.hadoop.mapred.JobID("201103121733", 1);
    org.apache.hadoop.mapreduce.Counters counters = cluster.getJob(jobID).getCounters();
    validateCounters(counters);
    Assert.assertTrue(amContact);
    LOG.info("Sleeping for 5 seconds before stop for" + " the client socket to not get EOF immediately..");
    Thread.sleep(5000);
    //bring down the AM service
    amService.stop();
    LOG.info("Sleeping for 5 seconds after stop for" + " the server to exit cleanly..");
    Thread.sleep(5000);
    amRestarting = true;
    // Same client
    //results are returned from fake (not started job)
    counters = cluster.getJob(jobID).getCounters();
    Assert.assertEquals(0, counters.countCounters());
    Job job = cluster.getJob(jobID);
    org.apache.hadoop.mapreduce.TaskID taskId = new org.apache.hadoop.mapreduce.TaskID(jobID, TaskType.MAP, 0);
    TaskAttemptID tId = new TaskAttemptID(taskId, 0);
    //invoke all methods to check that no exception is thrown
    job.killJob();
    job.killTask(tId);
    job.failTask(tId);
    job.getTaskCompletionEvents(0, 100);
    job.getStatus();
    job.getTaskDiagnostics(tId);
    job.getTaskReports(TaskType.MAP);
    job.getTrackingURL();
    amRestarting = false;
    amService = new AMService();
    amService.init(conf);
    amService.start(conf);
    //reset
    amContact = false;
    counters = cluster.getJob(jobID).getCounters();
    validateCounters(counters);
    Assert.assertTrue(amContact);
    // Stop the AM. It is not even restarting. So it should be treated as
    // completed.
    amService.stop();
    // Same client
    counters = cluster.getJob(jobID).getCounters();
    validateCounters(counters);
    Assert.assertTrue(hsContact);
    rmService.stop();
    historyService.stop();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) Cluster(org.apache.hadoop.mapreduce.Cluster) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Job(org.apache.hadoop.mapreduce.Job) Test(org.junit.Test)
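
The test assembles the TaskAttemptID it needs by hand from a JobID and a TaskID; the same constructors work on their own, and the id round-trips through its string form. A small standalone sketch (the job identifier is just the example value the test uses):

import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;

public class TaskAttemptIdDemo {
    public static void main(String[] args) {
        JobID jobId = new JobID("201103121733", 1);
        TaskID taskId = new TaskID(jobId, TaskType.MAP, 0);
        TaskAttemptID attemptId = new TaskAttemptID(taskId, 0);
        // Prints something like attempt_201103121733_0001_m_000000_0
        System.out.println(attemptId);
        // The string form parses back to an equal id.
        System.out.println(attemptId.equals(TaskAttemptID.forName(attemptId.toString())));
    }
}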

Aggregations

TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 78 usages
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 35 usages
Test (org.junit.Test): 34 usages
Configuration (org.apache.hadoop.conf.Configuration): 28 usages
Path (org.apache.hadoop.fs.Path): 25 usages
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 22 usages
IOException (java.io.IOException): 19 usages
JobID (org.apache.hadoop.mapreduce.JobID): 16 usages
TaskID (org.apache.hadoop.mapreduce.TaskID): 15 usages
File (java.io.File): 14 usages
Job (org.apache.hadoop.mapreduce.Job): 14 usages
ArrayList (java.util.ArrayList): 13 usages
JobContext (org.apache.hadoop.mapreduce.JobContext): 12 usages
LongWritable (org.apache.hadoop.io.LongWritable): 11 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 10 usages
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter): 10 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 9 usages
TaskAttemptInfo (org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo): 8 usages
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl): 8 usages
HashMap (java.util.HashMap): 7 usages