
Example 6 with JobStory

Use of org.apache.hadoop.tools.rumen.JobStory in project hadoop by apache.

The class LoadJob, method buildSplits: for each map task it reads the recorded TaskInfo from the JobStory, computes per-reducer byte and record ratios, and builds one LoadSplit per map, assigning reduce output specifications round-robin across the maps.

@Override
void buildSplits(FilePool inputDir) throws IOException {
    long mapInputBytesTotal = 0L;
    long mapOutputBytesTotal = 0L;
    long mapOutputRecordsTotal = 0L;
    final JobStory jobdesc = getJobDesc();
    if (null == jobdesc) {
        return;
    }
    final int maps = jobdesc.getNumberMaps();
    final int reds = jobdesc.getNumberReduces();
    // Total up input/output bytes and output records over all recorded map tasks.
    for (int i = 0; i < maps; ++i) {
        final TaskInfo info = jobdesc.getTaskInfo(TaskType.MAP, i);
        mapInputBytesTotal += info.getInputBytes();
        mapOutputBytesTotal += info.getOutputBytes();
        mapOutputRecordsTotal += info.getOutputRecords();
    }
    // Per-reducer share of the total map output, expressed as a fraction of
    // all map output bytes and records.
    final double[] reduceRecordRatio = new double[reds];
    final double[] reduceByteRatio = new double[reds];
    for (int i = 0; i < reds; ++i) {
        final TaskInfo info = jobdesc.getTaskInfo(TaskType.REDUCE, i);
        reduceByteRatio[i] = info.getInputBytes() / (1.0 * mapOutputBytesTotal);
        reduceRecordRatio[i] = info.getInputRecords() / (1.0 * mapOutputRecordsTotal);
    }
    final InputStriper striper = new InputStriper(inputDir, mapInputBytesTotal);
    final List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < maps; ++i) {
        // Reduce specs are assigned round-robin: map i carries the specs for
        // reduce tasks i, i + maps, i + 2*maps, ..., giving it nSpec specs.
        final int nSpec = reds / maps + ((reds % maps) > i ? 1 : 0);
        final long[] specBytes = new long[nSpec];
        final long[] specRecords = new long[nSpec];
        final ResourceUsageMetrics[] metrics = new ResourceUsageMetrics[nSpec];
        for (int j = 0; j < nSpec; ++j) {
            final TaskInfo info = jobdesc.getTaskInfo(TaskType.REDUCE, i + j * maps);
            specBytes[j] = info.getOutputBytes();
            specRecords[j] = info.getOutputRecords();
            metrics[j] = info.getResourceUsageMetrics();
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format(
                    "SPEC(%d) %d -> %d %d %d %d %d %d %d",
                    id(), i, i + j * maps,
                    info.getOutputRecords(), info.getOutputBytes(),
                    info.getResourceUsageMetrics().getCumulativeCpuUsage(),
                    info.getResourceUsageMetrics().getPhysicalMemoryUsage(),
                    info.getResourceUsageMetrics().getVirtualMemoryUsage(),
                    info.getResourceUsageMetrics().getHeapUsage()));
            }
        }
        final TaskInfo info = jobdesc.getTaskInfo(TaskType.MAP, i);
        // The trace records the bytes actually read, which may be compressed;
        // compression emulation converts this back to an uncompressed size.
        long possiblyCompressedInputBytes = info.getInputBytes();
        Configuration conf = job.getConfiguration();
        long uncompressedInputBytes = CompressionEmulationUtil.getUncompressedInputBytes(
            possiblyCompressedInputBytes, conf);
        splits.add(new LoadSplit(striper.splitFor(inputDir, uncompressedInputBytes, 3), maps, i,
            uncompressedInputBytes, info.getInputRecords(),
            info.getOutputBytes(), info.getOutputRecords(),
            reduceByteRatio, reduceRecordRatio, specBytes, specRecords,
            info.getResourceUsageMetrics(), metrics));
    }
    pushDescription(id(), splits);
}
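
The inner-loop indexing above is the least obvious part of buildSplits: reduce output specs are assigned to maps round-robin. Below is a minimal, standalone sketch (not part of the Hadoop source; the map and reduce counts are arbitrary example values) that prints which reduce tasks each map ends up carrying specs for, using the same nSpec and i + j * maps formulas as the method above.

public class ReduceSpecAssignmentSketch {
    public static void main(String[] args) {
        // Arbitrary example values, not taken from any real JobStory.
        final int maps = 4;
        final int reds = 10;
        for (int i = 0; i < maps; ++i) {
            // Same formula as buildSplits: reds/maps specs per map, plus one
            // extra spec for each of the first (reds % maps) maps.
            final int nSpec = reds / maps + ((reds % maps) > i ? 1 : 0);
            StringBuilder assigned = new StringBuilder();
            for (int j = 0; j < nSpec; ++j) {
                // Map i carries the spec for reduce task i + j * maps.
                assigned.append(i + j * maps).append(' ');
            }
            System.out.println("map " + i + " -> reduce specs: " + assigned.toString().trim());
        }
    }
}

With maps = 4 and reds = 10 this prints 0 4 8, 1 5 9, 2 6 and 3 7 for maps 0 through 3, so every reduce task's output spec is carried by exactly one map.
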
Also used: ResourceUsageMetrics (org.apache.hadoop.tools.rumen.ResourceUsageMetrics), Configuration (org.apache.hadoop.conf.Configuration), ArrayList (java.util.ArrayList), TaskInfo (org.apache.hadoop.tools.rumen.TaskInfo), JobStory (org.apache.hadoop.tools.rumen.JobStory), InputSplit (org.apache.hadoop.mapreduce.InputSplit)

Aggregations

JobStory (org.apache.hadoop.tools.rumen.JobStory): 6 usages
Configuration (org.apache.hadoop.conf.Configuration): 4 usages
Path (org.apache.hadoop.fs.Path): 3 usages
ArrayList (java.util.ArrayList): 2 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 2 usages
UserGroupInformation (org.apache.hadoop.security.UserGroupInformation): 2 usages
TaskInfo (org.apache.hadoop.tools.rumen.TaskInfo): 2 usages
Test (org.junit.Test): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 1 usage
Job (org.apache.hadoop.mapreduce.Job): 1 usage
TaskType (org.apache.hadoop.mapreduce.TaskType): 1 usage
ResourceUsageMetrics (org.apache.hadoop.tools.rumen.ResourceUsageMetrics): 1 usage