Use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.
The class TeraInputFormat, method writePartitionFile.
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
* @param job the job to sample
* @param partFile where to write the output file to
* @throws Throwable if something goes wrong
*/
public static void writePartitionFile(final JobContext job, Path partFile)
    throws Throwable {
  long t1 = System.currentTimeMillis();
  Configuration conf = job.getConfiguration();
  final TeraInputFormat inFormat = new TeraInputFormat();
  final TextSampler sampler = new TextSampler();
  int partitions = job.getNumReduceTasks();
  long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
      TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
  final List<InputSplit> splits = inFormat.getSplits(job);
  long t2 = System.currentTimeMillis();
  System.out.println("Computing input splits took " + (t2 - t1) + "ms");
  int samples = Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(),
      TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), splits.size());
  System.out.println("Sampling " + samples + " splits of " + splits.size());
  final long recordsPerSample = sampleSize / samples;
  final int sampleStep = splits.size() / samples;
  Thread[] samplerReader = new Thread[samples];
  SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    final int idx = i;
    samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
      {
        setDaemon(true);
      }

      public void run() {
        long records = 0;
        try {
          TaskAttemptContext context =
              new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
          RecordReader<Text, Text> reader =
              inFormat.createRecordReader(splits.get(sampleStep * idx), context);
          reader.initialize(splits.get(sampleStep * idx), context);
          while (reader.nextKeyValue()) {
            sampler.addKey(new Text(reader.getCurrentKey()));
            records += 1;
            if (recordsPerSample <= records) {
              break;
            }
          }
        } catch (IOException ie) {
          System.err.println("Got an exception while reading splits " +
              StringUtils.stringifyException(ie));
          throw new RuntimeException(ie);
        } catch (InterruptedException e) {
          // interrupted while sampling; let this reader thread exit quietly
        }
      }
    };
    samplerReader[i].start();
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
      outFs.getDefaultBlockSize(partFile));
  for (int i = 0; i < samples; i++) {
    try {
      samplerReader[i].join();
      if (threadGroup.getThrowable() != null) {
        throw threadGroup.getThrowable();
      }
    } catch (InterruptedException e) {
      // ignore and continue joining the remaining sampler threads
    }
  }
  for (Text split : sampler.createPartitions(partitions)) {
    split.write(writer);
  }
  writer.close();
  long t3 = System.currentTimeMillis();
  System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
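Outside a running MapReduce task there is no real attempt context, so writePartitionFile() fabricates one: the no-argument TaskAttemptID() constructor yields a placeholder ID whose only purpose is to satisfy the createRecordReader()/initialize() API. The sketch below isolates that pattern with a stock TextInputFormat; it is an illustrative standalone reader rather than code from the Hadoop tree, and the input path taken from args[0] is an assumption.

// Minimal sketch, assuming an existing input path in args[0]: drive a RecordReader
// outside a real task by wrapping a default TaskAttemptID in a TaskAttemptContextImpl,
// the same trick the sampler threads above use.
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class StandaloneSplitReader {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    FileInputFormat.addInputPath(job, new Path(args[0]));

    TextInputFormat inFormat = new TextInputFormat();
    List<InputSplit> splits = inFormat.getSplits(job);

    // No real task is running, so a default (synthetic) TaskAttemptID is enough.
    TaskAttemptContext context =
        new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    RecordReader<LongWritable, Text> reader =
        inFormat.createRecordReader(splits.get(0), context);
    reader.initialize(splits.get(0), context);
    long records = 0;
    while (reader.nextKeyValue() && records < 10) {
      System.out.println(reader.getCurrentValue());
      records++;
    }
    reader.close();
  }
}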
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.
The class JobBuilder, method build.
/**
* Request the builder to build the final object. Once called, the
* {@link JobBuilder} would accept no more events or job-conf properties.
*
* @return Parsed {@link ParsedJob} object.
*/
public ParsedJob build() {
  // The main job here is to build CDFs and manage the conf
  finalized = true;
  // set the conf
  if (jobConfigurationParameters != null) {
    result.setJobProperties(jobConfigurationParameters);
  }
  // initialize all the per-job statistics gathering places
  Histogram[] successfulMapAttemptTimes =
      new Histogram[ParsedHost.numberOfDistances() + 1];
  for (int i = 0; i < successfulMapAttemptTimes.length; ++i) {
    successfulMapAttemptTimes[i] = new Histogram();
  }
  Histogram successfulReduceAttemptTimes = new Histogram();
  Histogram[] failedMapAttemptTimes =
      new Histogram[ParsedHost.numberOfDistances() + 1];
  for (int i = 0; i < failedMapAttemptTimes.length; ++i) {
    failedMapAttemptTimes[i] = new Histogram();
  }
  Histogram failedReduceAttemptTimes = new Histogram();
  Histogram successfulNthMapperAttempts = new Histogram();
  for (LoggedTask task : result.getMapTasks()) {
    for (LoggedTaskAttempt attempt : task.getAttempts()) {
      int distance = successfulMapAttemptTimes.length - 1;
      Long runtime = null;
      if (attempt.getFinishTime() > 0 && attempt.getStartTime() > 0) {
        runtime = attempt.getFinishTime() - attempt.getStartTime();
        if (attempt.getResult() == Values.SUCCESS) {
          LoggedLocation host = attempt.getLocation();
          List<LoggedLocation> locs = task.getPreferredLocations();
          if (host != null && locs != null) {
            for (LoggedLocation loc : locs) {
              ParsedHost preferedLoc = new ParsedHost(loc);
              distance = Math.min(distance, preferedLoc.distance(new ParsedHost(host)));
            }
            // mapperLocality.enter(distance);
          }
          if (attempt.getStartTime() > 0 && attempt.getFinishTime() > 0) {
            if (runtime != null) {
              successfulMapAttemptTimes[distance].enter(runtime);
            }
          }
          TaskAttemptID attemptID = attempt.getAttemptID();
          if (attemptID != null) {
            successfulNthMapperAttempts.enter(attemptID.getId());
          }
        } else {
          if (attempt.getResult() == Pre21JobHistoryConstants.Values.FAILED) {
            if (runtime != null) {
              failedMapAttemptTimes[distance].enter(runtime);
            }
          }
        }
      }
    }
  }
  for (LoggedTask task : result.getReduceTasks()) {
    for (LoggedTaskAttempt attempt : task.getAttempts()) {
      Long runtime = attempt.getFinishTime() - attempt.getStartTime();
      if (attempt.getFinishTime() > 0 && attempt.getStartTime() > 0) {
        runtime = attempt.getFinishTime() - attempt.getStartTime();
      }
      if (attempt.getResult() == Values.SUCCESS) {
        if (runtime != null) {
          successfulReduceAttemptTimes.enter(runtime);
        }
      } else if (attempt.getResult() == Pre21JobHistoryConstants.Values.FAILED) {
        failedReduceAttemptTimes.enter(runtime);
      }
    }
  }
  result.setFailedMapAttemptCDFs(mapCDFArrayList(failedMapAttemptTimes));
  LoggedDiscreteCDF failedReduce = new LoggedDiscreteCDF();
  failedReduce.setCDF(failedReduceAttemptTimes, attemptTimesPercentiles, 100);
  result.setFailedReduceAttemptCDF(failedReduce);
  result.setSuccessfulMapAttemptCDFs(mapCDFArrayList(successfulMapAttemptTimes));
  LoggedDiscreteCDF succReduce = new LoggedDiscreteCDF();
  succReduce.setCDF(successfulReduceAttemptTimes, attemptTimesPercentiles, 100);
  result.setSuccessfulReduceAttemptCDF(succReduce);
  long totalSuccessfulAttempts = 0L;
  long maxTriesToSucceed = 0L;
  for (Map.Entry<Long, Long> ent : successfulNthMapperAttempts) {
    totalSuccessfulAttempts += ent.getValue();
    maxTriesToSucceed = Math.max(maxTriesToSucceed, ent.getKey());
  }
  if (totalSuccessfulAttempts > 0L) {
    double[] successAfterI = new double[(int) maxTriesToSucceed + 1];
    for (int i = 0; i < successAfterI.length; ++i) {
      successAfterI[i] = 0.0D;
    }
    for (Map.Entry<Long, Long> ent : successfulNthMapperAttempts) {
      successAfterI[ent.getKey().intValue()] =
          ((double) ent.getValue()) / totalSuccessfulAttempts;
    }
    result.setMapperTriesToSucceed(successAfterI);
  } else {
    result.setMapperTriesToSucceed(null);
  }
  return result;
}
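The successfulNthMapperAttempts histogram above keys on attemptID.getId(), the zero-based attempt number within a task, to record how many tries each successful mapper needed. As a reminder of what the other TaskAttemptID accessors return, here is a small hedged sketch that parses a canonical attempt string; the sample ID is illustrative only.

// Hedged sketch: the TaskAttemptID accessors JobBuilder relies on. forName() parses
// the canonical attempt string; getId() is the zero-based attempt number that feeds
// successfulNthMapperAttempts above. The sample ID below is illustrative only.
import org.apache.hadoop.mapreduce.TaskAttemptID;

public class AttemptIdProbe {
  public static void main(String[] args) {
    TaskAttemptID attempt =
        TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");
    System.out.println(attempt.getJobID());    // job_200707121733_0003
    System.out.println(attempt.getTaskID());   // task_200707121733_0003_m_000005
    System.out.println(attempt.getTaskType()); // MAP
    System.out.println(attempt.getId());       // 0, i.e. the first attempt of that task
  }
}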
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.
The class ZombieJob, method getLoggedTaskAttempt.
private LoggedTaskAttempt getLoggedTaskAttempt(TaskType taskType, int taskNumber,
    int taskAttemptNumber) {
  buildMaps();
  TaskAttemptID id =
      new TaskAttemptID(getMaskedTaskID(taskType, taskNumber), taskAttemptNumber);
  return loggedTaskAttemptMap.get(id);
}
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.
The class TestLineRecordReader, method testMultipleClose.
@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader()
      .getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH,
      Integer.MAX_VALUE);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  // read all records in the compressed file, then close the reader twice
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
  LineRecordReader reader = new LineRecordReader();
  reader.initialize(split, context);
  //noinspection StatementWithEmptyBody
  while (reader.nextKeyValue()) ;
  reader.close();
  reader.close();
  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}
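The final assertion relies on CodecPool behaviour: LineRecordReader.close() returns its decompressor to the pool, and a buggy double return could let the pool hand the same instance out to more than one caller, so fewer than 10 distinct decompressors would come back from the loop. A small hedged sketch of the get/return cycle, standalone and not part of the test:

// Hedged sketch: why returning a decompressor twice is harmful.
// CodecPool.returnDecompressor() puts the instance back in the pool, so a double
// return could let two later getDecompressor() calls receive the same object.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecPoolDemo {
  public static void main(String[] args) {
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(new Configuration());

    Decompressor d = CodecPool.getDecompressor(codec);
    CodecPool.returnDecompressor(d);
    // Returning d a second time here would re-insert the same instance, so two
    // different readers could end up sharing one decompressor.

    Decompressor again = CodecPool.getDecompressor(codec);
    System.out.println(d == again); // typically true: the pooled instance is reused
    CodecPool.returnDecompressor(again);
  }
}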
Use of org.apache.hadoop.mapreduce.TaskAttemptID in project hadoop by apache.
The class TestLineRecordReader, method testUncompressedInputDefaultDelimiterPosValue.
@Test
public void testUncompressedInputDefaultDelimiterPosValue() throws Exception {
  Configuration conf = new Configuration();
  String inputData = "1234567890\r\n12\r\n345";
  Path inputFile = createInputFile(conf, inputData);
  conf.setInt("io.file.buffer.size", 10);
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH,
      Integer.MAX_VALUE);
  FileSplit split = new FileSplit(inputFile, 0, 15, (String[]) null);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  LineRecordReader reader = new LineRecordReader(null);
  reader.initialize(split, context);
  LongWritable key;
  Text value;
  reader.nextKeyValue();
  key = reader.getCurrentKey();
  value = reader.getCurrentValue();
  // Get first record:"1234567890"
  assertEquals(10, value.getLength());
  assertEquals(0, key.get());
  reader.nextKeyValue();
  // Get second record:"12"
  assertEquals(2, value.getLength());
  // Key should be 12 right after "1234567890\r\n"
  assertEquals(12, key.get());
  assertFalse(reader.nextKeyValue());
  // Key should be 16 right after "1234567890\r\n12\r\n"
  assertEquals(16, key.get());
  split = new FileSplit(inputFile, 15, 4, (String[]) null);
  reader = new LineRecordReader(null);
  reader.initialize(split, context);
  // The second split dropped the first record "\n"
  reader.nextKeyValue();
  key = reader.getCurrentKey();
  value = reader.getCurrentValue();
  // Get third record:"345"
  assertEquals(3, value.getLength());
  // Key should be 16 right after "1234567890\r\n12\r\n"
  assertEquals(16, key.get());
  assertFalse(reader.nextKeyValue());
  // Key should be 19 right after "1234567890\r\n12\r\n345"
  assertEquals(19, key.get());
  inputData = "123456789\r\r\n";
  inputFile = createInputFile(conf, inputData);
  split = new FileSplit(inputFile, 0, 12, (String[]) null);
  reader = new LineRecordReader(null);
  reader.initialize(split, context);
  reader.nextKeyValue();
  key = reader.getCurrentKey();
  value = reader.getCurrentValue();
  // Get first record:"123456789"
  assertEquals(9, value.getLength());
  assertEquals(0, key.get());
  reader.nextKeyValue();
  // Get second record:""
  assertEquals(0, value.getLength());
  // Key should be 10 right after "123456789\r"
  assertEquals(10, key.get());
  assertFalse(reader.nextKeyValue());
  // Key should be 12 right after "123456789\r\r\n"
  assertEquals(12, key.get());
}