use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project hadoop by apache.
the class TestRecovery method writeOutput.
/**
 * Writes a fixed sequence of key/value records through a {@link TextOutputFormat}
 * record writer created for the given task attempt, then commits the task
 * output using the committer of the output format configured on the context.
 *
 * @param attempt the task attempt whose ID is used to build the attempt context
 * @param conf the configuration used for the context and the output format
 * @throws Exception if writing, closing, or committing fails
 */
private void writeOutput(TaskAttempt attempt, Configuration conf) throws Exception {
  TaskAttemptContext taskContext =
      new TaskAttemptContextImpl(conf, TypeConverter.fromYarn(attempt.getID()));
  TextOutputFormat<?, ?> textFormat = new TextOutputFormat();
  RecordWriter writer = textFormat.getRecordWriter(taskContext);
  NullWritable nothing = NullWritable.get();

  // Every combination of real, null and NullWritable keys/values, in the
  // order in which they are handed to the writer.
  Object[][] records = {
    {key1, val1},
    {null, nothing},
    {null, val1},
    {nothing, val2},
    {key2, nothing},
    {key1, null},
    {null, null},
    {key2, val2},
  };
  try {
    for (Object[] record : records) {
      writer.write(record[0], record[1]);
    }
  } finally {
    // The writer is closed even when one of the writes fails.
    writer.close(taskContext);
  }

  OutputFormat configuredFormat =
      ReflectionUtils.newInstance(taskContext.getOutputFormatClass(), conf);
  OutputCommitter taskCommitter = configuredFormat.getOutputCommitter(taskContext);
  taskCommitter.commitTask(taskContext);
}
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project hadoop by apache.
the class TeraInputFormat method writePartitionFile.
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * <p>One daemon thread per sampled split reads keys into a shared sampler.
 * The calling thread then joins every reader thread, rethrows any throwable
 * recorded by the thread group, and writes the partition keys to
 * {@code partFile}. The partition file stream is always closed, including
 * when sampling fails. If the calling thread is interrupted while joining,
 * the interrupt status is restored before this method returns.
 *
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
  long t1 = System.currentTimeMillis();
  Configuration conf = job.getConfiguration();
  final TeraInputFormat inFormat = new TeraInputFormat();
  final TextSampler sampler = new TextSampler();
  int partitions = job.getNumReduceTasks();
  long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(), TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
  final List<InputSplit> splits = inFormat.getSplits(job);
  long t2 = System.currentTimeMillis();
  System.out.println("Computing input splits took " + (t2 - t1) + "ms");
  int samples = Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), splits.size());
  System.out.println("Sampling " + samples + " splits of " + splits.size());
  final long recordsPerSample = sampleSize / samples;
  final int sampleStep = splits.size() / samples;
  Thread[] samplerReader = new Thread[samples];
  SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    final int idx = i;
    samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
      {
        setDaemon(true);
      }

      @Override
      public void run() {
        long records = 0;
        try {
          TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
          InputSplit inputSplit = splits.get(sampleStep * idx);
          RecordReader<Text, Text> reader = inFormat.createRecordReader(inputSplit, context);
          reader.initialize(inputSplit, context);
          // Close only after a successful initialize; a reader whose
          // initialize failed is left alone, exactly as before.
          try {
            while (reader.nextKeyValue()) {
              sampler.addKey(new Text(reader.getCurrentKey()));
              records += 1;
              if (recordsPerSample <= records) {
                break;
              }
            }
          } finally {
            reader.close();
          }
        } catch (IOException ie) {
          System.err.println("Got an exception while reading splits " + StringUtils.stringifyException(ie));
          throw new RuntimeException(ie);
        } catch (InterruptedException e) {
          // Stop sampling quietly, but keep the interrupt visible on this thread.
          Thread.currentThread().interrupt();
        }
      }
    };
    samplerReader[i].start();
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize(partFile));
  // Remember an interrupt seen while joining; it is re-asserted only after
  // the stream has been closed so that the write/close is not disturbed.
  boolean interrupted = false;
  try {
    for (int i = 0; i < samples; i++) {
      try {
        samplerReader[i].join();
        if (threadGroup.getThrowable() != null) {
          throw threadGroup.getThrowable();
        }
      } catch (InterruptedException e) {
        interrupted = true;
      }
    }
    for (Text split : sampler.createPartitions(partitions)) {
      split.write(writer);
    }
  } finally {
    try {
      // Release the stream on both the success and the failure path.
      writer.close();
    } finally {
      if (interrupted) {
        Thread.currentThread().interrupt();
      }
    }
  }
  long t3 = System.currentTimeMillis();
  System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project hadoop by apache.
the class TestLineRecordReader method testMultipleClose.
/**
 * Reads a bzip2-compressed test file to the end, closes the reader twice,
 * and then asserts that ten decompressors obtained from {@link CodecPool}
 * form a set of size ten.
 *
 * <p>NOTE(review): presumably this guards against a double close handing the
 * same decompressor back to the pool twice - confirm against CodecPool.
 */
@Test
public void testMultipleClose() throws IOException {
  URL resource = getClass().getClassLoader().getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", resource);
  File compressedFile = new File(resource.getFile());
  Path compressedPath = new Path(compressedFile.getAbsolutePath());
  long compressedLength = compressedFile.length();

  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, new TaskAttemptID());

  // A single split covering the whole file.
  LineRecordReader lineReader = new LineRecordReader();
  lineReader.initialize(new FileSplit(compressedPath, 0, compressedLength, null), attemptContext);
  while (lineReader.nextKeyValue()) {
    // Drain every record; the contents are irrelevant here.
  }

  // The second close is the behaviour under test.
  lineReader.close();
  lineReader.close();

  BZip2Codec bzip2 = new BZip2Codec();
  bzip2.setConf(conf);
  Set<Decompressor> borrowed = new HashSet<Decompressor>();
  int remaining = 10;
  while (remaining-- > 0) {
    borrowed.add(CodecPool.getDecompressor(bzip2));
  }
  assertEquals(10, borrowed.size());
}
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project hadoop by apache.
the class TestLineRecordReader method testUncompressedInputDefaultDelimiterPosValue.
/**
 * Checks the key (byte position) and value length reported by a
 * {@link LineRecordReader} created with a {@code null} delimiter, for
 * uncompressed input containing "\r\n" and "\r" line endings, including the
 * position seen through the key after the reader reports no more records.
 *
 * <p>In each scenario the key and value objects are fetched once after the
 * first {@code nextKeyValue()} call and the later assertions read through
 * those same references; they are deliberately not re-fetched.
 */
@Test
public void testUncompressedInputDefaultDelimiterPosValue() throws Exception {
  Configuration config = new Configuration();
  Path crlfFile = createInputFile(config, "1234567890\r\n12\r\n345");
  config.setInt("io.file.buffer.size", 10);
  config.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  TaskAttemptContext taskContext = new TaskAttemptContextImpl(config, new TaskAttemptID());

  // Scenario 1: split starting at 0 with length 15.
  FileSplit headSplit = new FileSplit(crlfFile, 0, 15, (String[]) null);
  LineRecordReader headReader = new LineRecordReader(null);
  headReader.initialize(headSplit, taskContext);
  headReader.nextKeyValue();
  LongWritable position = headReader.getCurrentKey();
  Text line = headReader.getCurrentValue();
  // First record: "1234567890"
  assertEquals(10, line.getLength());
  assertEquals(0, position.get());
  headReader.nextKeyValue();
  // Second record: "12"
  assertEquals(2, line.getLength());
  // Position is 12, right after "1234567890\r\n"
  assertEquals(12, position.get());
  assertFalse(headReader.nextKeyValue());
  // Position is 16, right after "1234567890\r\n12\r\n"
  assertEquals(16, position.get());

  // Scenario 2: split starting at 15 with length 4 over the same file.
  FileSplit tailSplit = new FileSplit(crlfFile, 15, 4, (String[]) null);
  LineRecordReader tailReader = new LineRecordReader(null);
  tailReader.initialize(tailSplit, taskContext);
  // This split drops its leading record "\n"
  tailReader.nextKeyValue();
  position = tailReader.getCurrentKey();
  line = tailReader.getCurrentValue();
  // Third record: "345"
  assertEquals(3, line.getLength());
  // Position is 16, right after "1234567890\r\n12\r\n"
  assertEquals(16, position.get());
  assertFalse(tailReader.nextKeyValue());
  // Position is 19, right after "1234567890\r\n12\r\n345"
  assertEquals(19, position.get());

  // Scenario 3: a lone "\r" immediately followed by "\r\n".
  Path doubleCrFile = createInputFile(config, "123456789\r\r\n");
  FileSplit wholeSplit = new FileSplit(doubleCrFile, 0, 12, (String[]) null);
  LineRecordReader wholeReader = new LineRecordReader(null);
  wholeReader.initialize(wholeSplit, taskContext);
  wholeReader.nextKeyValue();
  position = wholeReader.getCurrentKey();
  line = wholeReader.getCurrentValue();
  // First record: "123456789"
  assertEquals(9, line.getLength());
  assertEquals(0, position.get());
  wholeReader.nextKeyValue();
  // Second record: ""
  assertEquals(0, line.getLength());
  // Position is 10, right after "123456789\r"
  assertEquals(10, position.get());
  assertFalse(wholeReader.nextKeyValue());
  // Position is 12, right after "123456789\r\r\n"
  assertEquals(12, position.get());
}
use of org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl in project hadoop by apache.
the class TestLineRecordReader method readRecords.
/**
 * Uses a {@link LineRecordReader} to read all records from the file,
 * processing it as consecutive splits of {@code splitSize} bytes that start
 * at offset 0.
 *
 * <p>A fresh reader is created for every split and closed once that split has
 * been consumed, so no reader stays open across loop iterations.
 *
 * @param testFileUrl location of the file to read
 * @param splitSize length in bytes of each split; must be positive
 * @return the string form of every record value, in the order it was read
 * @throws IOException if reading the file or closing a reader fails
 * @throws IllegalArgumentException if {@code splitSize} is not positive
 */
public ArrayList<String> readRecords(URL testFileUrl, int splitSize) throws IOException {
  // A non-positive split size would never advance the offset past the end
  // of the file, so fail fast instead of looping forever.
  if (splitSize <= 0) {
    throw new IllegalArgumentException("splitSize must be positive: " + splitSize);
  }
  // Set up context
  File testFile = new File(testFileUrl.getFile());
  long testFileSize = testFile.length();
  Path testFilePath = new Path(testFile.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.setInt("io.file.buffer.size", 1);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  // Gather the records returned by the record reader
  ArrayList<String> records = new ArrayList<String>();
  long offset = 0;
  while (offset < testFileSize) {
    FileSplit split = new FileSplit(testFilePath, offset, splitSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    try {
      while (reader.nextKeyValue()) {
        records.add(reader.getCurrentValue().toString());
      }
    } finally {
      // Release the reader for this split before moving on to the next one.
      reader.close();
    }
    offset += splitSize;
  }
  return records;
}
Aggregations