Use of java.io.DataOutputStream in project hadoop by apache.
The class TeraInputFormat, method writePartitionFile:
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile)
    throws Throwable {
  long t1 = System.currentTimeMillis();
  Configuration conf = job.getConfiguration();
  final TeraInputFormat inFormat = new TeraInputFormat();
  final TextSampler sampler = new TextSampler();
  int partitions = job.getNumReduceTasks();
  long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
      TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
  final List<InputSplit> splits = inFormat.getSplits(job);
  long t2 = System.currentTimeMillis();
  System.out.println("Computing input splits took " + (t2 - t1) + "ms");
  int samples = Math.min(
      conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(),
          TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
      splits.size());
  System.out.println("Sampling " + samples + " splits of " + splits.size());
  final long recordsPerSample = sampleSize / samples;
  final int sampleStep = splits.size() / samples;
  Thread[] samplerReader = new Thread[samples];
  SamplerThreadGroup threadGroup =
      new SamplerThreadGroup("Sampler Reader Thread Group");
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    final int idx = i;
    samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
      {
        setDaemon(true);
      }
      public void run() {
        long records = 0;
        try {
          TaskAttemptContext context = new TaskAttemptContextImpl(
              job.getConfiguration(), new TaskAttemptID());
          RecordReader<Text, Text> reader =
              inFormat.createRecordReader(splits.get(sampleStep * idx),
                  context);
          reader.initialize(splits.get(sampleStep * idx), context);
          while (reader.nextKeyValue()) {
            sampler.addKey(new Text(reader.getCurrentKey()));
            records += 1;
            if (recordsPerSample <= records) {
              break;
            }
          }
        } catch (IOException ie) {
          System.err.println("Got an exception while reading splits " +
              StringUtils.stringifyException(ie));
          throw new RuntimeException(ie);
        } catch (InterruptedException e) {
          // an interrupt simply ends this sampling thread early
        }
      }
    };
    samplerReader[i].start();
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  DataOutputStream writer = outFs.create(partFile, true, 64 * 1024,
      (short) 10, outFs.getDefaultBlockSize(partFile));
  // wait for the samplers, surfacing any failure recorded by the group
  for (int i = 0; i < samples; i++) {
    try {
      samplerReader[i].join();
      if (threadGroup.getThrowable() != null) {
        throw threadGroup.getThrowable();
      }
    } catch (InterruptedException e) {
    }
  }
  // write the N-1 cut-point keys back to back; this is the partition file
  for (Text split : sampler.createPartitions(partitions)) {
    split.write(writer);
  }
  writer.close();
  long t3 = System.currentTimeMillis();
  System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
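A minimal sketch of the consuming side, assuming only the format written above: the partition file is N-1 Text keys serialized back to back with Text.write, so Text.readFields recovers them in order. The class and method names here are hypothetical, not TeraSort's own reader (which knows the expected key count from the job configuration rather than reading to EOF):

import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;

public class PartitionFileReader {
  // Read cut-point keys until EOF; the file carries no record count.
  public static List<Text> readCutPoints(Configuration conf, Path partFile)
      throws IOException {
    List<Text> cutPoints = new ArrayList<>();
    FileSystem fs = partFile.getFileSystem(conf);
    try (FSDataInputStream in = fs.open(partFile)) {
      while (true) {
        Text key = new Text();
        try {
          key.readFields(in);  // inverse of split.write(writer) above
        } catch (EOFException eof) {
          break;
        }
        cutPoints.add(key);
      }
    }
    return cutPoints;
  }
}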
Use of java.io.DataOutputStream in project hadoop by apache.
The class TestIndexCache, method testBadIndex:
public void testBadIndex() throws Exception {
  final int parts = 30;
  fs.delete(p, true);
  conf.setInt(TTConfig.TT_INDEX_CACHE, 1);
  IndexCache cache = new IndexCache(conf);
  Path f = new Path(p, "badindex");
  FSDataOutputStream out = fs.create(f, false);
  CheckedOutputStream iout = new CheckedOutputStream(out, new CRC32());
  DataOutputStream dout = new DataOutputStream(iout);
  for (int i = 0; i < parts; ++i) {
    for (int j = 0; j < MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH / 8; ++j) {
      if (0 == (i % 3)) {
        // only every third record passes through the checksummed stream,
        // so the CRC appended below will not match the file contents
        dout.writeLong(i);
      } else {
        out.writeLong(i);
      }
    }
  }
  out.writeLong(iout.getChecksum().getValue());
  dout.close();
  try {
    cache.getIndexInformation("badindex", 7, f,
        UserGroupInformation.getCurrentUser().getShortUserName());
    fail("Did not detect bad checksum");
  } catch (IOException e) {
    if (!(e.getCause() instanceof ChecksumException)) {
      throw e;
    }
  }
}
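The corruption works because CheckedOutputStream only folds bytes that pass through it into the CRC; writes that go straight to the underlying stream are invisible to the checksum. The same trailer pattern in isolation, JDK only (the file name and record count here are arbitrary):

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;
import java.util.zip.CheckedOutputStream;

public class ChecksumTrailerDemo {
  public static void main(String[] args) throws IOException {
    File f = File.createTempFile("records", ".bin");
    f.deleteOnExit();

    // Write records through the checksummed stream, then append the CRC
    // itself raw: the trailer must not be folded into its own checksum.
    try (FileOutputStream fos = new FileOutputStream(f)) {
      CheckedOutputStream cout = new CheckedOutputStream(fos, new CRC32());
      DataOutputStream dout = new DataOutputStream(cout);
      for (long i = 0; i < 10; ++i) {
        dout.writeLong(i);
      }
      dout.flush();
      new DataOutputStream(fos).writeLong(cout.getChecksum().getValue());
    }

    // Read the records back through a CheckedInputStream, then compare
    // the recomputed CRC against the stored trailer.
    try (FileInputStream fis = new FileInputStream(f)) {
      CheckedInputStream cin = new CheckedInputStream(fis, new CRC32());
      DataInputStream din = new DataInputStream(cin);
      for (long i = 0; i < 10; ++i) {
        din.readLong();
      }
      long recomputed = cin.getChecksum().getValue();
      long stored = new DataInputStream(fis).readLong();  // read raw
      System.out.println(stored == recomputed ? "checksum ok" : "corrupt");
    }
  }
}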
Use of java.io.DataOutputStream in project hadoop by apache.
The class TestJobInfo, method testJobInfo:
@Test(timeout = 5000)
public void testJobInfo() throws IOException {
  JobID jid = new JobID("001", 1);
  Text user = new Text("User");
  Path path = new Path("/tmp/test");
  JobInfo info = new JobInfo(jid, user, path);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  info.write(new DataOutputStream(out));
  JobInfo copyinfo = new JobInfo();
  copyinfo.readFields(new DataInputStream(
      new ByteArrayInputStream(out.toByteArray())));
  assertEquals(info.getJobID().toString(), copyinfo.getJobID().toString());
  assertEquals(info.getJobSubmitDir().getName(),
      copyinfo.getJobSubmitDir().getName());
  assertEquals(info.getUser().toString(), copyinfo.getUser().toString());
}
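Both this test and testJobProfile below follow the same write-to-bytes, read-back round trip, which works for any Hadoop Writable. A generic helper capturing the pattern (the class is ours, not part of Hadoop; the blank copy stands in for the no-arg construction Writables need for deserialization):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public final class WritableRoundTrip {
  // Serialize 'original' to a byte array and deserialize into 'blankCopy'.
  public static <T extends Writable> T roundTrip(T original, T blankCopy)
      throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));
    blankCopy.readFields(new DataInputStream(
        new ByteArrayInputStream(bytes.toByteArray())));
    return blankCopy;
  }
}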
Use of java.io.DataOutputStream in project hadoop by apache.
The class TestOldMethodsJobID, method testJobProfile:
/**
 * Test deprecated methods of JobProfile.
 * @throws IOException
 */
@SuppressWarnings("deprecation")
@Test(timeout = 5000)
public void testJobProfile() throws IOException {
  JobProfile profile = new JobProfile("user", "job_001_03", "jobFile",
      "uri", "name");
  // the job ID string is normalized: the sequence number is zero-padded
  assertEquals("job_001_0003", profile.getJobId());
  assertEquals("default", profile.getQueueName());
  // serialization test
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  profile.write(new DataOutputStream(out));
  JobProfile profile2 = new JobProfile();
  profile2.readFields(new DataInputStream(
      new ByteArrayInputStream(out.toByteArray())));
  assertEquals(profile2.name, profile.name);
  assertEquals(profile2.jobFile, profile.jobFile);
  assertEquals(profile2.queueName, profile.queueName);
  assertEquals(profile2.url, profile.url);
  assertEquals(profile2.user, profile.user);
}
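The first assertion holds because the deprecated String constructor parses the ID through JobID, whose string form pads the sequence number to at least four digits. A small illustration, assuming org.apache.hadoop.mapreduce.JobID.forName for the parse:

import org.apache.hadoop.mapreduce.JobID;

public class JobIdNormalization {
  public static void main(String[] args) {
    // forName parses "job_<identifier>_<sequence>"; toString re-renders
    // it with the sequence zero-padded, hence job_001_0003
    JobID jid = JobID.forName("job_001_03");
    System.out.println(jid);
  }
}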
Use of java.io.DataOutputStream in project hadoop by apache.
The class TestMultiFileSplit, method testReadWrite:
@Test
public void testReadWrite() throws Exception {
  MultiFileSplit split = new MultiFileSplit(new JobConf(),
      new Path[] { new Path("/test/path/1"), new Path("/test/path/2") },
      new long[] { 100, 200 });
  ByteArrayOutputStream bos = null;
  byte[] result = null;
  try {
    bos = new ByteArrayOutputStream();
    split.write(new DataOutputStream(bos));
    result = bos.toByteArray();
  } finally {
    IOUtils.closeStream(bos);
  }
  MultiFileSplit readSplit = new MultiFileSplit();
  ByteArrayInputStream bis = null;
  try {
    bis = new ByteArrayInputStream(result);
    readSplit.readFields(new DataInputStream(bis));
  } finally {
    IOUtils.closeStream(bis);
  }
  assertTrue(split.getLength() != 0);
  assertEquals(split.getLength(), readSplit.getLength());
  assertTrue(Arrays.equals(split.getPaths(), readSplit.getPaths()));
  assertTrue(Arrays.equals(split.getLengths(), readSplit.getLengths()));
  System.out.println(split.toString());
}
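The null-then-IOUtils.closeStream bookkeeping predates try-with-resources; on Java 7+ the same round trip reads more compactly without changing what is exercised. A sketch under that assumption, not the test as it exists in Hadoop:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.mapred.MultiFileSplit;

public class MultiFileSplitRoundTrip {
  // Serialize a split and read it back into a fresh instance.
  public static MultiFileSplit roundTrip(MultiFileSplit split)
      throws IOException {
    byte[] bytes;
    try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
      split.write(new DataOutputStream(bos));
      bytes = bos.toByteArray();
    }
    MultiFileSplit copy = new MultiFileSplit();
    try (DataInputStream dis = new DataInputStream(
        new ByteArrayInputStream(bytes))) {
      copy.readFields(dis);
    }
    return copy;
  }
}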