
Example 56 with DataOutputStream

use of java.io.DataOutputStream in project hadoop by apache.

the class TeraInputFormat method writePartitionFile.

/**
   * Use the input splits to take samples of the input and generate sample
   * keys. By default reads 100,000 keys from 10 locations in the input, sorts
   * them and picks N-1 keys to generate N equally sized partitions.
   * @param job the job to sample
   * @param partFile where to write the output file to
   * @throws Throwable if something goes wrong
   */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(), TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {

            {
                // Daemon threads will not keep the JVM alive if a sampler hangs.
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx), context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println("Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                    // Sampling is best effort: an interrupted reader simply
                    // contributes fewer keys.
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
            // Ignore the interrupted join and move on to the next reader.
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), DataOutputStream (java.io.DataOutputStream), RecordReader (org.apache.hadoop.mapreduce.RecordReader), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), FileSystem (org.apache.hadoop.fs.FileSystem), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
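
The partition file is just a sequence of Text keys serialized back to back: FileSystem.create returns an FSDataOutputStream (a DataOutputStream subclass), and each Text.write call emits a vint-encoded length followed by the key's UTF-8 bytes. Below is a minimal, self-contained sketch of that write/read round trip; the file name and keys are invented for illustration, and this is my sketch rather than Hadoop code.

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.Text;

public class PartitionFileRoundTrip {

    public static void main(String[] args) throws IOException {
        // Hypothetical local file standing in for the partition file.
        String partFile = "_partition.lst";

        // Write three sample keys the same way writePartitionFile does:
        // Text.write(DataOutput) emits a vint length plus UTF-8 bytes.
        try (DataOutputStream writer =
                 new DataOutputStream(new FileOutputStream(partFile))) {
            for (String key : new String[] { "apple", "mango", "zebra" }) {
                new Text(key).write(writer);
            }
        }

        // Read the keys back; each readFields() consumes exactly one record.
        try (DataInputStream reader =
                 new DataInputStream(new FileInputStream(partFile))) {
            Text key = new Text();
            for (int i = 0; i < 3; i++) {
                key.readFields(reader);
                System.out.println(key);
            }
        }
    }
}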

Example 57 with DataOutputStream

use of java.io.DataOutputStream in project hadoop by apache.

the class TestIndexCache method testBadIndex.

public void testBadIndex() throws Exception {
    final int parts = 30;
    fs.delete(p, true);
    conf.setInt(TTConfig.TT_INDEX_CACHE, 1);
    IndexCache cache = new IndexCache(conf);
    Path f = new Path(p, "badindex");
    FSDataOutputStream out = fs.create(f, false);
    CheckedOutputStream iout = new CheckedOutputStream(out, new CRC32());
    DataOutputStream dout = new DataOutputStream(iout);
    // Route every third record through the checksummed stream and the rest
    // directly to the raw stream, so the CRC32 trailer written below does
    // not match the file contents.
    for (int i = 0; i < parts; ++i) {
        for (int j = 0; j < MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH / 8; ++j) {
            if (0 == (i % 3)) {
                dout.writeLong(i);
            } else {
                out.writeLong(i);
            }
        }
    }
    // The checksum only reflects the records that went through dout.
    out.writeLong(iout.getChecksum().getValue());
    dout.close();
    try {
        cache.getIndexInformation("badindex", 7, f, UserGroupInformation.getCurrentUser().getShortUserName());
        fail("Did not detect bad checksum");
    } catch (IOException e) {
        if (!(e.getCause() instanceof ChecksumException)) {
            throw e;
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), CRC32 (java.util.zip.CRC32), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), DataOutputStream (java.io.DataOutputStream), ChecksumException (org.apache.hadoop.fs.ChecksumException), CheckedOutputStream (java.util.zip.CheckedOutputStream), IOException (java.io.IOException)
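
The corruption works because only the bytes routed through dout update the CRC32; the longs written straight to out never reach the Checksum, so the trailer cannot match and IndexCache surfaces a ChecksumException. Here is a self-contained sketch of the same CheckedOutputStream/CheckedInputStream framing, independent of Hadoop; the class and variable names are mine.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;
import java.util.zip.CheckedOutputStream;

public class CrcFramedRecords {

    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();

        // Every byte written through dout also updates the CRC32.
        CheckedOutputStream cout = new CheckedOutputStream(bytes, new CRC32());
        DataOutputStream dout = new DataOutputStream(cout);
        for (long i = 0; i < 10; i++) {
            dout.writeLong(i);
        }
        // Write the trailer to the raw stream so the checksum does not
        // cover itself (the same trick TestIndexCache plays with out).
        long expected = cout.getChecksum().getValue();
        new DataOutputStream(bytes).writeLong(expected);
        dout.close();

        // On the read side, checksum the payload but not the trailer.
        byte[] data = bytes.toByteArray();
        CheckedInputStream cin = new CheckedInputStream(
            new ByteArrayInputStream(data, 0, data.length - 8), new CRC32());
        DataInputStream din = new DataInputStream(cin);
        for (long i = 0; i < 10; i++) {
            din.readLong();
        }
        long stored = new DataInputStream(new ByteArrayInputStream(
            data, data.length - 8, 8)).readLong();
        System.out.println(
            cin.getChecksum().getValue() == stored ? "checksum OK" : "corrupt");
    }
}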

Example 58 with DataOutputStream

use of java.io.DataOutputStream in project hadoop by apache.

the class TestJobInfo method testJobInfo.

@Test(timeout = 5000)
public void testJobInfo() throws IOException {
    JobID jid = new JobID("001", 1);
    Text user = new Text("User");
    Path path = new Path("/tmp/test");
    JobInfo info = new JobInfo(jid, user, path);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    info.write(new DataOutputStream(out));
    JobInfo copyinfo = new JobInfo();
    copyinfo.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray())));
    assertEquals(info.getJobID().toString(), copyinfo.getJobID().toString());
    assertEquals(info.getJobSubmitDir().getName(), copyinfo.getJobSubmitDir().getName());
    assertEquals(info.getUser().toString(), copyinfo.getUser().toString());
}
Also used: Path (org.apache.hadoop.fs.Path), ByteArrayInputStream (java.io.ByteArrayInputStream), DataOutputStream (java.io.DataOutputStream), Text (org.apache.hadoop.io.Text), ByteArrayOutputStream (java.io.ByteArrayOutputStream), DataInputStream (java.io.DataInputStream), JobID (org.apache.hadoop.mapreduce.JobID), Test (org.junit.Test)
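
This write-then-readFields round trip through byte-array streams is the standard way to exercise any Hadoop Writable, so it is worth factoring out. The helper below is my own sketch; the class and method names are not a Hadoop utility.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public final class WritableRoundTrip {

    // Serialize src with write(), then rebuild dst from the same bytes.
    public static <T extends Writable> T roundTrip(T src, T dst)
            throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        src.write(new DataOutputStream(buffer));
        dst.readFields(new DataInputStream(
            new ByteArrayInputStream(buffer.toByteArray())));
        return dst;
    }

    public static void main(String[] args) throws IOException {
        Text copy = roundTrip(new Text("hello"), new Text());
        System.out.println(copy); // prints: hello
    }
}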

Example 59 with DataOutputStream

use of java.io.DataOutputStream in project hadoop by apache.

the class TestOldMethodsJobID method testJobProfile.

/**
   * Test deprecated methods of JobProfile.
   * @throws IOException
   */
@SuppressWarnings("deprecation")
@Test(timeout = 5000)
public void testJobProfile() throws IOException {
    JobProfile profile = new JobProfile("user", "job_001_03", "jobFile", "uri", "name");
    assertEquals("job_001_0003", profile.getJobId());
    assertEquals("default", profile.getQueueName());
    // serialization test
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    profile.write(new DataOutputStream(out));
    JobProfile profile2 = new JobProfile();
    profile2.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray())));
    assertEquals(profile2.name, profile.name);
    assertEquals(profile2.jobFile, profile.jobFile);
    assertEquals(profile2.queueName, profile.queueName);
    assertEquals(profile2.url, profile.url);
    assertEquals(profile2.user, profile.user);
}
Also used: ByteArrayInputStream (java.io.ByteArrayInputStream), DataOutputStream (java.io.DataOutputStream), ByteArrayOutputStream (java.io.ByteArrayOutputStream), DataInputStream (java.io.DataInputStream), Test (org.junit.Test)

Example 60 with DataOutputStream

use of java.io.DataOutputStream in project hadoop by apache.

the class TestMultiFileSplit method testReadWrite.

@Test
public void testReadWrite() throws Exception {
    MultiFileSplit split = new MultiFileSplit(new JobConf(), new Path[] { new Path("/test/path/1"), new Path("/test/path/2") }, new long[] { 100, 200 });
    ByteArrayOutputStream bos = null;
    byte[] result = null;
    try {
        bos = new ByteArrayOutputStream();
        split.write(new DataOutputStream(bos));
        result = bos.toByteArray();
    } finally {
        IOUtils.closeStream(bos);
    }
    MultiFileSplit readSplit = new MultiFileSplit();
    ByteArrayInputStream bis = null;
    try {
        bis = new ByteArrayInputStream(result);
        readSplit.readFields(new DataInputStream(bis));
    } finally {
        IOUtils.closeStream(bis);
    }
    assertTrue(split.getLength() != 0);
    assertEquals(split.getLength(), readSplit.getLength());
    assertTrue(Arrays.equals(split.getPaths(), readSplit.getPaths()));
    assertTrue(Arrays.equals(split.getLengths(), readSplit.getLengths()));
    System.out.println(split.toString());
}
Also used: Path (org.apache.hadoop.fs.Path), ByteArrayInputStream (java.io.ByteArrayInputStream), DataOutputStream (java.io.DataOutputStream), ByteArrayOutputStream (java.io.ByteArrayOutputStream), DataInputStream (java.io.DataInputStream), Test (org.junit.Test)
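
The manual IOUtils.closeStream calls predate try-with-resources (and closing a ByteArrayOutputStream is a no-op anyway), so the same serialize-then-deserialize shape can be written more compactly today. The framing below is invented for illustration and is not MultiFileSplit's actual wire format.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class SplitBytesRoundTrip {

    public static void main(String[] args) throws IOException {
        byte[] result;
        // try-with-resources replaces the manual IOUtils.closeStream calls.
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
             DataOutputStream dos = new DataOutputStream(bos)) {
            dos.writeInt(2);              // number of paths (illustrative)
            dos.writeUTF("/test/path/1");
            dos.writeUTF("/test/path/2");
            dos.flush();
            result = bos.toByteArray();
        }
        try (DataInputStream dis = new DataInputStream(
                 new ByteArrayInputStream(result))) {
            int n = dis.readInt();
            for (int i = 0; i < n; i++) {
                System.out.println(dis.readUTF());
            }
        }
    }
}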

Aggregations

DataOutputStream (java.io.DataOutputStream): 2968 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1314 usages
IOException (java.io.IOException): 1024 usages
Test (org.junit.Test): 633 usages
DataInputStream (java.io.DataInputStream): 615 usages
FileOutputStream (java.io.FileOutputStream): 427 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 411 usages
File (java.io.File): 281 usages
BufferedOutputStream (java.io.BufferedOutputStream): 228 usages
UnitTest (org.apache.geode.test.junit.categories.UnitTest): 172 usages
URL (java.net.URL): 149 usages
InputStreamReader (java.io.InputStreamReader): 146 usages
BufferedReader (java.io.BufferedReader): 142 usages
Path (org.apache.hadoop.fs.Path): 137 usages
DataInput (java.io.DataInput): 124 usages
ArrayList (java.util.ArrayList): 122 usages
HttpURLConnection (java.net.HttpURLConnection): 120 usages
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 117 usages
FileInputStream (java.io.FileInputStream): 107 usages
InputStream (java.io.InputStream): 107 usages