Example 1 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

From class CompositeInputSplit, method readFields:

/**
   * {@inheritDoc}
   * @throws IOException If the child InputSplit cannot be read, typically
   *                     for failing access checks.
   */
// Generic array assignment
@SuppressWarnings("unchecked")
public void readFields(DataInput in) throws IOException {
    // Number of child splits recorded by write()
    int card = WritableUtils.readVInt(in);
    if (splits == null || splits.length != card) {
        splits = new InputSplit[card];
    }
    Class<? extends InputSplit>[] cls = new Class[card];
    try {
        // First pass: recover the concrete class of every child split
        for (int i = 0; i < card; ++i) {
            cls[i] = Class.forName(Text.readString(in)).asSubclass(InputSplit.class);
        }
        // Second pass: deserialize each child using the serialization
        // configured for its class
        for (int i = 0; i < card; ++i) {
            splits[i] = ReflectionUtils.newInstance(cls[i], null);
            SerializationFactory factory = new SerializationFactory(conf);
            Deserializer deserializer = factory.getDeserializer(cls[i]);
            deserializer.open((DataInputStream) in);
            splits[i] = (InputSplit) deserializer.deserialize(splits[i]);
        }
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed split init", e);
    }
}
Also used: IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
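
The readFields above leans on Hadoop's SerializationFactory, which resolves to WritableSerialization for Writable split classes. Below is a minimal, self-contained sketch of the same serialize/deserialize mechanism in isolation; the FileSplit path and sizes are illustrative, not taken from the example:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitSerializationSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        SerializationFactory factory = new SerializationFactory(conf);

        // Illustrative split; any Writable InputSplit subclass works the same way
        FileSplit original = new FileSplit(new Path("/tmp/part-00000"), 0L, 1024L, new String[0]);

        // Serialize, mirroring what CompositeInputSplit.write does per child
        DataOutputBuffer out = new DataOutputBuffer();
        Serializer<FileSplit> serializer = factory.getSerializer(FileSplit.class);
        serializer.open(out);
        serializer.serialize(original);

        // Deserialize, mirroring readFields; passing null lets the
        // deserializer create a fresh instance
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        Deserializer<FileSplit> deserializer = factory.getDeserializer(FileSplit.class);
        deserializer.open(in);
        FileSplit copy = deserializer.deserialize(null);
        System.out.println(copy.getPath() + " len=" + copy.getLength());
    }
}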

Example 2 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

From class CompositeInputSplit, method write:

/**
   * Write splits in the following format.
   * {@code
   * <count><class1><class2>...<classn><split1><split2>...<splitn>
   * }
   */
@SuppressWarnings("unchecked")
public void write(DataOutput out) throws IOException {
    // Layout matches the javadoc: count, then all class names, then all bodies
    WritableUtils.writeVInt(out, splits.length);
    for (InputSplit s : splits) {
        Text.writeString(out, s.getClass().getName());
    }
    for (InputSplit s : splits) {
        SerializationFactory factory = new SerializationFactory(conf);
        Serializer serializer = factory.getSerializer(s.getClass());
        serializer.open((DataOutputStream) out);
        serializer.serialize(s);
    }
}
Also used: InputSplit (org.apache.hadoop.mapreduce.InputSplit)
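
Examples 1 and 2 are inverses, so a buffer round trip exercises both. This is a sketch under the assumption that org.apache.hadoop.mapreduce.lib.join.CompositeInputSplit exposes the capacity constructor, add(InputSplit), and get(int) found in the Hadoop join package; DataOutputBuffer and DataInputBuffer are used because write and readFields cast their arguments to stream types:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.join.CompositeInputSplit;

public class CompositeSplitRoundTrip {
    public static void main(String[] args) throws Exception {
        CompositeInputSplit split = new CompositeInputSplit(2);
        split.add(new FileSplit(new Path("/data/a"), 0L, 100L, new String[0]));
        split.add(new FileSplit(new Path("/data/b"), 0L, 200L, new String[0]));

        // DataOutputBuffer extends DataOutputStream, which write() casts to
        DataOutputBuffer out = new DataOutputBuffer();
        split.write(out);

        // DataInputBuffer extends DataInputStream, which readFields casts to
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        CompositeInputSplit copy = new CompositeInputSplit(2);
        copy.readFields(in);
        System.out.println("child 0 after round trip: " + copy.get(0));
    }
}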

Example 3 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

From class TeraInputFormat, method writePartitionFile:

/**
   * Use the input splits to take samples of the input and generate sample
   * keys. By default reads 100,000 keys from 10 locations in the input, sorts
   * them and picks N-1 keys to generate N equally sized partitions.
   * @param job the job to sample
   * @param partFile where to write the output file to
   * @throws Throwable if something goes wrong
   */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(), TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {

            {
                // Instance initializer: daemon sampler threads won't block JVM exit
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx), context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println("Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                    // Ignored: an interrupted sampler just contributes fewer keys
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
            // Ignored: continue joining the remaining sampler threads
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), DataOutputStream (java.io.DataOutputStream), RecordReader (org.apache.hadoop.mapreduce.RecordReader), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), FileSystem (org.apache.hadoop.fs.FileSystem), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
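
The javadoc's last step, sorting the sampled keys and picking N-1 of them, happens inside TextSampler.createPartitions, which is not shown here. A minimal sketch of that evenly spaced selection follows; the class and method names are ours, not from TeraInputFormat:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.io.Text;

public class CutPointSketch {
    /** Pick partitions-1 evenly spaced keys from a sorted copy of the sample. */
    static List<Text> pickCutPoints(List<Text> sampledKeys, int partitions) {
        List<Text> sorted = new ArrayList<>(sampledKeys);
        Collections.sort(sorted);
        List<Text> cuts = new ArrayList<>(partitions - 1);
        for (int i = 1; i < partitions; i++) {
            // The i-th cut closes the i-th of partitions equal-sized key ranges
            cuts.add(sorted.get((int) ((long) i * sorted.size() / partitions)));
        }
        return cuts;
    }
}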

Example 4 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

From class TestDbClasses, method testDataDrivenDBInputFormat:

@Test(timeout = 10000)
public void testDataDrivenDBInputFormat() throws Exception {
    JobContext jobContext = mock(JobContext.class);
    Configuration configuration = new Configuration();
    configuration.setInt(MRJobConfig.NUM_MAPS, 1);
    when(jobContext.getConfiguration()).thenReturn(configuration);
    DataDrivenDBInputFormat<NullDBWritable> format = new DataDrivenDBInputFormat<NullDBWritable>();
    List<InputSplit> splits = format.getSplits(jobContext);
    assertEquals(1, splits.size());
    DataDrivenDBInputSplit split = (DataDrivenDBInputSplit) splits.get(0);
    assertEquals("1=1", split.getLowerClause());
    assertEquals("1=1", split.getUpperClause());
    // Repeat with two map tasks and an explicit bounding query
    configuration.setInt(MRJobConfig.NUM_MAPS, 2);
    DataDrivenDBInputFormat.setBoundingQuery(configuration, "query");
    assertEquals("query", configuration.get(DBConfiguration.INPUT_BOUNDING_QUERY));
    Job job = mock(Job.class);
    when(job.getConfiguration()).thenReturn(configuration);
    DataDrivenDBInputFormat.setInput(job, NullDBWritable.class, "query", "Bounding Query");
    assertEquals("Bounding Query", configuration.get(DBConfiguration.INPUT_BOUNDING_QUERY));
}
Also used: DataDrivenDBInputSplit (org.apache.hadoop.mapreduce.lib.db.DataDrivenDBInputFormat.DataDrivenDBInputSplit), Configuration (org.apache.hadoop.conf.Configuration), NullDBWritable (org.apache.hadoop.mapreduce.lib.db.DBInputFormat.NullDBWritable), JobContext (org.apache.hadoop.mapreduce.JobContext), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), DBInputSplit (org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit), Test (org.junit.Test)
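
The test shows that with a single map task, DataDrivenDBInputFormat produces one split whose lower and upper bounding clauses are both the no-op predicate 1=1. Conceptually, each split's two clauses bracket the rows one mapper reads; the following sketch of that assembly is illustrative, not the format's actual code:

public class SplitQuerySketch {
    // Illustrative only: how a split's two bounding clauses narrow the
    // base query down to one mapper's share of the rows.
    static String buildSplitQuery(String baseQuery, String lowerClause, String upperClause) {
        return baseQuery + " WHERE (" + lowerClause + ") AND (" + upperClause + ")";
    }

    public static void main(String[] args) {
        // With a single split, both clauses are "1=1" and every row qualifies
        System.out.println(buildSplitQuery("SELECT id, name FROM users", "1=1", "1=1"));
    }
}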

Example 5 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

From class TestSplitters, method testBooleanSplitter:

@Test(timeout = 2000)
public void testBooleanSplitter() throws Exception {
    BooleanSplitter splitter = new BooleanSplitter();
    ResultSet result = mock(ResultSet.class);
    when(result.getString(1)).thenReturn("result1");
    List<InputSplit> splits = splitter.split(configuration, result, "column");
    assertSplits(new String[] { "column = FALSE column = FALSE", "column IS NULL column IS NULL" }, splits);
    when(result.getString(1)).thenReturn("result1");
    when(result.getString(2)).thenReturn("result2");
    when(result.getBoolean(1)).thenReturn(true);
    when(result.getBoolean(2)).thenReturn(false);
    splits = splitter.split(configuration, result, "column");
    assertEquals(0, splits.size());
    when(result.getString(1)).thenReturn("result1");
    when(result.getString(2)).thenReturn("result2");
    when(result.getBoolean(1)).thenReturn(false);
    when(result.getBoolean(2)).thenReturn(true);
    splits = splitter.split(configuration, result, "column");
    assertSplits(new String[] { "column = FALSE column = FALSE", ".*column = TRUE" }, splits);
}
Also used: ResultSet (java.sql.ResultSet), InputSplit (org.apache.hadoop.mapreduce.InputSplit), DataDrivenDBInputSplit (org.apache.hadoop.mapreduce.lib.db.DataDrivenDBInputFormat.DataDrivenDBInputSplit), Test (org.junit.Test)
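
A boolean column admits at most three groups of rows, which is why BooleanSplitter can only emit FALSE, TRUE, and IS NULL conditions. The following is a loose sketch of that case analysis, not BooleanSplitter's actual code; the class, method, and parameter names are ours:

import java.util.ArrayList;
import java.util.List;

public class BooleanClausesSketch {
    // The sampled min/max of the boolean column plus its nullability
    // decide which of the three possible clauses appear.
    static List<String> booleanSplitClauses(String col, boolean min, boolean max, boolean nullable) {
        List<String> clauses = new ArrayList<>();
        if (!min) clauses.add(col + " = FALSE");
        if (max) clauses.add(col + " = TRUE");
        if (nullable) clauses.add(col + " IS NULL");
        return clauses;
    }
}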

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160 usages
Configuration (org.apache.hadoop.conf.Configuration): 70 usages
Test (org.junit.Test): 68 usages
ArrayList (java.util.ArrayList): 51 usages
Path (org.apache.hadoop.fs.Path): 43 usages
Job (org.apache.hadoop.mapreduce.Job): 42 usages
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38 usages
IOException (java.io.IOException): 33 usages
JobContext (org.apache.hadoop.mapreduce.JobContext): 20 usages
LongWritable (org.apache.hadoop.io.LongWritable): 19 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 16 usages
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14 usages
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13 usages
List (java.util.List): 13 usages
Text (org.apache.hadoop.io.Text): 13 usages
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13 usages
DBObject (com.mongodb.DBObject): 10 usages
File (java.io.File): 10 usages
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10 usages
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9 usages