Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class TestFixedLengthInputFormat, method testZeroRecordLength.
/**
 * Test with record length set to 0
 */
@Test(timeout = 5000)
public void testZeroRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  Job job = Job.getInstance(defaultConf);
  // Set the fixed length record length config property
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 0);
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context =
          MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable> mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable, BytesWritable>(
              job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
              MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch (IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for zero record length:", exceptionThrown);
}
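The test above only verifies that a record length of 0 is rejected when the reader is initialized. For contrast, here is a minimal sketch of how FixedLengthInputFormat would normally be configured with a positive record length before job submission; the job name, input path, and 20-byte record width are assumptions for illustration, while setRecordLength is the same helper the test calls with 0.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;

public class FixedLengthJobSetup {
  // Configure a job to read 20-byte fixed-width records; the length must be
  // positive, otherwise initialization fails as the test above demonstrates.
  public static Job configure(Configuration conf, Path input) throws Exception {
    Job job = Job.getInstance(conf, "fixed-length-read");   // hypothetical job name
    FixedLengthInputFormat.setRecordLength(job.getConfiguration(), 20);
    job.setInputFormatClass(FixedLengthInputFormat.class);
    FileInputFormat.setInputPaths(job, input);
    // Keys arrive as LongWritable byte offsets; values are BytesWritable
    // slices of exactly the configured record length.
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    return job;
  }
}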
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class TestNLineInputFormat, method checkFormat.
void checkFormat(Job job, int expectedN, int lastN)
    throws IOException, InterruptedException {
  NLineInputFormat format = new NLineInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  int count = 0;
  for (int i = 0; i < splits.size(); i++) {
    assertEquals("There are no split locations", 0,
        splits.get(i).getLocations().length);
    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    RecordReader<LongWritable, Text> reader =
        format.createRecordReader(splits.get(i), context);
    Class<?> clazz = reader.getClass();
    assertEquals("reader class is LineRecordReader.",
        LineRecordReader.class, clazz);
    MapContext<LongWritable, Text, LongWritable, Text> mcontext =
        new MapContextImpl<LongWritable, Text, LongWritable, Text>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), splits.get(i));
    reader.initialize(splits.get(i), mcontext);
    try {
      count = 0;
      while (reader.nextKeyValue()) {
        count++;
      }
    } finally {
      reader.close();
    }
    if (i == splits.size() - 1) {
      assertEquals("number of lines in split(" + i + ") is wrong", lastN, count);
    } else {
      assertEquals("number of lines in split(" + i + ") is wrong", expectedN, count);
    }
  }
}
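checkFormat counts the records delivered by each split and allows the last split to be shorter. The lines-per-split value itself is set on the job before getSplits is called; a minimal sketch, assuming the static NLineInputFormat.setNumLinesPerSplit helper and hypothetical job name and input path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class NLineJobSetup {
  // Request splits of (at most) 5 input lines each; the final split may hold
  // fewer lines, which is why checkFormat above takes a separate lastN argument.
  public static Job configure(Configuration conf, Path input) throws Exception {
    Job job = Job.getInstance(conf, "nline-read");   // hypothetical job name
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setNumLinesPerSplit(job, 5);
    FileInputFormat.setInputPaths(job, input);
    return job;
  }
}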
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class DateSplitter, method split.
public List<InputSplit> split(Configuration conf, ResultSet results, String colName)
    throws SQLException {
  long minVal;
  long maxVal;
  int sqlDataType = results.getMetaData().getColumnType(1);
  minVal = resultSetColToLong(results, 1, sqlDataType);
  maxVal = resultSetColToLong(results, 2, sqlDataType);
  String lowClausePrefix = colName + " >= ";
  String highClausePrefix = colName + " < ";
  int numSplits = conf.getInt(MRJobConfig.NUM_MAPS, 1);
  if (numSplits < 1) {
    numSplits = 1;
  }
  if (minVal == Long.MIN_VALUE && maxVal == Long.MIN_VALUE) {
    // The range of acceptable dates is NULL to NULL. Just create a single split.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
        colName + " IS NULL", colName + " IS NULL"));
    return splits;
  }
  // Gather the split point integers
  List<Long> splitPoints = split(numSplits, minVal, maxVal);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  // Turn the split points into a set of intervals.
  long start = splitPoints.get(0);
  Date startDate = longToDate(start, sqlDataType);
  if (sqlDataType == Types.TIMESTAMP) {
    // The lower bound's nanos value needs to match the actual lower-bound nanos.
    try {
      ((java.sql.Timestamp) startDate).setNanos(results.getTimestamp(1).getNanos());
    } catch (NullPointerException npe) {
      // If the lower bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
    }
  }
  for (int i = 1; i < splitPoints.size(); i++) {
    long end = splitPoints.get(i);
    Date endDate = longToDate(end, sqlDataType);
    if (i == splitPoints.size() - 1) {
      if (sqlDataType == Types.TIMESTAMP) {
        // The upper bound's nanos value needs to match the actual upper-bound nanos.
        try {
          ((java.sql.Timestamp) endDate).setNanos(results.getTimestamp(2).getNanos());
        } catch (NullPointerException npe) {
          // If the upper bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
        }
      }
      // This is the last one; use a closed interval.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          lowClausePrefix + dateToString(startDate),
          colName + " <= " + dateToString(endDate)));
    } else {
      // Normal open-interval case.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          lowClausePrefix + dateToString(startDate),
          highClausePrefix + dateToString(endDate)));
    }
    start = end;
    startDate = endDate;
  }
  if (minVal == Long.MIN_VALUE || maxVal == Long.MIN_VALUE) {
    // Add an extra split to handle the null case that we saw.
    splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
        colName + " IS NULL", colName + " IS NULL"));
  }
  return splits;
}
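Each DataDrivenDBInputSplit produced above carries a lower-bound and an upper-bound SQL clause; at read time those clauses are combined into the WHERE condition of the per-split query. The following conceptual sketch shows that combination; the table, columns, and query template are hypothetical and not the exact SQL emitted by the DB record reader.

// Conceptual sketch: how the bound clauses created by DateSplitter.split()
// would be stitched into one query per split. Table and column names are
// hypothetical.
public class SplitQuerySketch {
  public static String buildQuery(String lowerClause, String upperClause) {
    // e.g. lowerClause = "last_update >= '2017-01-01 00:00:00.0'"
    //      upperClause = "last_update <  '2017-02-01 00:00:00.0'"
    return "SELECT id, payload FROM events WHERE "
        + "( " + lowerClause + " ) AND ( " + upperClause + " )";
  }

  public static void main(String[] args) {
    System.out.println(buildQuery(
        "last_update >= '2017-01-01 00:00:00.0'",
        "last_update < '2017-02-01 00:00:00.0'"));
  }
}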
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class CombineFileInputFormat, method getSplits.
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  long minSizeNode = 0;
  long minSizeRack = 0;
  long maxSize = 0;
  Configuration conf = job.getConfiguration();
  // Values set on this instance (minSplitSizeNode, minSplitSizeRack,
  // maxSplitSize) take precedence over values that might have been
  // specified in the config.
  if (minSplitSizeNode != 0) {
    minSizeNode = minSplitSizeNode;
  } else {
    minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
  }
  if (minSplitSizeRack != 0) {
    minSizeRack = minSplitSizeRack;
  } else {
    minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
  }
  if (maxSplitSize != 0) {
    maxSize = maxSplitSize;
  } else {
    maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    // If maxSize is not configured, a single split will be generated per
    // node.
  }
  if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
    throw new IOException("Minimum split size pernode " + minSizeNode +
        " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
    throw new IOException("Minimum split size per rack " + minSizeRack +
        " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && minSizeNode > minSizeRack) {
    throw new IOException("Minimum split size per node " + minSizeNode +
        " cannot be larger than minimum split " + "size per rack " + minSizeRack);
  }
  // all the files in input set
  List<FileStatus> stats = listStatus(job);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  if (stats.size() == 0) {
    return splits;
  }
  // Process one pool at a time, so that each split contains paths
  // from a single pool only.
  for (MultiPathFilter onepool : pools) {
    ArrayList<FileStatus> myPaths = new ArrayList<FileStatus>();
    // If a path is accepted by this pool's filter,
    // add it to the pool's output set.
    for (Iterator<FileStatus> iter = stats.iterator(); iter.hasNext(); ) {
      FileStatus p = iter.next();
      if (onepool.accept(p.getPath())) {
        // add it to my output set
        myPaths.add(p);
        iter.remove();
      }
    }
    // create splits for all files in this pool.
    getMoreSplits(job, myPaths, maxSize, minSizeNode, minSizeRack, splits);
  }
  // create splits for all files that are not in any pool.
  getMoreSplits(job, stats, maxSize, minSizeNode, minSizeRack, splits);
  // free up rackToNodes map
  rackToNodes.clear();
  return splits;
}
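When maxSplitSize, minSplitSizeNode, and minSplitSizeRack are not set on the format instance, the three sizes above are read from the configuration. A minimal sketch of setting them follows; the maxsize key is spelled out verbatim in getSplits() above, while the per-node and per-rack key strings are assumed to be the values behind SPLIT_MINSIZE_PERNODE and SPLIT_MINSIZE_PERRACK, and the byte counts are arbitrary examples.

import org.apache.hadoop.conf.Configuration;

public class CombineSplitSizing {
  // Configure split sizing for a CombineFileInputFormat-based job.
  public static Configuration configure(Configuration conf) {
    conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128L * 1024 * 1024);
    // getSplits() rejects configurations where a minimum exceeds the maximum,
    // or where the per-node minimum exceeds the per-rack minimum.
    return conf;
  }
}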
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class TestSleepJob, method testRandomLocation.
private void testRandomLocation(int locations, int njobs, UserGroupInformation ugi)
    throws Exception {
  Configuration configuration = new Configuration();
  DebugJobProducer jobProducer = new DebugJobProducer(njobs, configuration);
  Configuration jconf = GridmixTestUtils.mrvl.getConfig();
  jconf.setInt(JobCreator.SLEEPJOB_RANDOM_LOCATIONS, locations);
  JobStory story;
  int seq = 1;
  while ((story = jobProducer.getNextJob()) != null) {
    GridmixJob gridmixJob = JobCreator.SLEEPJOB.createGridmixJob(
        jconf, 0, story, new Path("ignored"), ugi, seq++);
    gridmixJob.buildSplits(null);
    List<InputSplit> splits =
        new SleepJob.SleepInputFormat().getSplits(gridmixJob.getJob());
    for (InputSplit split : splits) {
      assertEquals(locations, split.getLocations().length);
    }
  }
  jobProducer.close();
}
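The assertion above only checks that every split reports the configured number of locations. As a minimal, purely illustrative sketch of what such an assertion exercises, here is a custom InputSplit that fabricates a fixed number of synthetic host names; it is not Gridmix's SleepInputFormat implementation, and the class and host names are hypothetical.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;

// Illustrative split whose getLocations() returns a fixed number of
// synthetic host names, mirroring what the test above asserts on.
public class FakeLocationsSplit extends InputSplit implements Writable {
  private int numLocations;

  public FakeLocationsSplit() { }                 // needed for deserialization

  public FakeLocationsSplit(int numLocations) {
    this.numLocations = numLocations;
  }

  @Override
  public long getLength() {
    return 0L;                                    // no real data behind this split
  }

  @Override
  public String[] getLocations() {
    String[] hosts = new String[numLocations];
    for (int i = 0; i < numLocations; i++) {
      hosts[i] = "fakehost-" + i;                 // hypothetical host names
    }
    return hosts;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(numLocations);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    numLocations = in.readInt();
  }
}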