Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class TestFixedLengthInputFormat, method testZeroRecordLength.
/**
 * Test with record length set to 0
 */
@Test(timeout = 5000)
public void testZeroRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  Job job = Job.getInstance(defaultConf);
  // Set the fixed length record length config property
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 0);
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context =
          MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable> mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable, BytesWritable>(
              job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
              MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch (IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for zero record length:", exceptionThrown);
}
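The test above only verifies that a record length of 0 is rejected when the reader is initialized. For contrast, here is a minimal sketch of how FixedLengthInputFormat would normally be configured with a positive record length before job submission; the job name, input path, and 20-byte record width are assumptions for illustration, while setRecordLength is the same helper the test calls with 0.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;

public class FixedLengthJobSetup {
  // Configure a job to read 20-byte fixed-width records; the length must be
  // positive, otherwise initialization fails as the test above demonstrates.
  public static Job configure(Configuration conf, Path input) throws Exception {
    Job job = Job.getInstance(conf, "fixed-length-read");   // hypothetical job name
    FixedLengthInputFormat.setRecordLength(job.getConfiguration(), 20);
    job.setInputFormatClass(FixedLengthInputFormat.class);
    FileInputFormat.setInputPaths(job, input);
    // Keys arrive as LongWritable byte offsets; values are BytesWritable
    // slices of exactly the configured record length.
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    return job;
  }
}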
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class TestNLineInputFormat, method checkFormat.
void checkFormat(Job job, int expectedN, int lastN)
    throws IOException, InterruptedException {
  NLineInputFormat format = new NLineInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  int count = 0;
  for (int i = 0; i < splits.size(); i++) {
    assertEquals("There are no split locations", 0,
        splits.get(i).getLocations().length);
    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    RecordReader<LongWritable, Text> reader =
        format.createRecordReader(splits.get(i), context);
    Class<?> clazz = reader.getClass();
    assertEquals("reader class is LineRecordReader.",
        LineRecordReader.class, clazz);
    MapContext<LongWritable, Text, LongWritable, Text> mcontext =
        new MapContextImpl<LongWritable, Text, LongWritable, Text>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), splits.get(i));
    reader.initialize(splits.get(i), mcontext);
    try {
      count = 0;
      while (reader.nextKeyValue()) {
        count++;
      }
    } finally {
      reader.close();
    }
    if (i == splits.size() - 1) {
      assertEquals("number of lines in split(" + i + ") is wrong", lastN, count);
    } else {
      assertEquals("number of lines in split(" + i + ") is wrong", expectedN, count);
    }
  }
}
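checkFormat counts the records delivered by each split and allows the last split to be shorter. The lines-per-split value itself is set on the job before getSplits is called; a minimal sketch, assuming the static NLineInputFormat.setNumLinesPerSplit helper and hypothetical job name and input path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class NLineJobSetup {
  // Request splits of (at most) 5 input lines each; the final split may hold
  // fewer lines, which is why checkFormat above takes a separate lastN argument.
  public static Job configure(Configuration conf, Path input) throws Exception {
    Job job = Job.getInstance(conf, "nline-read");   // hypothetical job name
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setNumLinesPerSplit(job, 5);
    FileInputFormat.setInputPaths(job, input);
    return job;
  }
}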
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class DateSplitter, method split.
public List<InputSplit> split(Configuration conf, ResultSet results, String colName)
    throws SQLException {
  long minVal;
  long maxVal;
  int sqlDataType = results.getMetaData().getColumnType(1);
  minVal = resultSetColToLong(results, 1, sqlDataType);
  maxVal = resultSetColToLong(results, 2, sqlDataType);
  String lowClausePrefix = colName + " >= ";
  String highClausePrefix = colName + " < ";
  int numSplits = conf.getInt(MRJobConfig.NUM_MAPS, 1);
  if (numSplits < 1) {
    numSplits = 1;
  }
  if (minVal == Long.MIN_VALUE && maxVal == Long.MIN_VALUE) {
    // The range of acceptable dates is NULL to NULL. Just create a single split.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
        colName + " IS NULL", colName + " IS NULL"));
    return splits;
  }
  // Gather the split point integers
  List<Long> splitPoints = split(numSplits, minVal, maxVal);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  // Turn the split points into a set of intervals.
  long start = splitPoints.get(0);
  Date startDate = longToDate(start, sqlDataType);
  if (sqlDataType == Types.TIMESTAMP) {
    // The lower bound's nanos value needs to match the actual lower-bound nanos.
    try {
      ((java.sql.Timestamp) startDate).setNanos(results.getTimestamp(1).getNanos());
    } catch (NullPointerException npe) {
      // If the lower bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
    }
  }
  for (int i = 1; i < splitPoints.size(); i++) {
    long end = splitPoints.get(i);
    Date endDate = longToDate(end, sqlDataType);
    if (i == splitPoints.size() - 1) {
      if (sqlDataType == Types.TIMESTAMP) {
        // The upper bound's nanos value needs to match the actual upper-bound nanos.
        try {
          ((java.sql.Timestamp) endDate).setNanos(results.getTimestamp(2).getNanos());
        } catch (NullPointerException npe) {
          // If the upper bound was NULL, we'll get an NPE; just ignore it and don't set nanos.
        }
      }
      // This is the last one; use a closed interval.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          lowClausePrefix + dateToString(startDate),
          colName + " <= " + dateToString(endDate)));
    } else {
      // Normal open-interval case.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
          lowClausePrefix + dateToString(startDate),
          highClausePrefix + dateToString(endDate)));
    }
    start = end;
    startDate = endDate;
  }
  if (minVal == Long.MIN_VALUE || maxVal == Long.MIN_VALUE) {
    // Add an extra split to handle the null case that we saw.
    splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(
        colName + " IS NULL", colName + " IS NULL"));
  }
  return splits;
}
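Each DataDrivenDBInputSplit produced above carries a lower-bound and an upper-bound SQL clause; at read time those clauses are combined into the WHERE condition of the per-split query. The following conceptual sketch shows that combination; the table, columns, and query template are hypothetical and not the exact SQL emitted by the DB record reader.

// Conceptual sketch: how the bound clauses created by DateSplitter.split()
// would be stitched into one query per split. Table and column names are
// hypothetical.
public class SplitQuerySketch {
  public static String buildQuery(String lowerClause, String upperClause) {
    // e.g. lowerClause = "last_update >= '2017-01-01 00:00:00.0'"
    //      upperClause = "last_update <  '2017-02-01 00:00:00.0'"
    return "SELECT id, payload FROM events WHERE "
        + "( " + lowerClause + " ) AND ( " + upperClause + " )";
  }

  public static void main(String[] args) {
    System.out.println(buildQuery(
        "last_update >= '2017-01-01 00:00:00.0'",
        "last_update < '2017-02-01 00:00:00.0'"));
  }
}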
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class CombineFileInputFormat, method getSplits.
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  long minSizeNode = 0;
  long minSizeRack = 0;
  long maxSize = 0;
  Configuration conf = job.getConfiguration();
  // Values set on this instance (minSplitSizeNode, minSplitSizeRack,
  // maxSplitSize) take precedence over values that might have been
  // specified in the config.
  if (minSplitSizeNode != 0) {
    minSizeNode = minSplitSizeNode;
  } else {
    minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
  }
  if (minSplitSizeRack != 0) {
    minSizeRack = minSplitSizeRack;
  } else {
    minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
  }
  if (maxSplitSize != 0) {
    maxSize = maxSplitSize;
  } else {
    maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    // If maxSize is not configured, a single split will be generated per
    // node.
  }
  if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
    throw new IOException("Minimum split size pernode " + minSizeNode +
        " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
    throw new IOException("Minimum split size per rack " + minSizeRack +
        " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && minSizeNode > minSizeRack) {
    throw new IOException("Minimum split size per node " + minSizeNode +
        " cannot be larger than minimum split " + "size per rack " + minSizeRack);
  }
  // all the files in input set
  List<FileStatus> stats = listStatus(job);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  if (stats.size() == 0) {
    return splits;
  }
  // Process one pool at a time, so that each split contains paths
  // from a single pool only.
  for (MultiPathFilter onepool : pools) {
    ArrayList<FileStatus> myPaths = new ArrayList<FileStatus>();
    // If a path is accepted by this pool's filter,
    // add it to the pool's output set.
    for (Iterator<FileStatus> iter = stats.iterator(); iter.hasNext(); ) {
      FileStatus p = iter.next();
      if (onepool.accept(p.getPath())) {
        // add it to my output set
        myPaths.add(p);
        iter.remove();
      }
    }
    // create splits for all files in this pool.
    getMoreSplits(job, myPaths, maxSize, minSizeNode, minSizeRack, splits);
  }
  // create splits for all files that are not in any pool.
  getMoreSplits(job, stats, maxSize, minSizeNode, minSizeRack, splits);
  // free up rackToNodes map
  rackToNodes.clear();
  return splits;
}
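When maxSplitSize, minSplitSizeNode, and minSplitSizeRack are not set on the format instance, the three sizes above are read from the configuration. A minimal sketch of setting them follows; the maxsize key is spelled out verbatim in getSplits() above, while the per-node and per-rack key strings are assumed to be the values behind SPLIT_MINSIZE_PERNODE and SPLIT_MINSIZE_PERRACK, and the byte counts are arbitrary examples.

import org.apache.hadoop.conf.Configuration;

public class CombineSplitSizing {
  // Configure split sizing for a CombineFileInputFormat-based job.
  public static Configuration configure(Configuration conf) {
    conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
    conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 128L * 1024 * 1024);
    // getSplits() rejects configurations where a minimum exceeds the maximum,
    // or where the per-node minimum exceeds the per-rack minimum.
    return conf;
  }
}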
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache: class TestSleepJob, method testRandomLocation.
private void testRandomLocation(int locations, int njobs, UserGroupInformation ugi)
    throws Exception {
  Configuration configuration = new Configuration();
  DebugJobProducer jobProducer = new DebugJobProducer(njobs, configuration);
  Configuration jconf = GridmixTestUtils.mrvl.getConfig();
  jconf.setInt(JobCreator.SLEEPJOB_RANDOM_LOCATIONS, locations);
  JobStory story;
  int seq = 1;
  while ((story = jobProducer.getNextJob()) != null) {
    GridmixJob gridmixJob = JobCreator.SLEEPJOB.createGridmixJob(
        jconf, 0, story, new Path("ignored"), ugi, seq++);
    gridmixJob.buildSplits(null);
    List<InputSplit> splits =
        new SleepJob.SleepInputFormat().getSplits(gridmixJob.getJob());
    for (InputSplit split : splits) {
      assertEquals(locations, split.getLocations().length);
    }
  }
  jobProducer.close();
}
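The assertion above only checks that every split reports the configured number of locations. As a minimal, purely illustrative sketch of what such an assertion exercises, here is a custom InputSplit that fabricates a fixed number of synthetic host names; it is not Gridmix's SleepInputFormat implementation, and the class and host names are hypothetical.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;

// Illustrative split whose getLocations() returns a fixed number of
// synthetic host names, mirroring what the test above asserts on.
public class FakeLocationsSplit extends InputSplit implements Writable {
  private int numLocations;

  public FakeLocationsSplit() { }                 // needed for deserialization

  public FakeLocationsSplit(int numLocations) {
    this.numLocations = numLocations;
  }

  @Override
  public long getLength() {
    return 0L;                                    // no real data behind this split
  }

  @Override
  public String[] getLocations() {
    String[] hosts = new String[numLocations];
    for (int i = 0; i < numLocations; i++) {
      hosts[i] = "fakehost-" + i;                 // hypothetical host names
    }
    return hosts;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(numLocations);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    numLocations = in.readInt();
  }
}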