use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.
the class DynamicInputFormat method createSplits.
private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
    int numMaps = getNumMapTasks(jobContext.getConfiguration());
    final int nSplits = Math.min(numMaps, chunks.size());
    List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);
    for (int i = 0; i < nSplits; ++i) {
        TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
        chunks.get(i).assignTo(taskId);
        // Use a non-zero length so the split is not treated as empty and skipped over.
        splits.add(new FileSplit(chunks.get(i).getPath(), 0,
            getMinRecordsPerChunk(jobContext.getConfiguration()), null));
    }
    DistCpUtils.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, splits.size());
    return splits;
}
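For reference, here is a minimal, self-contained sketch of how the FileSplit objects built above expose their metadata to a consumer. The SplitInspector class and its describe method are hypothetical names introduced only to illustrate the standard FileSplit accessors (getPath, getStart, getLength).

import java.util.List;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Hypothetical helper: prints the path, start offset, and length of each FileSplit.
public class SplitInspector {
    public static void describe(List<InputSplit> splits) {
        for (InputSplit split : splits) {
            FileSplit fileSplit = (FileSplit) split;
            System.out.println(fileSplit.getPath()
                + " start=" + fileSplit.getStart()
                + " length=" + fileSplit.getLength());
        }
    }
}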
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.
the class TestDistCacheEmulation method validateSetupGenDC.
/**
* Validate setupGenerateDistCacheData by validating <li>permissions of the
* distributed cache directories and <li>content of the generated sequence
* file. This includes validation of dist cache file paths and their file
* sizes.
*/
private void validateSetupGenDC(Configuration jobConf, long[] sortedFileSizes) throws IOException, InterruptedException {
    // build things needed for validation
    long sumOfFileSizes = 0;
    for (int i = 0; i < sortedFileSizes.length; i++) {
        sumOfFileSizes += sortedFileSizes[i];
    }
    FileSystem fs = FileSystem.get(jobConf);
    assertEquals("Number of distributed cache files to be generated is wrong.",
        sortedFileSizes.length, jobConf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1));
    assertEquals("Total size of dist cache files to be generated is wrong.",
        sumOfFileSizes, jobConf.getLong(GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));
    Path filesListFile = new Path(jobConf.get(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST));
    FileStatus stat = fs.getFileStatus(filesListFile);
    assertEquals("Wrong permissions of dist Cache files list file " + filesListFile,
        new FsPermission((short) 0644), stat.getPermission());
    InputSplit split = new FileSplit(filesListFile, 0, stat.getLen(), (String[]) null);
    TaskAttemptContext taskContext = MapReduceTestUtil.createDummyMapTaskAttemptContext(jobConf);
    RecordReader<LongWritable, BytesWritable> reader =
        new GenerateDistCacheData.GenDCDataFormat().createRecordReader(split, taskContext);
    MapContext<LongWritable, BytesWritable, NullWritable, BytesWritable> mapContext =
        new MapContextImpl<LongWritable, BytesWritable, NullWritable, BytesWritable>(
            jobConf, taskContext.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mapContext);
    // start validating setupGenerateDistCacheData
    doValidateSetupGenDC(reader, fs, sortedFileSizes);
}
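The test above drives a custom record reader over a FileSplit that covers an entire file. Below is a minimal sketch of the same pattern using the stock TextInputFormat; WholeFileSplitReader and its dump method are hypothetical names, and the plain TaskAttemptID stands in for the MapReduceTestUtil helpers used in the real test.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

// Hypothetical helper: reads every line of a file through a single FileSplit.
public class WholeFileSplitReader {
    public static void dump(Configuration conf, Path file, long length) throws Exception {
        FileSplit split = new FileSplit(file, 0, length, (String[]) null);
        TaskAttemptContextImpl context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        RecordReader<LongWritable, Text> reader =
            new TextInputFormat().createRecordReader(split, context);
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            // Key is the byte offset of the line, value is the line text.
            System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
        }
        reader.close();
    }
}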
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.
the class ZombieJob method getInputSplits.
@Override
public InputSplit[] getInputSplits() {
    if (splits == null) {
        List<InputSplit> splitsList = new ArrayList<InputSplit>();
        Path emptyPath = new Path("/");
        // used to determine the average number of hosts per split
        int totalHosts = 0;
        for (LoggedTask mapTask : job.getMapTasks()) {
            Pre21JobHistoryConstants.Values taskType = mapTask.getTaskType();
            if (taskType != Pre21JobHistoryConstants.Values.MAP) {
                LOG.warn("TaskType for a MapTask is not Map. task=" + mapTask.getTaskID()
                    + " type=" + ((taskType == null) ? "null" : taskType.toString()));
                continue;
            }
            List<LoggedLocation> locations = mapTask.getPreferredLocations();
            List<String> hostList = new ArrayList<String>();
            if (locations != null) {
                for (LoggedLocation location : locations) {
                    List<NodeName> layers = location.getLayers();
                    if (layers.size() == 0) {
                        LOG.warn("Bad location layer format for task " + mapTask.getTaskID());
                        continue;
                    }
                    String host = layers.get(layers.size() - 1).getValue();
                    if (host == null) {
                        LOG.warn("Bad location layer format for task " + mapTask.getTaskID() + ": " + layers);
                        continue;
                    }
                    hostList.add(host);
                }
            }
            String[] hosts = hostList.toArray(new String[hostList.size()]);
            totalHosts += hosts.length;
            long mapInputBytes = getTaskInfo(mapTask).getInputBytes();
            if (mapInputBytes < 0) {
                LOG.warn("InputBytes for task " + mapTask.getTaskID() + " is not defined.");
                mapInputBytes = 0;
            }
            splitsList.add(new FileSplit(emptyPath, 0, mapInputBytes, hosts));
        }
        // If not all map tasks are in the job trace, make up splits for the missing map tasks.
        int totalMaps = job.getTotalMaps();
        if (totalMaps < splitsList.size()) {
            LOG.warn("TotalMaps for job " + job.getJobID()
                + " is less than the total number of map task descriptions ("
                + totalMaps + "<" + splitsList.size() + ").");
        }
        int avgHostPerSplit;
        if (splitsList.size() == 0) {
            avgHostPerSplit = 3;
        } else {
            avgHostPerSplit = totalHosts / splitsList.size();
            if (avgHostPerSplit == 0) {
                avgHostPerSplit = 3;
            }
        }
        for (int i = splitsList.size(); i < totalMaps; i++) {
            if (cluster == null) {
                splitsList.add(new FileSplit(emptyPath, 0, 0, new String[0]));
            } else {
                MachineNode[] mNodes = cluster.getRandomMachines(avgHostPerSplit, random);
                String[] hosts = new String[mNodes.length];
                for (int j = 0; j < hosts.length; ++j) {
                    hosts[j] = mNodes[j].getName();
                }
                // TODO: the size of a made-up split is set to 0 for now.
                splitsList.add(new FileSplit(emptyPath, 0, 0, hosts));
            }
        }
        splits = splitsList.toArray(new InputSplit[splitsList.size()]);
    }
    return splits;
}
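Note that ZombieJob builds FileSplits over a synthetic path ("/") purely to carry sizes and locality hints for simulation. A small hypothetical example of the same idea, showing that a FileSplit's host list is available through getLocations() regardless of whether the path points at real data (the host names below are invented):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Hypothetical snippet: a FileSplit carrying only a length and locality hints.
public class SyntheticSplitExample {
    public static void main(String[] args) throws Exception {
        String[] hosts = {"node-a.example.com", "node-b.example.com"};
        FileSplit split = new FileSplit(new Path("/"), 0, 128L * 1024 * 1024, hosts);
        for (String host : split.getLocations()) {
            System.out.println("preferred host: " + host);
        }
        System.out.println("length: " + split.getLength());
    }
}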
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project mongo-hadoop by mongodb.
the class BSONSplitter method writeSplits.
/**
* Write out the splits file, if doing so has been enabled. Splits must
* already have been calculated previously by a call to {@link
* #readSplitsForFile readSplitsForFile} or {@link #readSplits readSplits}.
*
* @see com.mongodb.hadoop.util.MongoConfigUtil#BSON_WRITE_SPLITS
*
* @throws IOException when an error occurs writing the file
*/
public void writeSplits() throws IOException {
    if (getConf().getBoolean("bson.split.write_splits", true)) {
        LOG.info("Writing splits to disk.");
    } else {
        LOG.info("bson.split.write_splits is set to false - skipping writing splits to disk.");
        return;
    }
    if (splitsList == null) {
        LOG.info("No splits found, skipping write of splits file.");
        return;
    }
    Path outputPath = getSplitsFilePath(inputPath, getConf());
    FileSystem pathFileSystem = outputPath.getFileSystem(getConf());
    FSDataOutputStream fsDataOut = null;
    try {
        fsDataOut = pathFileSystem.create(outputPath, false);
        for (FileSplit inputSplit : splitsList) {
            // Persist each split as a small BSON document holding its start offset and length.
            BSONObject splitObj = BasicDBObjectBuilder.start()
                .add("s", inputSplit.getStart())
                .add("l", inputSplit.getLength()).get();
            byte[] encodedObj = bsonEnc.encode(splitObj);
            fsDataOut.write(encodedObj, 0, encodedObj.length);
        }
    } catch (IOException e) {
        LOG.error("Could not create splits file: " + e.getMessage());
        throw e;
    } finally {
        if (fsDataOut != null) {
            fsDataOut.close();
        }
    }
}
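Each record written above is a small BSON document with fields "s" (start offset) and "l" (length). Below is a hedged sketch of reading such a file back; SplitsFileDump is a hypothetical class, the splits-file path is assumed to be passed as the first argument, and the loop relies on the stream position to detect end of file.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.bson.BSONObject;
import org.bson.BasicBSONDecoder;

// Hypothetical reader for a BSON splits file: one document per split.
public class SplitsFileDump {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path splitsFile = new Path(args[0]);
        FileSystem fs = splitsFile.getFileSystem(conf);
        long fileLength = fs.getFileStatus(splitsFile).getLen();
        BasicBSONDecoder decoder = new BasicBSONDecoder();
        try (FSDataInputStream in = fs.open(splitsFile)) {
            while (in.getPos() < fileLength) {
                BSONObject doc = decoder.readObject(in);
                System.out.println("start=" + doc.get("s") + " length=" + doc.get("l"));
            }
        }
    }
}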
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project mongo-hadoop by mongodb.
the class BSONFileInputFormat method getSplits.
@Override
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    Configuration config = context.getConfiguration();
    PathFilter pf = getInputPathFilter(context);
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(config);
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    List<FileStatus> inputFiles = listStatus(context);
    for (FileStatus file : inputFiles) {
        if (pf != null && !pf.accept(file.getPath())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Skipping file %s: it does not match the path filter.", file.getPath()));
            }
            continue;
        } else if (!isSplitable(context, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed, so it cannot be split.");
            splits.add(splitter.createFileSplit(file, FileSystem.get(file.getPath().toUri(), config), 0L, file.getLen()));
            continue;
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("processing file " + file.getPath());
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), config);
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        splits.addAll(splitter.getAllSplits());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d splits found.", splits.size()));
    }
    return splits;
}
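For completeness, a hypothetical driver fragment showing how BSONFileInputFormat might be plugged into a job so that the getSplits() method above runs over an input directory. BsonJobSetup and its configure method are illustrative names, the com.mongodb.hadoop.BSONFileInputFormat import is assumed to match mongo-hadoop's packaging, and mapper, reducer, and output configuration are omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import com.mongodb.hadoop.BSONFileInputFormat;

// Hypothetical driver fragment: wires BSONFileInputFormat into a job's input side.
public class BsonJobSetup {
    public static Job configure(Configuration conf, String inputDir) throws Exception {
        Job job = Job.getInstance(conf, "bson-input-example");
        job.setInputFormatClass(BSONFileInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(inputDir));
        // Mapper, reducer, and output settings are omitted; this only shows the input format wiring.
        return job;
    }
}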