Search in sources :

Example 26 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.

the class DynamicInputFormat method createSplits.

/**
 * Builds the input splits for the job: one {@link FileSplit} per chunk, using at most as
 * many chunks as there are map tasks.
 *
 * <p>NOTE(review): this excerpt appears truncated — the closing braces of the loop and of
 * the method are missing, and {@code taskId} is never read in the visible lines. Verify
 * against the upstream DynamicInputFormat source before relying on it.
 *
 * @param jobContext supplies the configuration and the job ID
 * @param chunks candidate chunks; only the first {@code min(numMaps, chunks.size())} are used
 * @return the splits that were created
 * @throws IOException declared by the signature; the raising call cannot be identified
 *     from the visible lines
 */
private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
    int numMaps = getNumMapTasks(jobContext.getConfiguration());
    // Never create more splits than there are chunks to hand out.
    final int nSplits = Math.min(numMaps, chunks.size());
    List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);
    for (int i = 0; i < nSplits; ++i) {
        // NOTE(review): taskId is built but not used below; presumably a statement
        // consuming it was lost from this excerpt — confirm.
        TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
        // Each split starts at offset 0, uses getMinRecordsPerChunk(...) as its length and
        // passes null for the hosts argument. The trailing "// over." looks like the tail
        // of a longer comment that was cut off.
        splits.add(new FileSplit(chunks.get(i).getPath(), 0, // over.
        getMinRecordsPerChunk(jobContext.getConfiguration()), null));
    // Hand the final split count to DistCpUtils.publish under CONF_LABEL_NUM_SPLITS.
    DistCpUtils.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, splits.size());
    return splits;
Also used : ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit)

Example 27 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.

the class TestDistCacheEmulation method validateSetupGenDC.

   * Validate setupGenerateDistCacheData by validating <li>permissions of the
   * distributed cache directories and <li>content of the generated sequence
   * file. This includes validation of dist cache file paths and their file
   * sizes.
private void validateSetupGenDC(Configuration jobConf, long[] sortedFileSizes) throws IOException, InterruptedException {
    // build things needed for validation
    long sumOfFileSizes = 0;
    for (int i = 0; i < sortedFileSizes.length; i++) {
        sumOfFileSizes += sortedFileSizes[i];
    FileSystem fs = FileSystem.get(jobConf);
    assertEquals("Number of distributed cache files to be generated is wrong.", sortedFileSizes.length, jobConf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1));
    assertEquals("Total size of dist cache files to be generated is wrong.", sumOfFileSizes, jobConf.getLong(GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));
    Path filesListFile = new Path(jobConf.get(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST));
    FileStatus stat = fs.getFileStatus(filesListFile);
    assertEquals("Wrong permissions of dist Cache files list file " + filesListFile, new FsPermission((short) 0644), stat.getPermission());
    InputSplit split = new FileSplit(filesListFile, 0, stat.getLen(), (String[]) null);
    TaskAttemptContext taskContext = MapReduceTestUtil.createDummyMapTaskAttemptContext(jobConf);
    RecordReader<LongWritable, BytesWritable> reader = new GenerateDistCacheData.GenDCDataFormat().createRecordReader(split, taskContext);
    MapContext<LongWritable, BytesWritable, NullWritable, BytesWritable> mapContext = new MapContextImpl<LongWritable, BytesWritable, NullWritable, BytesWritable>(jobConf, taskContext.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mapContext);
    // start validating setupGenerateDistCacheData
    doValidateSetupGenDC(reader, fs, sortedFileSizes);
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BytesWritable(org.apache.hadoop.io.BytesWritable) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) NullWritable(org.apache.hadoop.io.NullWritable) FileSystem(org.apache.hadoop.fs.FileSystem) FsPermission(org.apache.hadoop.fs.permission.FsPermission) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 28 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.

the class ZombieJob method getInputSplits.

/**
 * Returns the input splits of this job, building and caching them in {@code splits} on
 * the first call.
 *
 * <p>One {@link FileSplit} is created per logged map task (length = the task's input
 * bytes, hosts = its preferred locations). If the job reports more maps than there are
 * logged map tasks, extra zero-length splits are made up to reach
 * {@code job.getTotalMaps()}.
 *
 * <p>NOTE(review): this excerpt is truncated. Many closing braces are missing, nothing in
 * the visible lines ever adds to {@code hostList}, and the warning branches fall straight
 * through into the following code. Presumably {@code continue} statements and a
 * {@code hostList.add(host)} call were lost — verify against the upstream ZombieJob source.
 *
 * @return the cached array of input splits
 */
public InputSplit[] getInputSplits() {
    if (splits == null) {
        List<InputSplit> splitsList = new ArrayList<InputSplit>();
        // Every split uses this same placeholder path.
        Path emptyPath = new Path("/");
        // use to determine avg # of hosts per split.
        int totalHosts = 0;
        for (LoggedTask mapTask : job.getMapTasks()) {
            Pre21JobHistoryConstants.Values taskType = mapTask.getTaskType();
            if (taskType != Pre21JobHistoryConstants.Values.MAP) {
                LOG.warn("TaskType for a MapTask is not Map. task=" + mapTask.getTaskID() + " type=" + ((taskType == null) ? "null" : taskType.toString()));
            List<LoggedLocation> locations = mapTask.getPreferredLocations();
            List<String> hostList = new ArrayList<String>();
            if (locations != null) {
                for (LoggedLocation location : locations) {
                    List<NodeName> layers = location.getLayers();
                    if (layers.size() == 0) {
                        LOG.warn("Bad location layer format for task " + mapTask.getTaskID());
                    // The host name is taken from the last layer of the location.
                    // NOTE(review): as shown, an empty layer list would reach this line and
                    // index -1; a skip after the warning above was presumably lost — confirm.
                    String host = layers.get(layers.size() - 1).getValue();
                    if (host == null) {
                        LOG.warn("Bad location layer format for task " + mapTask.getTaskID() + ": " + layers);
            String[] hosts = hostList.toArray(new String[hostList.size()]);
            totalHosts += hosts.length;
            long mapInputBytes = getTaskInfo(mapTask).getInputBytes();
            // A negative input size is treated as undefined and clamped to 0.
            if (mapInputBytes < 0) {
                LOG.warn("InputBytes for task " + mapTask.getTaskID() + " is not defined.");
                mapInputBytes = 0;
            splitsList.add(new FileSplit(emptyPath, 0, mapInputBytes, hosts));
        // If not all map tasks are in job trace, should make up some splits
        // for missing map tasks.
        int totalMaps = job.getTotalMaps();
        if (totalMaps < splitsList.size()) {
            LOG.warn("TotalMaps for job " + job.getJobID() + " is less than the total number of map task descriptions (" + totalMaps + "<" + splitsList.size() + ").");
        // Average host count per split (integer division); falls back to 3 when there are
        // no splits yet or when the average rounds down to 0.
        int avgHostPerSplit;
        if (splitsList.size() == 0) {
            avgHostPerSplit = 3;
        } else {
            avgHostPerSplit = totalHosts / splitsList.size();
            if (avgHostPerSplit == 0) {
                avgHostPerSplit = 3;
        // Pad with made-up splits until there is one per reported map.
        for (int i = splitsList.size(); i < totalMaps; i++) {
            if (cluster == null) {
                // Without cluster information the made-up split gets no hosts.
                splitsList.add(new FileSplit(emptyPath, 0, 0, new String[0]));
            } else {
                // Otherwise pick avgHostPerSplit machines via cluster.getRandomMachines.
                MachineNode[] mNodes = cluster.getRandomMachines(avgHostPerSplit, random);
                String[] hosts = new String[mNodes.length];
                for (int j = 0; j < hosts.length; ++j) {
                    hosts[j] = mNodes[j].getName();
                // TODO set size of a split to 0 now.
                splitsList.add(new FileSplit(emptyPath, 0, 0, hosts));
        splits = splitsList.toArray(new InputSplit[splitsList.size()]);
    return splits;
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) Values(org.apache.hadoop.tools.rumen.Pre21JobHistoryConstants.Values) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 29 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project mongo-hadoop by mongodb.

the class BSONSplitter method writeSplits.

    /**
     * Write out the splits file, if doing so has been enabled. Splits must
     * already have been calculated previously by a call to {@link
     * #readSplitsForFile readSplitsForFile} or {@link #readSplits readSplits}.
     * @see com.mongodb.hadoop.util.MongoConfigUtil#BSON_WRITE_SPLITS
     * @throws IOException when an error occurs writing the file
     */
// Writes the entries of splitsList to the splits file as BSON objects, one per split,
// each holding the split's start ("s") and length ("l").
// NOTE(review): this excerpt is truncated. The bare string literals after "{" on the
// next three lines look like log messages whose call target was stripped, the early
// exits that the messages imply ("skipping ...") are not visible, the closing braces
// are missing, and the excerpt ends inside the finally block. Verify against the
// upstream BSONSplitter source.
public void writeSplits() throws IOException {
    // Writing is controlled by "bson.split.write_splits", which defaults to true here.
    if (getConf().getBoolean("bson.split.write_splits", true)) {"Writing splits to disk.");
    } else {"bson.split.write_splits is set to false - skipping writing splits to disk.");
    if (splitsList == null) {"No splits found, skipping write of splits file.");
    Path outputPath = getSplitsFilePath(inputPath, getConf());
    FileSystem pathFileSystem = outputPath.getFileSystem(getConf());
    // Declared outside the try so the finally block can reach the stream.
    FSDataOutputStream fsDataOut = null;
    try {
        // The second argument is Hadoop's overwrite flag: false means an existing
        // splits file is not overwritten.
        fsDataOut = pathFileSystem.create(outputPath, false);
        for (FileSplit inputSplit : splitsList) {
            BSONObject splitObj = BasicDBObjectBuilder.start().add("s", inputSplit.getStart()).add("l", inputSplit.getLength()).get();
            byte[] encodedObj = bsonEnc.encode(splitObj);
            fsDataOut.write(encodedObj, 0, encodedObj.length);
    } catch (IOException e) {
        // Log the failure, then rethrow so the caller still sees the IOException.
        LOG.error("Could not create splits file: " + e.getMessage());
        throw e;
    } finally {
        // NOTE(review): body missing from the excerpt; presumably closes fsDataOut — confirm.
        if (fsDataOut != null) {
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBSONObject(org.bson.LazyBSONObject) BSONObject(org.bson.BSONObject) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit)

Example 30 with FileSplit

use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project mongo-hadoop by mongodb.

the class BSONFileInputFormat method getSplits.

/**
 * Computes the file splits for every input file of the job.
 *
 * <p>Files rejected by the input path filter are skipped. A file that is not splitable
 * becomes a single split covering the whole file. For the remaining files the method
 * tries to load previously written splits from the file's splits file.
 *
 * <p>NOTE(review): this excerpt is truncated. Closing braces are missing, the string
 * expression after the {@code isSplitable} check has lost its call target (presumably a
 * log call), the catch block is cut off after its debug message, and no visible line
 * adds the splitter's splits to the returned list. Verify against the upstream
 * BSONFileInputFormat source.
 *
 * @param context job context supplying the configuration and the input files
 * @return the list of splits collected for all input files
 * @throws IOException declared by the signature; file listing and split loading are the
 *     visible candidates
 */
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    Configuration config = context.getConfiguration();
    PathFilter pf = getInputPathFilter(context);
    // A single splitter instance is shared across all input files.
    BSONSplitter splitter = new BSONSplitter();
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    List<FileStatus> inputFiles = listStatus(context);
    for (FileStatus file : inputFiles) {
        // A null filter accepts every file.
        if (pf != null && !pf.accept(file.getPath())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("skipping file %s not matched path filter.", file.getPath()));
        } else if (!isSplitable(context, file.getPath())) {
  "File " + file.getPath() + " is compressed so " + "cannot be split.");
            // Unsplitable file: one split from offset 0 over the full file length.
            splits.add(splitter.createFileSplit(file, FileSystem.get(file.getPath().toUri(), config), 0L, file.getLen()));
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("processing file " + file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), config);
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            // No splits file exists for this file; the log message says one is built
            // next, but that code is not part of the excerpt.
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d found.", splits.size()));
    return splits;
Also used : BSONSplitter.getSplitsFilePath(com.mongodb.hadoop.splitter.BSONSplitter.getSplitsFilePath) Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit)


FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit)39 Path (org.apache.hadoop.fs.Path)22 Configuration (org.apache.hadoop.conf.Configuration)13 InputSplit (org.apache.hadoop.mapreduce.InputSplit)12 IOException (java.io.IOException) ArrayList (java.util.ArrayList)10 FileSystem (org.apache.hadoop.fs.FileSystem)7 BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit)4 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)4 TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)4 Text (org.apache.hadoop.io.Text) NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo)3 BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter)2 ByteArrayInputStream (java.io.ByteArrayInputStream) File (java.io.File) Constructor (java.lang.reflect.Constructor)2 Schema (org.apache.avro.Schema)2 AvroKeyRecordReader (org.apache.avro.mapreduce.AvroKeyRecordReader)2 FileSplitPartitionQuery (org.apache.gora.query.impl.FileSplitPartitionQuery)2 FileStatus (org.apache.hadoop.fs.FileStatus)2