
Example 26 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.

The class DynamicInputFormat, method createSplits.

private List<InputSplit> createSplits(JobContext jobContext, List<DynamicInputChunk> chunks) throws IOException {
    int numMaps = getNumMapTasks(jobContext.getConfiguration());
    final int nSplits = Math.min(numMaps, chunks.size());
    List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);
    for (int i = 0; i < nSplits; ++i) {
        TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
        chunks.get(i).assignTo(taskId);
        splits.add(new FileSplit(chunks.get(i).getPath(), 0,
            // Set a non-zero length for the FileSplit, so that zero-sized
            // splits are not treated as empty and skipped over.
            getMinRecordsPerChunk(jobContext.getConfiguration()), null));
    }
    DistCpUtils.publish(jobContext.getConfiguration(), CONF_LABEL_NUM_SPLITS, splits.size());
    return splits;
}
Also used : ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit)
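For orientation, a minimal self-contained sketch (not from DistCp) of the FileSplit constructor these examples rely on: the backing path, a byte offset, a length, and optional preferred hosts. The path and host name below are hypothetical.

import java.util.Arrays;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FileSplitSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical path and host; FileSplit also accepts null for the host array.
        FileSplit split = new FileSplit(new Path("/tmp/chunk-0"), 0L, 1024L, new String[] { "host1" });
        System.out.println(split.getPath() + " starts at " + split.getStart() + ", length " + split.getLength());
        // getLocations() returns the preferred hosts passed to the constructor.
        System.out.println(Arrays.toString(split.getLocations()));
    }
}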

Example 27 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.

The class TestDistCacheEmulation, method validateSetupGenDC.

/**
   * Validate setupGenerateDistCacheData by validating <li>permissions of the
   * distributed cache directories and <li>content of the generated sequence
   * file. This includes validation of dist cache file paths and their file
   * sizes.
   */
private void validateSetupGenDC(Configuration jobConf, long[] sortedFileSizes) throws IOException, InterruptedException {
    // build things needed for validation
    long sumOfFileSizes = 0;
    for (int i = 0; i < sortedFileSizes.length; i++) {
        sumOfFileSizes += sortedFileSizes[i];
    }
    FileSystem fs = FileSystem.get(jobConf);
    assertEquals("Number of distributed cache files to be generated is wrong.", sortedFileSizes.length, jobConf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1));
    assertEquals("Total size of dist cache files to be generated is wrong.", sumOfFileSizes, jobConf.getLong(GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));
    Path filesListFile = new Path(jobConf.get(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST));
    FileStatus stat = fs.getFileStatus(filesListFile);
    assertEquals("Wrong permissions of dist Cache files list file " + filesListFile, new FsPermission((short) 0644), stat.getPermission());
    InputSplit split = new FileSplit(filesListFile, 0, stat.getLen(), (String[]) null);
    TaskAttemptContext taskContext = MapReduceTestUtil.createDummyMapTaskAttemptContext(jobConf);
    RecordReader<LongWritable, BytesWritable> reader = new GenerateDistCacheData.GenDCDataFormat().createRecordReader(split, taskContext);
    MapContext<LongWritable, BytesWritable, NullWritable, BytesWritable> mapContext = new MapContextImpl<LongWritable, BytesWritable, NullWritable, BytesWritable>(jobConf, taskContext.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mapContext);
    // start validating setupGenerateDistCacheData
    doValidateSetupGenDC(reader, fs, sortedFileSizes);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BytesWritable(org.apache.hadoop.io.BytesWritable) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) NullWritable(org.apache.hadoop.io.NullWritable) FileSystem(org.apache.hadoop.fs.FileSystem) FsPermission(org.apache.hadoop.fs.permission.FsPermission) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
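The test above hands the initialized reader to doValidateSetupGenDC. As a hedged sketch (not part of the Hadoop test), this is the standard RecordReader loop such a validation would build on to walk the records behind a FileSplit; the helper class name is hypothetical.

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.RecordReader;

class RecordReaderDrain {
    /** Counts the records exposed by an already-initialized reader. */
    static int drain(RecordReader<LongWritable, BytesWritable> reader)
            throws IOException, InterruptedException {
        int records = 0;
        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            BytesWritable value = reader.getCurrentValue();
            // A real validation would compare key/value against the expected
            // dist cache file sizes here.
            records++;
        }
        reader.close();
        return records;
    }
}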

Example 28 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hadoop by apache.

The class ZombieJob, method getInputSplits.

@Override
public InputSplit[] getInputSplits() {
    if (splits == null) {
        List<InputSplit> splitsList = new ArrayList<InputSplit>();
        Path emptyPath = new Path("/");
        // used to determine the avg # of hosts per split.
        int totalHosts = 0;
        for (LoggedTask mapTask : job.getMapTasks()) {
            Pre21JobHistoryConstants.Values taskType = mapTask.getTaskType();
            if (taskType != Pre21JobHistoryConstants.Values.MAP) {
                LOG.warn("TaskType for a MapTask is not Map. task=" + mapTask.getTaskID() + " type=" + ((taskType == null) ? "null" : taskType.toString()));
                continue;
            }
            List<LoggedLocation> locations = mapTask.getPreferredLocations();
            List<String> hostList = new ArrayList<String>();
            if (locations != null) {
                for (LoggedLocation location : locations) {
                    List<NodeName> layers = location.getLayers();
                    if (layers.size() == 0) {
                        LOG.warn("Bad location layer format for task " + mapTask.getTaskID());
                        continue;
                    }
                    String host = layers.get(layers.size() - 1).getValue();
                    if (host == null) {
                        LOG.warn("Bad location layer format for task " + mapTask.getTaskID() + ": " + layers);
                        continue;
                    }
                    hostList.add(host);
                }
            }
            String[] hosts = hostList.toArray(new String[hostList.size()]);
            totalHosts += hosts.length;
            long mapInputBytes = getTaskInfo(mapTask).getInputBytes();
            if (mapInputBytes < 0) {
                LOG.warn("InputBytes for task " + mapTask.getTaskID() + " is not defined.");
                mapInputBytes = 0;
            }
            splitsList.add(new FileSplit(emptyPath, 0, mapInputBytes, hosts));
        }
        // If not all map tasks are in job trace, should make up some splits
        // for missing map tasks.
        int totalMaps = job.getTotalMaps();
        if (totalMaps < splitsList.size()) {
            LOG.warn("TotalMaps for job " + job.getJobID() + " is less than the total number of map task descriptions (" + totalMaps + "<" + splitsList.size() + ").");
        }
        int avgHostPerSplit;
        if (splitsList.size() == 0) {
            avgHostPerSplit = 3;
        } else {
            avgHostPerSplit = totalHosts / splitsList.size();
            if (avgHostPerSplit == 0) {
                avgHostPerSplit = 3;
            }
        }
        for (int i = splitsList.size(); i < totalMaps; i++) {
            if (cluster == null) {
                splitsList.add(new FileSplit(emptyPath, 0, 0, new String[0]));
            } else {
                MachineNode[] mNodes = cluster.getRandomMachines(avgHostPerSplit, random);
                String[] hosts = new String[mNodes.length];
                for (int j = 0; j < hosts.length; ++j) {
                    hosts[j] = mNodes[j].getName();
                }
                // TODO set size of a split to 0 now.
                splitsList.add(new FileSplit(emptyPath, 0, 0, hosts));
            }
        }
        splits = splitsList.toArray(new InputSplit[splitsList.size()]);
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) Values(org.apache.hadoop.tools.rumen.Pre21JobHistoryConstants.Values) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 29 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project mongo-hadoop by mongodb.

The class BSONSplitter, method writeSplits.

/**
     * Write out the splits file, if doing so has been enabled. Splits must
     * already have been calculated previously by a call to {@link
     * #readSplitsForFile readSplitsForFile} or {@link #readSplits readSplits}.
     *
     * @see com.mongodb.hadoop.util.MongoConfigUtil#BSON_WRITE_SPLITS
     *
     * @throws IOException when an error occurs writing the file
     */
public void writeSplits() throws IOException {
    if (getConf().getBoolean("bson.split.write_splits", true)) {
        LOG.info("Writing splits to disk.");
    } else {
        LOG.info("bson.split.write_splits is set to false - skipping writing splits to disk.");
        return;
    }
    if (splitsList == null) {
        LOG.info("No splits found, skipping write of splits file.");
    }
    Path outputPath = getSplitsFilePath(inputPath, getConf());
    FileSystem pathFileSystem = outputPath.getFileSystem(getConf());
    FSDataOutputStream fsDataOut = null;
    try {
        fsDataOut = pathFileSystem.create(outputPath, false);
        for (FileSplit inputSplit : splitsList) {
            BSONObject splitObj = BasicDBObjectBuilder.start().add("s", inputSplit.getStart()).add("l", inputSplit.getLength()).get();
            byte[] encodedObj = bsonEnc.encode(splitObj);
            fsDataOut.write(encodedObj, 0, encodedObj.length);
        }
    } catch (IOException e) {
        LOG.error("Could not create splits file: " + e.getMessage());
        throw e;
    } finally {
        if (fsDataOut != null) {
            fsDataOut.close();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBSONObject(org.bson.LazyBSONObject) BSONObject(org.bson.BSONObject) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit)
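A minimal usage sketch of the contract described in the Javadoc above, assembled only from methods visible in these snippets (setConf, setInputPath, readSplitsForFile, writeSplits); the input path is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.mongodb.hadoop.splitter.BSONSplitter;

public class WriteSplitsSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path input = new Path("/data/collection.bson");  // hypothetical BSON input file
        FileStatus status = FileSystem.get(conf).getFileStatus(input);

        BSONSplitter splitter = new BSONSplitter();
        splitter.setConf(conf);
        splitter.setInputPath(input);
        // Splits must be calculated before writeSplits(), per the Javadoc above.
        splitter.readSplitsForFile(status);
        splitter.writeSplits();
    }
}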

Example 30 with FileSplit

Use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project mongo-hadoop by mongodb.

The class BSONFileInputFormat, method getSplits.

@Override
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    Configuration config = context.getConfiguration();
    PathFilter pf = getInputPathFilter(context);
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(config);
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    List<FileStatus> inputFiles = listStatus(context);
    for (FileStatus file : inputFiles) {
        if (pf != null && !pf.accept(file.getPath())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("skipping file %s not matched path filter.", file.getPath()));
            }
            continue;
        } else if (!isSplitable(context, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so " + "cannot be split.");
            splits.add(splitter.createFileSplit(file, FileSystem.get(file.getPath().toUri(), config), 0L, file.getLen()));
            continue;
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("processing file " + file.getPath());
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), config);
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        splits.addAll(splitter.getAllSplits());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d found.", splits.size()));
    }
    return splits;
}
Also used : BSONSplitter.getSplitsFilePath(com.mongodb.hadoop.splitter.BSONSplitter.getSplitsFilePath) Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit)
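For context, a hedged sketch of wiring this input format into a job so that the getSplits() above is invoked by the framework. It assumes BSONFileInputFormat lives in the com.mongodb.hadoop package (the snippet itself only names the splitter and split classes); the input path is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.mongodb.hadoop.BSONFileInputFormat;  // assumed package

public class BsonJobSetupSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "bson-input-example");
        job.setInputFormatClass(BSONFileInputFormat.class);
        // The framework calls BSONFileInputFormat.getSplits() for this path at submit time.
        FileInputFormat.addInputPath(job, new Path("/data/dump"));  // hypothetical input directory
    }
}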

Aggregations

FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 39 usages
Path (org.apache.hadoop.fs.Path): 22 usages
Configuration (org.apache.hadoop.conf.Configuration): 13 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 12 usages
IOException (java.io.IOException): 10 usages
ArrayList (java.util.ArrayList): 10 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 7 usages
BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit): 4 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 usages
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 4 usages
Text (org.apache.hadoop.io.Text): 3 usages
NodeControllerInfo (org.apache.hyracks.api.client.NodeControllerInfo): 3 usages
BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter): 2 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 2 usages
File (java.io.File): 2 usages
Constructor (java.lang.reflect.Constructor): 2 usages
Schema (org.apache.avro.Schema): 2 usages
AvroKeyRecordReader (org.apache.avro.mapreduce.AvroKeyRecordReader): 2 usages
FileSplitPartitionQuery (org.apache.gora.query.impl.FileSplitPartitionQuery): 2 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 2 usages