Example 91 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project crunch by cloudera.

The class CrunchRecordReader, method initialize:

@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    // Unwrap the Crunch-specific split to recover the wrapped format's split.
    CrunchInputSplit crunchSplit = (CrunchInputSplit) inputSplit;
    InputSplit delegateSplit = crunchSplit.getInputSplit();
    // Initialize the delegate reader with a context built from the split's own Configuration.
    delegate.initialize(delegateSplit, TaskAttemptContextFactory.create(crunchSplit.getConf(), context.getTaskAttemptID()));
}
Also used: InputSplit(org.apache.hadoop.mapreduce.InputSplit)
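
CrunchRecordReader is a thin wrapper: the Crunch split carries both the underlying format's split and its own Configuration, and every read call is forwarded to a delegate reader. A minimal sketch of the same delegation pattern, using a hypothetical WrappingRecordReader that is not part of Crunch:

import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical wrapper illustrating the delegation pattern used by CrunchRecordReader:
// every RecordReader call is forwarded to the wrapped format's reader.
public class WrappingRecordReader<K, V> extends RecordReader<K, V> {

    private final RecordReader<K, V> delegate;

    public WrappingRecordReader(RecordReader<K, V> delegate) {
        this.delegate = delegate;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // A real wrapper would unwrap a composite split here, as CrunchRecordReader does.
        delegate.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return delegate.nextKeyValue();
    }

    @Override
    public K getCurrentKey() throws IOException, InterruptedException {
        return delegate.getCurrentKey();
    }

    @Override
    public V getCurrentValue() throws IOException, InterruptedException {
        return delegate.getCurrentValue();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return delegate.getProgress();
    }

    @Override
    public void close() throws IOException {
        delegate.close();
    }
}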

Example 92 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project druid by druid-io.

The class DatasourceInputFormat, method getSplits:

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    String segmentsStr = Preconditions.checkNotNull(conf.get(CONF_INPUT_SEGMENTS), "No segments found to read");
    List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr, new TypeReference<List<WindowedDataSegment>>() {
    });
    if (segments == null || segments.size() == 0) {
        throw new ISE("No segments found to read");
    }
    logger.info("segments to read [%s]", segmentsStr);
    long maxSize = conf.getLong(CONF_MAX_SPLIT_SIZE, 0);
    if (maxSize < 0) {
        long totalSize = 0;
        for (WindowedDataSegment segment : segments) {
            totalSize += segment.getSegment().getSize();
        }
        int mapTask = ((JobConf) conf).getNumMapTasks();
        if (mapTask > 0) {
            maxSize = totalSize / mapTask;
        }
    }
    if (maxSize > 0) {
        // Combining will happen, so sort the segments by size so that they
        // are combined appropriately.
        Collections.sort(segments, new Comparator<WindowedDataSegment>() {

            @Override
            public int compare(WindowedDataSegment s1, WindowedDataSegment s2) {
                return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
            }
        });
    }
    List<InputSplit> splits = Lists.newArrayList();
    List<WindowedDataSegment> list = new ArrayList<>();
    long size = 0;
    JobConf dummyConf = new JobConf();
    org.apache.hadoop.mapred.InputFormat fio = supplier.get();
    for (WindowedDataSegment segment : segments) {
        if (size + segment.getSegment().getSize() > maxSize && size > 0) {
            splits.add(toDataSourceSplit(list, fio, dummyConf));
            list = Lists.newArrayList();
            size = 0;
        }
        list.add(segment);
        size += segment.getSegment().getSize();
    }
    if (list.size() > 0) {
        splits.add(toDataSourceSplit(list, fio, dummyConf));
    }
    logger.info("Number of splits [%d]", splits.size());
    return splits;
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) List(java.util.List) ISE(io.druid.java.util.common.ISE) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
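
The heart of this method is a greedy, size-based packing loop: after the ascending sort, segments accumulate into the current split until adding the next one would push it past maxSize, at which point the split is closed and a new one started. A stripped-down sketch of just that loop; Segment is a hypothetical stand-in for WindowedDataSegment (requires Java 16+ for records):

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

// Minimal sketch of the greedy combining in DatasourceInputFormat.getSplits.
public class SegmentCombiner {

    // Hypothetical stand-in for WindowedDataSegment: just an id and a byte size.
    public record Segment(String id, long size) {}

    // Each inner list of the result corresponds to one input split.
    public static List<List<Segment>> combine(List<Segment> segments, long maxSize) {
        // Sort ascending by size so similarly sized segments end up together.
        segments.sort(Comparator.comparingLong(Segment::size));
        List<List<Segment>> splits = new ArrayList<>();
        List<Segment> current = new ArrayList<>();
        long size = 0;
        for (Segment segment : segments) {
            // Close the current split if this segment would overflow it,
            // but never emit an empty split.
            if (size + segment.size() > maxSize && size > 0) {
                splits.add(current);
                current = new ArrayList<>();
                size = 0;
            }
            current.add(segment);
            size += segment.size();
        }
        if (!current.isEmpty()) {
            splits.add(current);
        }
        return splits;
    }
}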

Example 93 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project druid by druid-io.

The class DatasourceInputFormatTest, method testGetSplitsUsingDefaultSupplier:

@Test
public void testGetSplitsUsingDefaultSupplier() throws Exception {
    // Use the builtin supplier, reading from the local filesystem, rather than testFormatter.
    final File tmpFile = temporaryFolder.newFile("something:with:colons");
    Files.write("dummy", tmpFile, Charsets.UTF_8);
    final ImmutableList<WindowedDataSegment> mySegments = ImmutableList.of(WindowedDataSegment.of(new DataSegment("test1", Interval.parse("2000/3000"), "ver", ImmutableMap.<String, Object>of("type", "local", "path", tmpFile.getPath()), ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"), NoneShardSpec.instance(), 9, 2)));
    final JobConf myConfig = new JobConf();
    myConfig.set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, new DefaultObjectMapper().writeValueAsString(mySegments));
    final JobContext myContext = EasyMock.createMock(JobContext.class);
    EasyMock.expect(myContext.getConfiguration()).andReturn(myConfig);
    EasyMock.replay(myContext);
    final List<InputSplit> splits = new DatasourceInputFormat().getSplits(myContext);
    Assert.assertEquals(1, splits.size());
    final DatasourceInputSplit theSplit = (DatasourceInputSplit) Iterables.getOnlyElement(splits);
    Assert.assertEquals(mySegments.get(0).getSegment().getSize(), theSplit.getLength());
    Assert.assertEquals(mySegments, theSplit.getSegments());
    Assert.assertArrayEquals(new String[] { "localhost" }, theSplit.getLocations());
}
Also used: DefaultObjectMapper(io.druid.jackson.DefaultObjectMapper) JobContext(org.apache.hadoop.mapreduce.JobContext) File(java.io.File) DataSegment(io.druid.timeline.DataSegment) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
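
Note the mocking pattern: the only JobContext method the format touches is getConfiguration(), so a one-expectation EasyMock mock is enough to drive getSplits() without a running cluster. The same recipe works for unit-testing any InputFormat; a minimal sketch, where MyInputFormat is a hypothetical format under test:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.easymock.EasyMock;
import org.junit.Test;

@Test
public void testGetSplitsWithMockedContext() throws Exception {
    Configuration conf = new Configuration();
    // ... set whatever keys the format reads, e.g. its input location ...

    JobContext context = EasyMock.createMock(JobContext.class);
    // anyTimes() tolerates the format reading the configuration repeatedly.
    EasyMock.expect(context.getConfiguration()).andReturn(conf).anyTimes();
    EasyMock.replay(context);

    // MyInputFormat is hypothetical; substitute the format under test.
    List<InputSplit> splits = new MyInputFormat().getSplits(context);
    EasyMock.verify(context);
    // ... assertions on splits ...
}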

Example 94 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class FileInputFormat, method getSplits:

/** 
   * Generate the list of files and make them into FileSplits.
   * @param job the job context
   * @throws IOException
   */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    StopWatch sw = new StopWatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else {
                // not splitable
                if (LOG.isDebugEnabled()) {
                    // Log only if the file is big enough to be split
                    if (length > Math.min(file.getBlockSize(), minSize)) {
                        LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
                    }
                }
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
    }
    return splits;
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ArrayList(java.util.ArrayList) BlockLocation(org.apache.hadoop.fs.BlockLocation) StopWatch(org.apache.hadoop.util.StopWatch) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
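
Two constants govern the loop above: computeSplitSize(blockSize, minSize, maxSize) reduces to Math.max(minSize, Math.min(maxSize, blockSize)), and SPLIT_SLOP (1.1) lets the final chunk run up to 10% over splitSize rather than becoming a tiny tail split. Both bounds are tunable per job through FileInputFormat's static setters; a short sketch, with the 64 MB and 256 MB values chosen only for illustration:

import java.io.IOException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Bound the split size used by FileInputFormat.getSplits.
// Effective size per file: max(minSize, min(maxSize, blockSize)).
public static Job configureSplitBounds() throws IOException {
    Job job = Job.getInstance();
    // Never produce splits smaller than 64 MB ...
    FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);
    // ... or larger than 256 MB, regardless of the HDFS block size.
    FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);
    return job;
}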

Example 95 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class NLineInputFormat, method getSplits:

/** 
   * Logically splits the set of input files for the job, grouping
   * N lines of the input into one split.
   * 
   * @see FileInputFormat#getSplits(JobContext)
   */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    int numLinesPerSplit = getNumLinesPerSplit(job);
    for (FileStatus status : listStatus(job)) {
        splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
    }
    return splits;
}
Also used: FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
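
From the job author's side, the per-file work in getSplitsForFile is controlled by a single knob: the number of lines per split. A minimal sketch of wiring NLineInputFormat into a job (the input path is a placeholder):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

// Each map task receives 1000 input lines (the last split of a file may be shorter).
public static Job configureNLineJob() throws IOException {
    Job job = Job.getInstance();
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, new Path("/data/input"));
    // Backed by mapreduce.input.lineinputformat.linespermap.
    NLineInputFormat.setNumLinesPerSplit(job, 1000);
    return job;
}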

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160
Configuration (org.apache.hadoop.conf.Configuration): 70
Test (org.junit.Test): 68
ArrayList (java.util.ArrayList): 51
Path (org.apache.hadoop.fs.Path): 43
Job (org.apache.hadoop.mapreduce.Job): 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38
IOException (java.io.IOException): 33
JobContext (org.apache.hadoop.mapreduce.JobContext): 20
LongWritable (org.apache.hadoop.io.LongWritable): 19
FileSystem (org.apache.hadoop.fs.FileSystem): 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13
List (java.util.List): 13
Text (org.apache.hadoop.io.Text): 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13
DBObject (com.mongodb.DBObject): 10
File (java.io.File): 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9