Use of org.apache.hadoop.mapreduce.InputSplit in project crunch by cloudera.
The class CrunchRecordReader, method initialize.
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
  CrunchInputSplit crunchSplit = (CrunchInputSplit) inputSplit;
  InputSplit delegateSplit = crunchSplit.getInputSplit();
  delegate.initialize(delegateSplit, TaskAttemptContextFactory.create(crunchSplit.getConf(), context.getTaskAttemptID()));
}
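The snippet above only shows initialize; the surrounding class follows the usual delegating-reader pattern, unwrapping its own split and forwarding every RecordReader call to the wrapped reader. A minimal sketch of that pattern is below; the DelegatingRecordReader class name is illustrative and not part of Crunch, only the org.apache.hadoop.mapreduce types are real.

import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Illustrative wrapper: forwards every call to an underlying RecordReader.
public class DelegatingRecordReader<K, V> extends RecordReader<K, V> {

  private final RecordReader<K, V> delegate;

  public DelegatingRecordReader(RecordReader<K, V> delegate) {
    this.delegate = delegate;
  }

  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    // A wrapping split would be unwrapped here before forwarding,
    // as CrunchRecordReader does with CrunchInputSplit.
    delegate.initialize(split, context);
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    return delegate.nextKeyValue();
  }

  @Override
  public K getCurrentKey() throws IOException, InterruptedException {
    return delegate.getCurrentKey();
  }

  @Override
  public V getCurrentValue() throws IOException, InterruptedException {
    return delegate.getCurrentValue();
  }

  @Override
  public float getProgress() throws IOException, InterruptedException {
    return delegate.getProgress();
  }

  @Override
  public void close() throws IOException {
    delegate.close();
  }
}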
Use of org.apache.hadoop.mapreduce.InputSplit in project druid by druid-io.
The class DatasourceInputFormat, method getSplits.
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  String segmentsStr = Preconditions.checkNotNull(conf.get(CONF_INPUT_SEGMENTS), "No segments found to read");
  List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
      segmentsStr,
      new TypeReference<List<WindowedDataSegment>>() {}
  );
  if (segments == null || segments.size() == 0) {
    throw new ISE("No segments found to read");
  }
  logger.info("segments to read [%s]", segmentsStr);
  long maxSize = conf.getLong(CONF_MAX_SPLIT_SIZE, 0);
  if (maxSize < 0) {
    long totalSize = 0;
    for (WindowedDataSegment segment : segments) {
      totalSize += segment.getSegment().getSize();
    }
    int mapTask = ((JobConf) conf).getNumMapTasks();
    if (mapTask > 0) {
      maxSize = totalSize / mapTask;
    }
  }
  if (maxSize > 0) {
    // combining is to happen, let us sort the segments list by size so that they
    // are combined appropriately
    Collections.sort(segments, new Comparator<WindowedDataSegment>() {
      @Override
      public int compare(WindowedDataSegment s1, WindowedDataSegment s2) {
        return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
      }
    });
  }
  List<InputSplit> splits = Lists.newArrayList();
  List<WindowedDataSegment> list = new ArrayList<>();
  long size = 0;
  JobConf dummyConf = new JobConf();
  org.apache.hadoop.mapred.InputFormat fio = supplier.get();
  for (WindowedDataSegment segment : segments) {
    if (size + segment.getSegment().getSize() > maxSize && size > 0) {
      splits.add(toDataSourceSplit(list, fio, dummyConf));
      list = Lists.newArrayList();
      size = 0;
    }
    list.add(segment);
    size += segment.getSegment().getSize();
  }
  if (list.size() > 0) {
    splits.add(toDataSourceSplit(list, fio, dummyConf));
  }
  logger.info("Number of splits [%d]", splits.size());
  return splits;
}
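The loop at the end of getSplits packs segments into splits by accumulated size: a new split is started whenever adding the next segment would push the running total past maxSize. A standalone sketch of that grouping rule, using plain long sizes instead of WindowedDataSegment and no Druid types, is shown below.

import java.util.ArrayList;
import java.util.List;

// Illustrative size-based packing: items accumulate into a batch until the
// next item would exceed maxSize, at which point a new batch is started.
final class SplitPacker {
  static List<List<Long>> pack(List<Long> sizes, long maxSize) {
    List<List<Long>> batches = new ArrayList<>();
    List<Long> current = new ArrayList<>();
    long runningSize = 0;
    for (long size : sizes) {
      // Start a new batch when the current one is non-empty and would overflow.
      if (runningSize + size > maxSize && runningSize > 0) {
        batches.add(current);
        current = new ArrayList<>();
        runningSize = 0;
      }
      current.add(size);
      runningSize += size;
    }
    if (!current.isEmpty()) {
      batches.add(current);
    }
    return batches;
  }
}

Note that with a maxSize of 0 (the default when CONF_MAX_SPLIT_SIZE is unset), every positive-sized segment ends up in its own split, which matches the behaviour of the loop above.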
Use of org.apache.hadoop.mapreduce.InputSplit in project druid by druid-io.
The class DatasourceInputFormatTest, method testGetSplitsUsingDefaultSupplier.
@Test
public void testGetSplitsUsingDefaultSupplier() throws Exception {
  // Use the builtin supplier, reading from the local filesystem, rather than testFormatter.
  final File tmpFile = temporaryFolder.newFile("something:with:colons");
  Files.write("dummy", tmpFile, Charsets.UTF_8);
  final ImmutableList<WindowedDataSegment> mySegments = ImmutableList.of(
      WindowedDataSegment.of(
          new DataSegment(
              "test1",
              Interval.parse("2000/3000"),
              "ver",
              ImmutableMap.<String, Object>of("type", "local", "path", tmpFile.getPath()),
              ImmutableList.of("host"),
              ImmutableList.of("visited_sum", "unique_hosts"),
              NoneShardSpec.instance(),
              9,
              2
          )
      )
  );
  final JobConf myConfig = new JobConf();
  myConfig.set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, new DefaultObjectMapper().writeValueAsString(mySegments));
  final JobContext myContext = EasyMock.createMock(JobContext.class);
  EasyMock.expect(myContext.getConfiguration()).andReturn(myConfig);
  EasyMock.replay(myContext);
  final List<InputSplit> splits = new DatasourceInputFormat().getSplits(myContext);
  Assert.assertEquals(1, splits.size());
  final DatasourceInputSplit theSplit = (DatasourceInputSplit) Iterables.getOnlyElement(splits);
  Assert.assertEquals(mySegments.get(0).getSegment().getSize(), theSplit.getLength());
  Assert.assertEquals(mySegments, theSplit.getSegments());
  Assert.assertArrayEquals(new String[] { "localhost" }, theSplit.getLocations());
}
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class FileInputFormat, method getSplits.
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  StopWatch sw = new StopWatch().start();
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);
  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<FileStatus> files = listStatus(job);
  for (FileStatus file : files) {
    Path path = file.getPath();
    long length = file.getLen();
    if (length != 0) {
      BlockLocation[] blkLocations;
      if (file instanceof LocatedFileStatus) {
        blkLocations = ((LocatedFileStatus) file).getBlockLocations();
      } else {
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        blkLocations = fs.getFileBlockLocations(file, 0, length);
      }
      if (isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(makeSplit(path, length - bytesRemaining, splitSize,
              blkLocations[blkIndex].getHosts(),
              blkLocations[blkIndex].getCachedHosts()));
          bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
              blkLocations[blkIndex].getHosts(),
              blkLocations[blkIndex].getCachedHosts()));
        }
      } else {
        // not splittable
        if (LOG.isDebugEnabled()) {
          // Log only if the file is big enough to be split
          if (length > Math.min(file.getBlockSize(), minSize)) {
            LOG.debug("File is not splittable so no parallelization "
                + "is possible: " + file.getPath());
          }
        }
        splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
            blkLocations[0].getCachedHosts()));
      }
    } else {
      // Create empty hosts array for zero length files
      splits.add(makeSplit(path, 0, length, new String[0]));
    }
  }
  // Save the number of input files for metrics/loadgen
  job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Total # of splits generated by getSplits: " + splits.size()
        + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
  }
  return splits;
}
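Two details drive the split boundaries above: computeSplitSize clamps the block size between the configured minimum and maximum (Math.max(minSize, Math.min(maxSize, blockSize))), and the SPLIT_SLOP factor (1.1 in FileInputFormat) lets the last chunk of a file grow up to 10% beyond splitSize rather than producing a tiny trailing split. The self-contained sketch below reproduces that arithmetic with illustrative numbers; it mirrors the Hadoop logic but is not the Hadoop class itself.

// Worked example of the split sizing rules, with made-up file and block sizes.
public class SplitSizingExample {
  private static final double SPLIT_SLOP = 1.1;

  // Same formula FileInputFormat uses for the target split size.
  static long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
  }

  public static void main(String[] args) {
    long blockSize = 128L * 1024 * 1024;  // 128 MB block
    long minSize = 1L;
    long maxSize = Long.MAX_VALUE;
    long splitSize = computeSplitSize(blockSize, minSize, maxSize); // 128 MB

    long length = 135L * 1024 * 1024;     // a 135 MB file
    int splits = 0;
    long bytesRemaining = length;
    while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
      splits++;
      bytesRemaining -= splitSize;
    }
    if (bytesRemaining != 0) {
      splits++;
    }
    // 135 MB / 128 MB ≈ 1.05 <= 1.1, so the file becomes a single 135 MB split
    // instead of a 128 MB split plus a 7 MB remainder.
    System.out.println("splits = " + splits); // prints 1
  }
}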
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class NLineInputFormat, method getSplits.
/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see FileInputFormat#getSplits(JobContext)
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  List<InputSplit> splits = new ArrayList<InputSplit>();
  int numLinesPerSplit = getNumLinesPerSplit(job);
  for (FileStatus status : listStatus(job)) {
    splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
  }
  return splits;
}
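For context, a job opts into this behaviour by selecting NLineInputFormat and setting the lines-per-split value. The driver sketch below shows the typical wiring; the class name and input path are placeholders, the 100-line setting is just an example, and mapper/reducer/output configuration is omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

// Minimal driver sketch showing how NLineInputFormat is usually configured.
public class NLineJobDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "nline-example");
    job.setJarByClass(NLineJobDriver.class);

    // Each split will contain at most 100 input lines, so each map task
    // processes 100 lines regardless of file or block boundaries.
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setNumLinesPerSplit(job, 100);
    NLineInputFormat.addInputPath(job, new Path(args[0]));

    // Mapper, reducer, and output settings omitted for brevity.
  }
}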