use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.
the class BSONFileInputFormat method getSplits.
@Override
public FileSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    FileStatus[] inputFiles = listStatus(job);
    List<FileSplit> results = new ArrayList<FileSplit>();
    for (FileStatus file : inputFiles) {
        FileSystem fs = FileSystem.get(file.getPath().toUri(), job);
        if (!isSplitable(fs, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so cannot be split.");
            org.apache.hadoop.mapreduce.lib.input.FileSplit delegate =
                splitter.createFileSplit(file, fs, 0L, file.getLen());
            results.add(new BSONFileSplit(delegate.getPath(), delegate.getStart(),
                delegate.getLength(), delegate.getLocations()));
            continue;
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), job);
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        for (org.apache.hadoop.mapreduce.lib.input.FileSplit split : splitter.getAllSplits()) {
            BSONFileSplit fsplit = new BSONFileSplit(split.getPath(), split.getStart(),
                split.getLength(), split.getLocations());
            fsplit.setKeyField(MongoConfigUtil.getInputKey(job));
            results.add(fsplit);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(format("Total of %d found.", results.size()));
    }
    return results.toArray(new BSONFileSplit[results.size()]);
}
use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.
the class HiveMongoInputFormat method getSplits.
@Override
public FileSplit[] getSplits(final JobConf conf, final int numSplits) throws IOException {
    try {
        MongoSplitter splitterImpl = MongoSplitterFactory.getSplitter(conf);
        final List<org.apache.hadoop.mapreduce.InputSplit> splits = splitterImpl.calculateSplits();
        InputSplit[] splitIns = splits.toArray(new InputSplit[splits.size()]);
        // Wrap InputSplits in FileSplits so that 'getPath'
        // doesn't produce an error (Hive bug).
        FileSplit[] wrappers = new FileSplit[splitIns.length];
        Path path = new Path(conf.get(MongoStorageHandler.TABLE_LOCATION));
        for (int i = 0; i < wrappers.length; i++) {
            wrappers[i] = new MongoHiveInputSplit(splitIns[i], path);
        }
        return wrappers;
    } catch (SplitFailedException spfe) {
        // The split failed because no namespace was found
        // (so the corresponding collection doesn't exist).
        LOG.error(spfe.getMessage(), spfe);
        throw new IOException(spfe.getMessage(), spfe);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
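The wrapping step exists because some Hive code paths call getPath() on every split and fail when the split is not a FileSplit. The class below is a minimal, hypothetical sketch of that delegation pattern, not the actual MongoHiveInputSplit: the name DelegatingFileSplit is invented for illustration, and Writable serialization of the wrapped split is omitted for brevity.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

// Hypothetical sketch: a mapred FileSplit that reports the table location as its
// path so Hive's getPath() calls succeed, while delegating split metadata to a
// wrapped new-API (mapreduce) InputSplit. write()/readFields() for the delegate
// are omitted here; a real implementation would need them.
public class DelegatingFileSplit extends FileSplit {

    private org.apache.hadoop.mapreduce.InputSplit delegate;

    public DelegatingFileSplit() {
        // No-arg constructor needed for Writable deserialization.
        super((Path) null, 0, 0, (String[]) null);
    }

    public DelegatingFileSplit(final org.apache.hadoop.mapreduce.InputSplit delegate, final Path path) {
        super(path, 0, 0, (String[]) null);
        this.delegate = delegate;
    }

    @Override
    public long getLength() {
        try {
            return delegate.getLength();
        } catch (IOException | InterruptedException e) {
            return 0L;
        }
    }

    @Override
    public String[] getLocations() throws IOException {
        try {
            return delegate.getLocations();
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}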
use of org.apache.hadoop.mapred.FileSplit in project mongo-hadoop by mongodb.
the class BSONFileInputFormatTest method enronEmails.
@Test
public void enronEmails() throws IOException {
    BSONFileInputFormat inputFormat = new BSONFileInputFormat();
    JobConf job = new JobConf();
    String inputDirectory =
        new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson").getAbsoluteFile().toURI().toString();
    // Hadoop 2.X
    job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory);
    // Hadoop 1.2.X
    job.set("mapred.input.dir", inputDirectory);
    FileSplit[] splits = inputFormat.getSplits(job, 5);
    int count = 0;
    BSONWritable writable = new BSONWritable();
    for (FileSplit split : splits) {
        RecordReader<NullWritable, BSONWritable> recordReader = inputFormat.getRecordReader(split, job, null);
        while (recordReader.next(null, writable)) {
            count++;
        }
    }
    assertEquals("There are 501513 messages in the enron corpus", 501513, count);
}
use of org.apache.hadoop.mapred.FileSplit in project drill by apache.
the class HiveMetadataProvider method transformFileSplits.
/**
 * <p>
 * Groups input splits by file path. Each input split group is ordered by starting bytes
 * so that the parts of a file are kept in the correct order.
 * </p>
 * <p>
 * Example:
 * <pre>
 * hdfs:///tmp/table/file_1.txt -> hdfs:///tmp/table/file_1.txt:0+10000
 *                                 hdfs:///tmp/table/file_1.txt:10001+20000
 * hdfs:///tmp/table/file_2.txt -> hdfs:///tmp/table/file_2.txt:0+10000
 * </pre>
 * </p>
 * @param inputSplits input splits
 * @return multimap where the key is a file path and the value is the group of ordered file splits for that file
 */
private Multimap<Path, FileSplit> transformFileSplits(InputSplit[] inputSplits) {
    Multimap<Path, FileSplit> inputSplitGroups = TreeMultimap.create(
        Ordering.natural(), Comparator.comparingLong(FileSplit::getStart));
    for (InputSplit inputSplit : inputSplits) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        inputSplitGroups.put(fileSplit.getPath(), fileSplit);
    }
    return inputSplitGroups;
}
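As a usage illustration, the hedged sketch below shows one way the grouped splits might be consumed; the helper name logSplitCoverage and the SLF4J-style logger are assumptions, but the ordering guarantee comes from the TreeMultimap above: keys iterate in path order and each file's splits iterate by start offset.

// Hypothetical caller: walk files in path order; within a file, splits arrive
// ordered by FileSplit.getStart(), so the byte ranges are visited sequentially.
// Requires java.util.Collection and java.util.Map in addition to the imports
// already used by transformFileSplits.
private void logSplitCoverage(final InputSplit[] inputSplits) {
    Multimap<Path, FileSplit> groups = transformFileSplits(inputSplits);
    for (Map.Entry<Path, Collection<FileSplit>> entry : groups.asMap().entrySet()) {
        long coveredBytes = 0;
        for (FileSplit split : entry.getValue()) {
            coveredBytes += split.getLength();
        }
        logger.debug("{} is covered by {} bytes of splits", entry.getKey(), coveredBytes);
    }
}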