Use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.
The class BSONFileInputFormat, method getSplits (old mapred API).
@Override
public FileSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    FileStatus[] inputFiles = listStatus(job);
    List<FileSplit> results = new ArrayList<FileSplit>();
    for (FileStatus file : inputFiles) {
        FileSystem fs = FileSystem.get(file.getPath().toUri(), job);
        if (!isSplitable(fs, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so cannot be split.");
            // A compressed file gets a single split covering the whole file.
            org.apache.hadoop.mapreduce.lib.input.FileSplit delegate =
                splitter.createFileSplit(file, fs, 0L, file.getLen());
            results.add(new BSONFileSplit(delegate.getPath(), delegate.getStart(),
                delegate.getLength(), delegate.getLocations()));
            continue;
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), job);
        try {
            // Prefer precomputed splits from an existing split file.
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        // Convert the new-API splits produced by BSONSplitter into old-API BSONFileSplits.
        for (org.apache.hadoop.mapreduce.lib.input.FileSplit split : splitter.getAllSplits()) {
            BSONFileSplit fsplit = new BSONFileSplit(split.getPath(), split.getStart(),
                split.getLength(), split.getLocations());
            fsplit.setKeyField(MongoConfigUtil.getInputKey(job));
            results.add(fsplit);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(format("Total of %d splits found.", results.size()));
    }
    return results.toArray(new BSONFileSplit[results.size()]);
}
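
For context, a minimal driver sketch showing how this old-API getSplits is exercised through normal job submission. The class name com.mongodb.hadoop.mapred.BSONFileInputFormat is inferred from the mapred signatures above; the driver class name and the path arguments are placeholders, not part of the original source.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import com.mongodb.hadoop.mapred.BSONFileInputFormat;

public class BsonDumpDriver {
    public static void main(final String[] args) throws Exception {
        JobConf job = new JobConf();
        job.setJobName("bson-dump-read");
        // Job planning invokes getSplits(...) above: one whole-file split per
        // compressed input, BSONSplitter-derived splits for everything else.
        job.setInputFormat(BSONFileInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));   // placeholder input
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  // placeholder output
        JobClient.runJob(job);
    }
}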
Use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.
The class BSONFileInputFormat, method getRecordReader (old mapred API).
@Override
public RecordReader<NullWritable, BSONWritable> getRecordReader(final InputSplit split, final JobConf job,
                                                                final Reporter reporter) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
    if (split instanceof BSONFileSplit || !isSplitable(fs, fileSplit.getPath())) {
        // The split either starts at a whole document (it came from BSONSplitter)
        // or covers a whole compressed file, so the reader can start at offset 0.
        BSONFileRecordReader reader = new BSONFileRecordReader();
        reader.initialize(split, job);
        return reader;
    }
    // Split was not created by BSONSplitter; locate the first whole document.
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    splitter.setInputPath(fileSplit.getPath());
    org.apache.hadoop.mapreduce.lib.input.FileSplit newStyleFileSplit =
        new org.apache.hadoop.mapreduce.lib.input.FileSplit(
            fileSplit.getPath(), fileSplit.getStart(), fileSplit.getLength(), fileSplit.getLocations());
    long start = splitter.getStartingPositionForSplit(newStyleFileSplit);
    BSONFileRecordReader reader = new BSONFileRecordReader(start);
    reader.initialize(fileSplit, job);
    return reader;
}
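
To make the reader's contract concrete, here is a hedged sketch of driving the returned reader by hand; inputFormat, split, and job are assumed to exist in the caller's scope, and Reporter.NULL is Hadoop's built-in no-op reporter.

// Assumption: `inputFormat` is a configured instance of this class and
// `split` is one of the FileSplits produced by getSplits(job, n).
RecordReader<NullWritable, BSONWritable> reader =
    inputFormat.getRecordReader(split, job, Reporter.NULL);
NullWritable key = reader.createKey();
BSONWritable value = reader.createValue();
while (reader.next(key, value)) {
    // Each value wraps one decoded BSON document.
    System.out.println(value.getDoc());
}
reader.close();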
Use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.
The class BSONFileInputFormat, method getSplits (new mapreduce API).
@Override
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    Configuration config = context.getConfiguration();
    PathFilter pf = getInputPathFilter(context);
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(config);
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    List<FileStatus> inputFiles = listStatus(context);
    for (FileStatus file : inputFiles) {
        if (pf != null && !pf.accept(file.getPath())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Skipping file %s; it does not match the path filter.", file.getPath()));
            }
            continue;
        } else if (!isSplitable(context, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so cannot be split.");
            // A compressed file gets a single split covering the whole file.
            splits.add(splitter.createFileSplit(
                file, FileSystem.get(file.getPath().toUri(), config), 0L, file.getLen()));
            continue;
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("Processing file " + file.getPath());
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), config);
        try {
            // Prefer precomputed splits from an existing split file.
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        splits.addAll(splitter.getAllSplits());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d splits found.", splits.size()));
    }
    return splits;
}
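
A hedged setup sketch for the new-API path: the job below would trigger this getSplits at submission time. The class name com.mongodb.hadoop.BSONFileInputFormat is inferred from the mapreduce signatures above; the driver class name and path arguments are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.mongodb.hadoop.BSONFileInputFormat;

public class BsonJobSetup {
    public static void main(final String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "bson-read");
        // Submission calls getSplits(context) above; a configured path filter
        // is honored before any file is considered for splitting.
        job.setInputFormatClass(BSONFileInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // placeholder input
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // placeholder output
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}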
Use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.
The class BSONFileInputFormat, method createRecordReader (new mapreduce API).
@Override
public RecordReader<NullWritable, BSONWritable> createRecordReader(final InputSplit split,
        final TaskAttemptContext context) throws IOException, InterruptedException {
    if (split instanceof BSONFileSplit) {
        // Split was created by BSONSplitter and starts at a whole document.
        return new BSONFileRecordReader();
    }
    // Split was not created by BSONSplitter, so we need to find the
    // first document boundary before iterating.
    FileSplit fileSplit = (FileSplit) split;
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(context.getConfiguration());
    splitter.setInputPath(fileSplit.getPath());
    return new BSONFileRecordReader(splitter.getStartingPositionForSplit(fileSplit));
}
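
Since the readers above emit NullWritable keys and BSONWritable values, a mapper consuming this input format could look like the following sketch; the "name" field is an illustrative assumption about the documents, not something the original source specifies.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import com.mongodb.hadoop.io.BSONWritable;

public class BsonFieldCountMapper extends Mapper<NullWritable, BSONWritable, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(final NullWritable key, final BSONWritable value, final Context context)
            throws IOException, InterruptedException {
        // value.getDoc() exposes the decoded document as a BSONObject;
        // "name" is a placeholder field used only for illustration.
        Object name = value.getDoc().get("name");
        if (name != null) {
            context.write(new Text(name.toString()), ONE);
        }
    }
}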