Search in sources:

Example 1 with BSONSplitter

use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.

In the class BSONFileInputFormat (mapred API), the method getSplits:

@Override
public FileSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
    // Builds input splits for every input BSON file.
    //
    // Compressed (non-splittable) files become a single split covering the
    // whole file. Splittable files reuse a pre-computed ".splits" file when
    // one exists, otherwise the file is scanned to find document boundaries.
    //
    // Note: numSplits is accepted for API compatibility but the actual split
    // count is driven entirely by BSONSplitter.
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    FileStatus[] inputFiles = listStatus(job);
    List<FileSplit> results = new ArrayList<FileSplit>();
    for (FileStatus file : inputFiles) {
        FileSystem fs = FileSystem.get(file.getPath().toUri(), job);
        if (!isSplitable(fs, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so " + "cannot be split.");
            org.apache.hadoop.mapreduce.lib.input.FileSplit delegate =
                splitter.createFileSplit(file, fs, 0L, file.getLen());
            BSONFileSplit wholeFileSplit = new BSONFileSplit(
                delegate.getPath(), delegate.getStart(),
                delegate.getLength(), delegate.getLocations());
            // Fix: propagate the configured input key here as well, matching
            // the splittable-file branch below. Previously the key field was
            // dropped for compressed inputs.
            wholeFileSplit.setKeyField(MongoConfigUtil.getInputKey(job));
            results.add(wholeFileSplit);
            continue;
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), job);
        try {
            // Prefer previously materialized splits over rescanning the file.
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        // Wrap each new-API split in an old-API BSONFileSplit, carrying the
        // configured key field along.
        for (org.apache.hadoop.mapreduce.lib.input.FileSplit split : splitter.getAllSplits()) {
            BSONFileSplit fsplit = new BSONFileSplit(split.getPath(), split.getStart(), split.getLength(), split.getLocations());
            fsplit.setKeyField(MongoConfigUtil.getInputKey(job));
            results.add(fsplit);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(format("Total of %d found.", results.size()));
    }
    return results.toArray(new BSONFileSplit[results.size()]);
}
Also used : BSONSplitter.getSplitsFilePath(com.mongodb.hadoop.splitter.BSONSplitter.getSplitsFilePath) Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) BSONFileSplit(com.mongodb.hadoop.mapred.input.BSONFileSplit) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter) BSONFileSplit(com.mongodb.hadoop.mapred.input.BSONFileSplit) FileSplit(org.apache.hadoop.mapred.FileSplit) FileSystem(org.apache.hadoop.fs.FileSystem)

Example 2 with BSONSplitter

use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.

In the class BSONFileInputFormat (mapred API), the method getRecordReader:

@Override
public RecordReader<NullWritable, BSONWritable> getRecordReader(final InputSplit split, final JobConf job, final Reporter reporter) throws IOException {
    // Returns a reader positioned at the first whole BSON document of the
    // given split.
    final FileSplit bsonSplit = (FileSplit) split;
    final FileSystem splitFs = FileSystem.get(bsonSplit.getPath().toUri(), job);
    final boolean startsOnDocument =
        split instanceof BSONFileSplit || !isSplitable(splitFs, bsonSplit.getPath());
    if (startsOnDocument) {
        // Either BSONSplitter produced this split, or the file cannot be
        // split and is read whole — no boundary search needed.
        BSONFileRecordReader wholeReader = new BSONFileRecordReader();
        wholeReader.initialize(split, job);
        return wholeReader;
    }
    // Arbitrary byte-range split: use BSONSplitter to find the first
    // document at or after the split's start offset.
    BSONSplitter boundaryFinder = new BSONSplitter();
    boundaryFinder.setConf(job);
    boundaryFinder.setInputPath(bsonSplit.getPath());
    org.apache.hadoop.mapreduce.lib.input.FileSplit newApiSplit =
        new org.apache.hadoop.mapreduce.lib.input.FileSplit(
            bsonSplit.getPath(), bsonSplit.getStart(),
            bsonSplit.getLength(), bsonSplit.getLocations());
    final long firstDocumentStart = boundaryFinder.getStartingPositionForSplit(newApiSplit);
    BSONFileRecordReader positionedReader = new BSONFileRecordReader(firstDocumentStart);
    positionedReader.initialize(bsonSplit, job);
    return positionedReader;
}
Also used : BSONFileSplit(com.mongodb.hadoop.mapred.input.BSONFileSplit) BSONFileSplit(com.mongodb.hadoop.mapred.input.BSONFileSplit) FileSplit(org.apache.hadoop.mapred.FileSplit) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter) BSONFileRecordReader(com.mongodb.hadoop.mapred.input.BSONFileRecordReader) FileSystem(org.apache.hadoop.fs.FileSystem)

Example 3 with BSONSplitter

use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.

In the class BSONFileInputFormat (mapreduce API), the method getSplits:

@Override
public List<FileSplit> getSplits(final JobContext context) throws IOException {
    // Computes input splits for every BSON input file, honoring any
    // configured PathFilter and treating compressed files as single
    // whole-file splits.
    final Configuration conf = context.getConfiguration();
    final PathFilter filter = getInputPathFilter(context);
    final BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(conf);
    final ArrayList<FileSplit> allSplits = new ArrayList<FileSplit>();
    for (FileStatus file : listStatus(context)) {
        final Path filePath = file.getPath();
        if (filter != null && !filter.accept(filePath)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("skipping file %s not matched path filter.", file.getPath()));
            }
            continue;
        }
        if (!isSplitable(context, filePath)) {
            // Compressed input: one split spanning the entire file.
            LOG.info("File " + file.getPath() + " is compressed so cannot be split.");
            allSplits.add(splitter.createFileSplit(file, FileSystem.get(filePath.toUri(), conf), 0L, file.getLen()));
            continue;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("processing file " + file.getPath());
        }
        splitter.setInputPath(filePath);
        final Path splitsFile = getSplitsFilePath(filePath, conf);
        try {
            // Prefer previously materialized splits over rescanning the file.
            splitter.loadSplitsFromSplitFile(file, splitsFile);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("No split file for %s; building split file", file.getPath()));
            }
            // No cached splits: scan the BSON file to compute them.
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        allSplits.addAll(splitter.getAllSplits());
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Total of %d found.", allSplits.size()));
    }
    return allSplits;
}
Also used : BSONSplitter.getSplitsFilePath(com.mongodb.hadoop.splitter.BSONSplitter.getSplitsFilePath) Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit)

Example 4 with BSONSplitter

use of com.mongodb.hadoop.splitter.BSONSplitter in project mongo-hadoop by mongodb.

In the class BSONFileInputFormat (mapreduce API), the method createRecordReader:

@Override
public RecordReader createRecordReader(final InputSplit split, final TaskAttemptContext context) throws IOException, InterruptedException {
    // A BSONFileSplit was produced by BSONSplitter and already starts at a
    // whole document, so no boundary search is needed.
    if (split instanceof BSONFileSplit) {
        return new BSONFileRecordReader();
    }
    // Otherwise locate the first complete document within the split's byte
    // range and start the reader there.
    final FileSplit byteRangeSplit = (FileSplit) split;
    final BSONSplitter boundaryFinder = new BSONSplitter();
    boundaryFinder.setConf(context.getConfiguration());
    boundaryFinder.setInputPath(byteRangeSplit.getPath());
    final long firstDocumentStart = boundaryFinder.getStartingPositionForSplit(byteRangeSplit);
    return new BSONFileRecordReader(firstDocumentStart);
}
Also used : BSONFileRecordReader(com.mongodb.hadoop.input.BSONFileRecordReader) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) BSONSplitter(com.mongodb.hadoop.splitter.BSONSplitter)

Aggregations

BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter)4 BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit)2 BSONFileSplit (com.mongodb.hadoop.mapred.input.BSONFileSplit)2 BSONSplitter.getSplitsFilePath (com.mongodb.hadoop.splitter.BSONSplitter.getSplitsFilePath)2 ArrayList (java.util.ArrayList)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 Path (org.apache.hadoop.fs.Path)2 FileSplit (org.apache.hadoop.mapred.FileSplit)2 FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit)2 BSONFileRecordReader (com.mongodb.hadoop.input.BSONFileRecordReader)1 BSONFileRecordReader (com.mongodb.hadoop.mapred.input.BSONFileRecordReader)1 Configuration (org.apache.hadoop.conf.Configuration)1 PathFilter (org.apache.hadoop.fs.PathFilter)1