Search in sources :

Example 1 with LazyBSONObject

Use of org.bson.LazyBSONObject in the mongo-hadoop project by mongodb.

The splitFile method of the BSONSplitter class.

/**
 * Calculate the splits for a given input file according to the settings
 * for split size only. This method does not respect options like
 * {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
 *
 * <p>The file is scanned one BSON document at a time. The current split is
 * closed as soon as adding the next document would make it reach or exceed
 * the split size, so a split boundary always falls between two documents.
 * A split is never emitted with a length of zero.
 *
 * <p>An {@link IOException} raised while scanning documents is logged as a
 * warning and not rethrown; in that case only the splits that had already
 * been completed before the failure are returned.
 *
 * @param file the FileStatus for which to calculate splits.
 * @return a List of the calculated splits.
 *
 * @throws IOException when the FileSystem cannot be obtained, or the file
 *                     cannot be opened or closed
 */
protected List<BSONFileSplit> splitFile(final FileStatus file) throws IOException {
    Path path = file.getPath();
    List<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
    FileSystem fs = path.getFileSystem(getConf());
    long length = file.getLen();
    int numDocsRead = 0;
    long splitSize = getSplitSize(getConf(), file);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
    }
    FSDataInputStream fsDataStream = fs.open(path);
    long curSplitLen = 0;
    long curSplitStart = 0;
    try {
        while (fsDataStream.getPos() + 1 < length) {
            lazyCallback.reset();
            lazyDec.decode(fsDataStream, lazyCallback);
            LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
            int bsonDocSize = bo.getBSONSize();
            // Close the current split before it would reach the size limit.
            // The curSplitLen > 0 guard keeps a first document that is itself
            // at least splitSize from producing an empty split at offset 0.
            if (curSplitLen > 0 && curSplitLen + bsonDocSize >= splitSize) {
                BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
                }
                // The stream is already positioned after the document just
                // decoded, so step back by its size to find where it begins.
                curSplitStart = fsDataStream.getPos() - bsonDocSize;
                curSplitLen = 0;
            }
            curSplitLen += bsonDocSize;
            numDocsRead++;
            // Progress is only computed when it will actually be logged.
            if (numDocsRead % 1000 == 0 && LOG.isDebugEnabled()) {
                float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
                LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.", numDocsRead, file.getPath(), splitProgress));
            }
        }
        // Whatever remains after the last boundary forms the final split.
        if (curSplitLen > 0) {
            BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Completed splits calculation for " + file.getPath());
        }
    } catch (IOException e) {
        // Best effort: keep the splits completed so far, but pass the
        // exception itself to the logger so the stack trace is not lost.
        LOG.warn("IOException: " + e, e);
    } finally {
        fsDataStream.close();
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) LazyBSONObject(org.bson.LazyBSONObject) FileSystem(org.apache.hadoop.fs.FileSystem) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) ArrayList(java.util.ArrayList) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) IOException(java.io.IOException)

Aggregations

BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 LazyBSONObject (org.bson.LazyBSONObject)1