Search in sources:

Example 1 with BSONFileSplit

use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.

From the class BSONSplitter, method readSplits.

/**
     * Calculate splits for each file in the input path, sensitive to options such
     * as {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
     * This method always re-calculates the splits and will try to write the
     * splits file.
     *
     * @see #readSplitsForFile
     *
     * @throws IOException when an error occurs reading from the file
     * @throws IllegalStateException if the input path has not been set
     */
public void readSplits() throws IOException {
    // Validate before touching any state: a missing input path must not
    // clobber a previously computed splits list with a fresh empty one.
    if (inputPath == null) {
        throw new IllegalStateException("Input path has not been set.");
    }
    splitsList = new ArrayList<BSONFileSplit>();
    FileSystem fs = inputPath.getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(inputPath);
    // Delegates the actual split calculation (and splits-file write) per file.
    readSplitsForFile(file);
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit)

Example 2 with BSONFileSplit

use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.

From the class BSONSplitter, method splitFile.

/**
     * Calculate the splits for a given input file according to the settings
     * for split size only. This method does not respect options like
     * {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
     *
     * <p>The file is scanned document-by-document with a lazy BSON decoder;
     * documents are accumulated into a split until adding the next one would
     * meet or exceed the configured split size. If an I/O error occurs
     * mid-scan it is logged and the splits gathered so far are returned
     * (best-effort, not rethrown).
     *
     * @param file the FileStatus for which to calculate splits.
     * @return a List of the calculated splits.
     *
     * @throws IOException when an error occurs reading from the FileSystem
     */
protected List<BSONFileSplit> splitFile(final FileStatus file) throws IOException {
    Path path = file.getPath();
    ArrayList<BSONFileSplit> splits = new ArrayList<BSONFileSplit>();
    FileSystem fs = path.getFileSystem(getConf());
    long length = file.getLen();
    int numDocsRead = 0;
    long splitSize = getSplitSize(getConf(), file);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
    }
    FSDataInputStream fsDataStream = fs.open(path);
    // Byte length of the split currently being accumulated, and its start offset.
    long curSplitLen = 0;
    long curSplitStart = 0;
    try {
        // "+ 1" guards against reading when fewer than one byte remains.
        while (fsDataStream.getPos() + 1 < length) {
            lazyCallback.reset();
            // Decodes exactly one BSON document, advancing the stream past it.
            lazyDec.decode(fsDataStream, lazyCallback);
            LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
            int bsonDocSize = bo.getBSONSize();
            if (curSplitLen + bsonDocSize >= splitSize) {
                // Current document would overflow the split: close out the
                // accumulated split *before* counting this document.
                BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
                }
                // The stream is already past this document, so its start is
                // the current position minus its size.
                curSplitStart = fsDataStream.getPos() - bsonDocSize;
                curSplitLen = 0;
            }
            // The just-read document always begins (or continues) the open split.
            curSplitLen += bsonDocSize;
            numDocsRead++;
            if (numDocsRead % 1000 == 0) {
                float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.", numDocsRead, file.getPath(), splitProgress));
                }
            }
        }
        // Flush the trailing split covering any documents after the last cut.
        if (curSplitLen > 0) {
            BSONFileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
            splits.add(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
            }
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug("Completed splits calculation for " + file.getPath());
        }
    } catch (IOException e) {
        // NOTE(review): read errors are swallowed and partial splits returned;
        // looks intentional (best-effort), but callers cannot distinguish a
        // truncated scan from a complete one — confirm this is desired.
        LOG.warn("IOException: " + e);
    } finally {
        fsDataStream.close();
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) LazyBSONObject(org.bson.LazyBSONObject) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) IOException(java.io.IOException)

Example 3 with BSONFileSplit

use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.

From the class BSONSplitter, method readSplitsForFile.

/**
     * Calculate the splits for a given input file, sensitive to options such
     * as {@link com.mongodb.hadoop.util.MongoConfigUtil#BSON_READ_SPLITS bson.split.read_splits}.
     * This method always re-calculates the splits and will try to write the
     * splits file.
     *
     * @param file the FileStatus for which to calculate splits.
     * @throws IOException when an error occurs reading from the FileSystem
     *
     * @see #readSplits
     */
public void readSplitsForFile(final FileStatus file) throws IOException {
    final long fileLength = file.getLen();
    if (!MongoConfigUtil.getBSONReadSplits(getConf())) {
        // Split calculation disabled: expose the whole file as one split.
        LOG.info("Reading splits is disabled - constructing single split for " + file);
        FileSystem fs = file.getPath().getFileSystem(getConf());
        ArrayList<BSONFileSplit> singleSplitList = new ArrayList<BSONFileSplit>();
        singleSplitList.add(createFileSplit(file, fs, 0, fileLength));
        splitsList = singleSplitList;
        return;
    }
    if (fileLength == 0) {
        // Nothing to split; leave any existing splits list untouched.
        LOG.warn("Zero-length file, skipping split calculation.");
        return;
    }
    splitsList = (ArrayList<BSONFileSplit>) splitFile(file);
    writeSplits();
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) ArrayList(java.util.ArrayList)

Example 4 with BSONFileSplit

use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.

From the class BSONFileRecordReaderTest, method testGetCurrentKey.

@Test
public void testGetCurrentKey() throws Exception {
    URI inventoryUri = BSONFileRecordReaderTest.class.getResource("/bookstore-dump/inventory.bson").toURI();
    File inventoryFile = new File(inventoryUri);
    long inventoryLength = inventoryFile.length();
    JobConf jobConf = new JobConf();
    // Default case: "_id" is used as inputKey.
    BSONFileRecordReader recordReader = new BSONFileRecordReader();
    BSONFileSplit fileSplit = new BSONFileSplit(new Path(inventoryUri), 0, inventoryLength, new String[0]);
    recordReader.init(fileSplit, jobConf);
    assertTrue(recordReader.nextKeyValue());
    assertEquals(recordReader.getCurrentKey(), new ObjectId("4d2a6084c6237b412fcd5597"));
    // Use a nested field as inputKey.
    recordReader = new BSONFileRecordReader();
    fileSplit = new BSONFileSplit(new Path(inventoryUri), 0, inventoryLength, new String[0]);
    fileSplit.setKeyField("price.msrp");
    recordReader.init(fileSplit, jobConf);
    assertTrue(recordReader.nextKeyValue());
    assertEquals(recordReader.getCurrentKey(), 33);
    // Use a key within an array as the inputKey.
    recordReader = new BSONFileRecordReader();
    fileSplit = new BSONFileSplit(new Path(inventoryUri), 0, inventoryLength, new String[0]);
    fileSplit.setKeyField("tags.0");
    recordReader.init(fileSplit, jobConf);
    assertTrue(recordReader.nextKeyValue());
    assertEquals(recordReader.getCurrentKey(), "html5");
}
Also used : Path(org.apache.hadoop.fs.Path) BSONFileRecordReader(com.mongodb.hadoop.input.BSONFileRecordReader) ObjectId(org.bson.types.ObjectId) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) URI(java.net.URI) File(java.io.File) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)

Example 5 with BSONFileSplit

use of com.mongodb.hadoop.input.BSONFileSplit in project mongo-hadoop by mongodb.

From the class BSONSplitterTest, method testCreateFileSplitFromBSON.

@Test
public void testCreateFileSplitFromBSON() throws IOException {
    // Build a split descriptor covering the entire file: start "s" at
    // offset 0, length "l" equal to the file's size.
    BasicBSONObject splitSpec = new BasicBSONObject();
    splitSpec.put("s", 0L);
    splitSpec.put("l", file.getLen());
    assertOneSplit(SPLITTER.createFileSplitFromBSON(splitSpec, fs, file));
}
Also used : BasicBSONObject(org.bson.BasicBSONObject) BasicBSONObject(org.bson.BasicBSONObject) BSONObject(org.bson.BSONObject) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) Test(org.junit.Test)

Aggregations

BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit)14 FileSystem (org.apache.hadoop.fs.FileSystem)6 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)5 Test (org.junit.Test)5 Path (org.apache.hadoop.fs.Path)4 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 FileStatus (org.apache.hadoop.fs.FileStatus)3 BSONObject (org.bson.BSONObject)3 BSONFileRecordReader (com.mongodb.hadoop.input.BSONFileRecordReader)2 Configuration (org.apache.hadoop.conf.Configuration)2 BasicBSONObject (org.bson.BasicBSONObject)2 LazyBSONObject (org.bson.LazyBSONObject)2 BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter)1 File (java.io.File)1 URI (java.net.URI)1 Configurable (org.apache.hadoop.conf.Configurable)1 BlockLocation (org.apache.hadoop.fs.BlockLocation)1 CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)1 CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream)1