Search in sources:

Example 51 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

In class MongoPaginatingSplitterTest, the method testQuery:

@Test
public void testQuery() throws SplitFailedException {
    // Enable range queries and require at least 5000 documents per split.
    Configuration configuration = new Configuration();
    MongoConfigUtil.setInputURI(configuration, uri);
    MongoConfigUtil.setRangeQueryEnabled(configuration, true);
    MongoConfigUtil.setInputSplitMinDocs(configuration, 5000);
    // Match documents whose value is < 25000 or >= 31000.
    DBObject orQuery = new BasicDBObject("$or", new BasicDBObject[] { new BasicDBObject("value", new BasicDBObject("$lt", 25000)), new BasicDBObject("value", new BasicDBObject("$gte", 31000)) });
    MongoConfigUtil.setQuery(configuration, orQuery);
    List<InputSplit> computed = new MongoPaginatingSplitter(configuration).calculateSplits();
    assertEquals(7, computed.size());
    // Expected (min, max) boundary pairs for each split, in order; null
    // marks an open-ended bound.
    Integer[][] bounds = {
        { null, 5000 }, { 5000, 10000 }, { 10000, 15000 }, { 15000, 20000 },
        { 20000, 31000 }, { 31000, 36000 }, { 36000, null } };
    for (int i = 0; i < bounds.length; i++) {
        assertSplitRange((MongoInputSplit) computed.get(i), bounds[i][0], bounds[i][1]);
    }
    // 6000 documents excluded by query.
    assertSplitsCount(collection.count() - 6000, computed);
}
Also used : BasicDBObject(com.mongodb.BasicDBObject) Configuration(org.apache.hadoop.conf.Configuration) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Test(org.junit.Test)

Example 52 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

In class SampleSplitterTest, the method testAllOnOneSplit:

@Test
public void testAllOnOneSplit() throws SplitFailedException {
    // Skip unless the server supports the $sample aggregation operator.
    assumeTrue(isSampleOperatorSupported(uri));
    Configuration configuration = new Configuration();
    MongoConfigUtil.setInputURI(configuration, uri.getURI());
    // Split size is enough to encapsulate all documents.
    MongoConfigUtil.setSplitSize(configuration, 12);
    splitter.setConfiguration(configuration);
    List<InputSplit> computed = splitter.calculateSplits();
    assertEquals(1, computed.size());
    // A single all-encompassing split carries empty min and max bounds.
    MongoInputSplit only = (MongoInputSplit) computed.get(0);
    assertTrue(only.getMin().toMap().isEmpty());
    assertTrue(only.getMax().toMap().isEmpty());
}
Also used : MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Configuration(org.apache.hadoop.conf.Configuration) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Test(org.junit.Test) BaseHadoopTest(com.mongodb.hadoop.testutils.BaseHadoopTest)

Example 53 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

In class SampleSplitterTest, the method testAlternateSplitKey:

@Test
public void testAlternateSplitKey() throws SplitFailedException {
    // Skip unless the server supports the $sample aggregation operator.
    assumeTrue(isSampleOperatorSupported(uri));
    Configuration configuration = new Configuration();
    MongoConfigUtil.setInputURI(configuration, uri.getURI());
    MongoConfigUtil.setSplitSize(configuration, 1);
    // Split on the "i" field rather than the default key.
    MongoConfigUtil.setInputSplitKeyPattern(configuration, "{\"i\": 1}");
    splitter.setConfiguration(configuration);
    List<InputSplit> computed = splitter.calculateSplits();
    assertEquals(12, computed.size());
    MongoInputSplit head = (MongoInputSplit) computed.get(0);
    assertTrue(head.getMin().toMap().isEmpty());
    MongoInputSplit tail = (MongoInputSplit) computed.get(11);
    assertTrue(tail.getMax().toMap().isEmpty());
    // Ranges for splits are ascending. The final split is excluded from the
    // loop because its max bound is empty (checked above).
    int previousKey = (Integer) head.getMax().get("i");
    for (int index = 1; index < computed.size() - 1; index++) {
        MongoInputSplit current = (MongoInputSplit) computed.get(index);
        int currentKey = (Integer) current.getMax().get("i");
        assertTrue(currentKey > previousKey);
        previousKey = currentKey;
    }
}
Also used : MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Configuration(org.apache.hadoop.conf.Configuration) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Test(org.junit.Test) BaseHadoopTest(com.mongodb.hadoop.testutils.BaseHadoopTest)

Example 54 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

In class StandaloneMongoSplitterTest, the method unshardedCollectionMinMax:

@Test
public void unshardedCollectionMinMax() throws UnknownHostException, SplitFailedException {
    Configuration config = new Configuration();
    // The splitter holds a reference to config, so settings applied after
    // construction are still visible when calculateSplits() runs.
    StandaloneMongoSplitter splitter = new StandaloneMongoSplitter(config);
    MongoConfigUtil.setInputURI(config, uri);
    DBObject inputSplitKey = BasicDBObjectBuilder.start("value", 1).get();
    MongoConfigUtil.setInputSplitKey(config, inputSplitKey);
    MongoConfigUtil.setSplitSize(config, 1);
    // Baseline: splits computed over the entire collection.
    List<InputSplit> regularSplits = splitter.calculateSplits();
    // Constrain the split key range and recompute.
    MongoConfigUtil.setMinSplitKey(config, "{value:100}");
    MongoConfigUtil.setMaxSplitKey(config, "{value:39900}");
    List<InputSplit> inputSplits = splitter.calculateSplits();
    // The comparison is >=, so equal counts are acceptable; the old message
    // ("should be fewer") contradicted the actual check and would mislead
    // anyone reading a failure report.
    assertTrue("should not produce more splits with min/max set", regularSplits.size() >= inputSplits.size());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Test(org.junit.Test)

Example 55 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

In class StandaloneMongoSplitterTest, the method testFilterEmptySplits:

@Test
public void testFilterEmptySplits() throws SplitFailedException {
    Configuration config = new Configuration();
    // Match value < 20000 or value > 35000, leaving a gap in the middle
    // of the collection that some split will fall into.
    DBObject disjunction = new BasicDBObject("$or", new BasicDBObject[] { new BasicDBObject("value", new BasicDBObject("$lt", 20000)), new BasicDBObject("value", new BasicDBObject("$gt", 35000)) });
    MongoConfigUtil.setInputURI(config, uri);
    MongoConfigUtil.setEnableFilterEmptySplits(config, true);
    MongoConfigUtil.setQuery(config, disjunction);
    // 1 MB per document results in 4 splits; the 3rd one is empty per
    // the above query.
    MongoConfigUtil.setSplitSize(config, 1);
    List<InputSplit> splits = new StandaloneMongoSplitter(config).calculateSplits();
    // No splits are empty.
    for (InputSplit split : splits) {
        // Cursor is closed on the split, so copy it to create a new one.
        MongoInputSplit copy = new MongoInputSplit((MongoInputSplit) split);
        assertNotEquals(0, copy.getCursor().itcount());
    }
    assertSplitsCount(collection.count(disjunction), splits);
}
Also used : BasicDBObject(com.mongodb.BasicDBObject) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Configuration(org.apache.hadoop.conf.Configuration) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Test(org.junit.Test)

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit)160 Configuration (org.apache.hadoop.conf.Configuration)70 Test (org.junit.Test)68 ArrayList (java.util.ArrayList)51 Path (org.apache.hadoop.fs.Path)43 Job (org.apache.hadoop.mapreduce.Job)42 TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)38 IOException (java.io.IOException)33 JobContext (org.apache.hadoop.mapreduce.JobContext)20 LongWritable (org.apache.hadoop.io.LongWritable)19 FileSystem (org.apache.hadoop.fs.FileSystem)16 MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl)14 MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit)13 List (java.util.List)13 Text (org.apache.hadoop.io.Text)13 FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit)13 DBObject (com.mongodb.DBObject)10 File (java.io.File)10 TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl)10 BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest)9