Search in sources :

Example 1 with MongoInputSplit

use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.

the class SingleMongoSplitter method calculateSplits.

/**
 * Produces exactly one split, built from this splitter's configuration.
 * When debug logging is enabled, the input namespace and hosts are logged first.
 *
 * @return a single-element list holding one {@link MongoInputSplit}
 */
@Override
public List<InputSplit> calculateSplits() {
    if (LOG.isDebugEnabled()) {
        // The URI is only looked up when it is actually going to be logged.
        final MongoClientURI uri = MongoConfigUtil.getInputURI(getConfiguration());
        final String message = format("SingleMongoSplitter calculating splits for namespace: %s.%s; hosts: %s", uri.getDatabase(), uri.getCollection(), uri.getHosts());
        LOG.debug(message);
    }
    final InputSplit onlySplit = new MongoInputSplit(getConfiguration());
    return Collections.singletonList(onlySplit);
}
Also used : MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) MongoClientURI(com.mongodb.MongoClientURI)

Example 2 with MongoInputSplit

use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.

the class StandaloneMongoSplitter method calculateSplits.

/**
 * Calculates input splits by running the {@code splitVector} command on the
 * input collection's namespace and turning the returned boundary keys into
 * consecutive splits.
 *
 * <p>If the command is reported as unrecognized, or its result is not OK, the
 * collection stats are consulted; when they name a "primary" shard, the same
 * command is retried against that shard's host(s).</p>
 *
 * @return the calculated splits, passed through {@code filterEmptySplits}
 *         when empty-split filtering is enabled in the configuration
 * @throws SplitFailedException if the command result contains {@code $err},
 *         if no OK result could be obtained from either attempt, or if the
 *         result carries no {@code splitKeys}
 * @throws MongoException if the initial command fails for any reason other
 *         than {@code splitVector} being unrecognized
 */
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    final DBObject splitKey = MongoConfigUtil.getInputSplitKey(getConfiguration());
    final DBObject splitKeyMax = MongoConfigUtil.getMaxSplitKey(getConfiguration());
    final DBObject splitKeyMin = MongoConfigUtil.getMinSplitKey(getConfiguration());
    final int splitSize = MongoConfigUtil.getSplitSize(getConfiguration());
    final MongoClientURI inputURI;
    DBCollection inputCollection = null;
    final ArrayList<InputSplit> returnVal;
    try {
        inputURI = MongoConfigUtil.getInputURI(getConfiguration());
        MongoClientURI authURI = MongoConfigUtil.getAuthURI(getConfiguration());
        if (authURI != null) {
            inputCollection = MongoConfigUtil.getCollectionWithAuth(inputURI, authURI);
        } else {
            inputCollection = MongoConfigUtil.getCollection(inputURI);
        }
        returnVal = new ArrayList<InputSplit>();
        final String ns = inputCollection.getFullName();
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Running splitVector on namespace: %s.%s; hosts: %s", inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
        }
        final DBObject cmd = BasicDBObjectBuilder.start("splitVector", ns).add("keyPattern", splitKey).add("min", splitKeyMin).add("max", splitKeyMax).add("force", false).add("maxChunkSize", splitSize).get();
        CommandResult data;
        boolean ok = true;
        try {
            data = inputCollection.getDB().getSisterDB(inputURI.getDatabase()).command(cmd, ReadPreference.primary());
        } catch (final MongoException e) {
            // 2.0 servers throw exceptions rather than info in a CommandResult
            data = null;
            LOG.info(e.getMessage(), e);
            // Guard against a null message so the original exception is
            // rethrown rather than masked by a NullPointerException.
            final String message = e.getMessage();
            if (message != null && message.contains("unrecognized command: splitVector")) {
                ok = false;
            } else {
                throw e;
            }
        }
        if (data != null) {
            if (data.containsField("$err")) {
                throw new SplitFailedException("Error calculating splits: " + data);
            } else if (!Double.valueOf(1.0).equals(data.get("ok"))) {
                // Null-safe form of the "ok == 1.0" check: a missing "ok"
                // field counts as a failed command.
                ok = false;
            }
        }
        if (!ok) {
            // Fall back to running the command on the collection's primary shard.
            final CommandResult stats = inputCollection.getStats();
            if (stats.containsField("primary")) {
                final DBCursor shards = inputCollection.getDB().getSisterDB("config").getCollection("shards").find(new BasicDBObject("_id", stats.getString("primary")));
                try {
                    if (shards.hasNext()) {
                        final DBObject shard = shards.next();
                        // Strip the "<shard id>/" prefix from the host string.
                        final String host = ((String) shard.get("host")).replace(shard.get("_id") + "/", "");
                        final MongoClientURI shardHost;
                        if (authURI != null) {
                            shardHost = new MongoClientURIBuilder(authURI).host(host).build();
                        } else {
                            shardHost = new MongoClientURIBuilder(inputURI).host(host).build();
                        }
                        MongoClient shardClient = null;
                        try {
                            shardClient = new MongoClient(shardHost);
                            data = shardClient.getDB(shardHost.getDatabase()).command(cmd, ReadPreference.primary());
                        } catch (final Exception e) {
                            // Best effort: the failure is logged here and
                            // reported below through the result check.
                            LOG.error(e.getMessage(), e);
                        } finally {
                            if (shardClient != null) {
                                shardClient.close();
                            }
                        }
                    }
                } finally {
                    shards.close();
                }
            }
            // Without this null check a missing result would surface later
            // as a NullPointerException instead of a SplitFailedException.
            if (data == null) {
                throw new SplitFailedException("Unable to calculate input splits: no result was obtained from the splitVector command");
            }
            if (!Double.valueOf(1.0).equals(data.get("ok"))) {
                throw new SplitFailedException("Unable to calculate input splits: " + data.get("errmsg"));
            }
        }
        // Comes in a format where "min" and "max" are implicit
        // and each entry is just a boundary key; not ranged
        final BasicDBList splitData = (BasicDBList) data.get("splitKeys");
        if (splitData == null) {
            throw new SplitFailedException("Unable to calculate input splits: splitVector result contains no splitKeys: " + data);
        }
        if (splitData.size() == 0) {
            LOG.warn("WARNING: No Input Splits were calculated by the split code. Proceeding with a *single* split. Data may be too" + " small, try lowering 'mongo.input.split_size' if this is undesirable.");
        }
        // Lower boundary of the first min split
        BasicDBObject lastKey = null;
        // If splitKeyMin was given, use it as first boundary.
        if (!splitKeyMin.toMap().isEmpty()) {
            lastKey = new BasicDBObject(splitKeyMin.toMap());
        }
        // Each boundary key closes the previous split and opens the next one.
        for (final Object aSplitData : splitData) {
            final BasicDBObject currentKey = (BasicDBObject) aSplitData;
            returnVal.add(createSplitFromBounds(lastKey, currentKey));
            lastKey = currentKey;
        }
        BasicDBObject maxKey = null;
        // If splitKeyMax was given, use it as last boundary.
        if (!splitKeyMax.toMap().isEmpty()) {
            maxKey = new BasicDBObject(splitKeyMax.toMap());
        }
        // Last max split
        final MongoInputSplit lastSplit = createSplitFromBounds(lastKey, maxKey);
        returnVal.add(lastSplit);
    } finally {
        // Always release the client that backs the input collection.
        if (inputCollection != null) {
            MongoConfigUtil.close(inputCollection.getDB().getMongo());
        }
    }
    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(returnVal);
    }
    return returnVal;
}
Also used : MongoException(com.mongodb.MongoException) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) MongoClientURI(com.mongodb.MongoClientURI) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) MongoException(com.mongodb.MongoException) CommandResult(com.mongodb.CommandResult) DBCollection(com.mongodb.DBCollection) BasicDBObject(com.mongodb.BasicDBObject) MongoClient(com.mongodb.MongoClient) BasicDBList(com.mongodb.BasicDBList) DBCursor(com.mongodb.DBCursor) MongoClientURIBuilder(com.mongodb.hadoop.util.MongoClientURIBuilder) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit)

Example 3 with MongoInputSplit

use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.

the class MongoCollectionSplitter method createRangeQuerySplit.

/**
 * Creates an instance of {@link MongoInputSplit} whose upper and lower
 * bounds are restricted by adding $gte/$lt clauses to the query
 * filter. This requires that the boundaries are not compound keys, and that
 * the query does not contain any keys used in the split key.
 *
 * <p>When both bounds are {@code null}, the split simply carries a copy of
 * {@code query} with no range clause added.</p>
 *
 * @param chunkLowerBound the lower bound of the chunk (min), or {@code null} for none
 * @param chunkUpperBound the upper bound of the chunk (max), or {@code null} for none
 * @param query a query filtering the documents within the split
 * @return a MongoInputSplit from a range query
 * @throws IllegalArgumentException if the query conflicts with the chunk
 *         bounds, or if either of the bounds is a compound key.
 */
public MongoInputSplit createRangeQuerySplit(final BasicDBObject chunkLowerBound, final BasicDBObject chunkUpperBound, final BSONObject query) {
    // A split without boundaries.
    if (chunkLowerBound == null && chunkUpperBound == null) {
        DBObject splitQuery = new BasicDBObject();
        splitQuery.putAll(query);
        MongoInputSplit split = new MongoInputSplit(getConfiguration());
        split.setQuery(splitQuery);
        return split;
    }
    // The boundaries are not empty, so try to build a split using $gte/$lt.
    // First check that the split contains no compound keys.
    // e.g. this is valid: { _id : "foo" }
    // but this is not {_id : "foo", name : "bar"}
    // A compound key on EITHER side is rejected: silently dropping it would
    // leave the range open on that side and make the split cover documents
    // outside the chunk.
    final boolean lowerIsCompound = chunkLowerBound != null && chunkLowerBound.keySet().size() > 1;
    final boolean upperIsCompound = chunkUpperBound != null && chunkUpperBound.keySet().size() > 1;
    if (lowerIsCompound || upperIsCompound) {
        throw new IllegalArgumentException("Range query is enabled but one or more split boundaries contains a compound key:\n" + "min:  " + chunkLowerBound + "\nmax:  " + chunkUpperBound);
    }
    Entry<String, Object> minKey = chunkLowerBound != null && chunkLowerBound.keySet().size() == 1 ? chunkLowerBound.entrySet().iterator().next() : null;
    Entry<String, Object> maxKey = chunkUpperBound != null && chunkUpperBound.keySet().size() == 1 ? chunkUpperBound.entrySet().iterator().next() : null;
    // Neither bound yields a usable single key (kept from the original
    // behavior, including its message, for bounds that have no keys at all).
    if (minKey == null && maxKey == null) {
        throw new IllegalArgumentException("Range query is enabled but one or more split boundaries contains a compound key:\n" + "min:  " + chunkLowerBound + "\nmax:  " + chunkUpperBound);
    }
    // Next check that the split key does not overlap with the query: the range
    // clause below is stored under the split key and would replace any
    // condition the query already has on that field.
    if (minKey != null && query.containsField(minKey.getKey()) || maxKey != null && query.containsField(maxKey.getKey())) {
        throw new IllegalArgumentException("Range query is enabled but split key conflicts with query filter:\n" + "min:  " + chunkLowerBound + "\nmax:  " + chunkUpperBound + "\nquery:  " + query);
    }
    String key = null;
    BasicDBObject rangeObj = new BasicDBObject();
    if (minKey != null) {
        key = minKey.getKey();
        rangeObj.put("$gte", minKey.getValue());
    }
    if (maxKey != null) {
        key = maxKey.getKey();
        rangeObj.put("$lt", maxKey.getValue());
    }
    // Combine the caller's filter with the range restriction on the split key.
    DBObject splitQuery = new BasicDBObject();
    splitQuery.putAll(query);
    splitQuery.put(key, rangeObj);
    MongoInputSplit split = new MongoInputSplit(getConfiguration());
    split.setQuery(splitQuery);
    return split;
}
Also used : BasicDBObject(com.mongodb.BasicDBObject) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) BasicDBObject(com.mongodb.BasicDBObject) BSONObject(org.bson.BSONObject) DBObject(com.mongodb.DBObject) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject)

Example 4 with MongoInputSplit

use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.

the class ShardMongoSplitter method calculateSplits.

// Every shard becomes exactly one split.
/**
 * Builds one unbounded split per entry of the shards map, pointing each
 * split's input URI at that shard's hosts.
 *
 * @return the per-shard splits, passed through {@code filterEmptySplits}
 *         when empty-split filtering is enabled in the configuration
 * @throws SplitFailedException declared by the signature; not thrown directly in this method
 */
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    final ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    final MongoClientURI baseURI = MongoConfigUtil.getInputURI(getConfiguration());
    try {
        final Map<String, List<String>> hostsByShard = getShardsMap();
        // Only the host lists are needed; the shard names are not used.
        for (final List<String> hosts : hostsByShard.values()) {
            final MongoInputSplit shardSplit = createSplitFromBounds(null, null);
            shardSplit.setInputURI(rewriteURI(baseURI, hosts));
            splits.add(shardSplit);
        }
    } finally {
        // getShardsMap() opens a client to a config server; release it here.
        MongoConfigUtil.close(getConfigDB().getMongo());
    }
    if (!MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return splits;
    }
    return filterEmptySplits(splits);
}
Also used : MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) MongoClientURI(com.mongodb.MongoClientURI) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit)

Example 5 with MongoInputSplit

use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.

the class MongoPaginatingSplitterTest method testQuery.

/**
 * Checks the splits produced by {@code MongoPaginatingSplitter} when range
 * queries are enabled and the input query is
 * {@code value < 25000 OR value >= 31000}.
 */
@Test
public void testQuery() throws SplitFailedException {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, uri);
    MongoConfigUtil.setRangeQueryEnabled(conf, true);
    MongoConfigUtil.setInputSplitMinDocs(conf, 5000);

    // The two branches of the $or filter.
    final BasicDBObject belowGap = new BasicDBObject("value", new BasicDBObject("$lt", 25000));
    final BasicDBObject aboveGap = new BasicDBObject("value", new BasicDBObject("$gte", 31000));
    final DBObject query = new BasicDBObject("$or", new BasicDBObject[] { belowGap, aboveGap });
    MongoConfigUtil.setQuery(conf, query);

    final List<InputSplit> splits = new MongoPaginatingSplitter(conf).calculateSplits();
    assertEquals(7, splits.size());

    // Consecutive entries are the expected (min, max) of each split; null
    // marks an open end.
    final Integer[] boundaries = { null, 5000, 10000, 15000, 20000, 31000, 36000, null };
    for (int i = 0; i + 1 < boundaries.length; i++) {
        assertSplitRange((MongoInputSplit) splits.get(i), boundaries[i], boundaries[i + 1]);
    }

    // 6000 documents excluded by query.
    assertSplitsCount(collection.count() - 6000, splits);
}
Also used : BasicDBObject(com.mongodb.BasicDBObject) Configuration(org.apache.hadoop.conf.Configuration) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) Test(org.junit.Test)

Aggregations

MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit)21 Test (org.junit.Test)13 BasicDBObject (com.mongodb.BasicDBObject)12 Configuration (org.apache.hadoop.conf.Configuration)12 InputSplit (org.apache.hadoop.mapreduce.InputSplit)11 DBObject (com.mongodb.DBObject)7 MongoClientURI (com.mongodb.MongoClientURI)5 BasicDBList (com.mongodb.BasicDBList)3 BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest)3 DBCollection (com.mongodb.DBCollection)2 MongoClient (com.mongodb.MongoClient)2 MongoClientURIBuilder (com.mongodb.hadoop.util.MongoClientURIBuilder)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 BSONObject (org.bson.BSONObject)2 CommandResult (com.mongodb.CommandResult)1 DBCursor (com.mongodb.DBCursor)1 MongoException (com.mongodb.MongoException)1 MongoRecordReader (com.mongodb.hadoop.input.MongoRecordReader)1 MongoRecordReader (com.mongodb.hadoop.mapred.input.MongoRecordReader)1