Search in sources :

Example 21 with MongoInputSplit

use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.

In the class ShardChunkMongoSplitter, the method calculateSplitsFromChunks:

/**
 * Get a list of InputSplits based on a list of MongoDB shard chunks, the shard key, and a
 * mapping of shard names to host names. This is used internally by {@link #calculateSplits()}.
 *
 * <p>Each chunk's "min" and "max" documents become the bounds of one split. The URI each
 * split reads from is chosen as follows:
 * <ul>
 *   <li>if reading from shards is enabled, the input URI rewritten to point at the hosts of
 *       the chunk's shard;</li>
 *   <li>otherwise, if mongos hosts are configured, the input URI rewritten to point at a
 *       mongos whose hostname matches one of the shard's hosts, when such a mongos exists;</li>
 *   <li>otherwise, the configured input URI unchanged.</li>
 * </ul>
 *
 * @param chunks Chunk documents from the config.chunks collection.
 * @param shardsMap A map of shard name -> an array of hostnames.
 * @return A list of InputSplits, passed through {@code filterEmptySplits} when filtering of
 *         empty splits is enabled in the configuration.
 * @throws SplitFailedException if both reading from shards and a list of mongos hosts are
 *         configured, or if reading from shards is enabled and a chunk refers to a shard
 *         that is absent from {@code shardsMap}.
 */
List<InputSplit> calculateSplitsFromChunks(final List<DBObject> chunks, final Map<String, List<String>> shardsMap) throws SplitFailedException {
    boolean targetShards = MongoConfigUtil.canReadSplitsFromShards(getConfiguration());
    List<String> mongosHostNames = MongoConfigUtil.getInputMongosHosts(getConfiguration());
    MongoClientURI inputURI = MongoConfigUtil.getInputURI(getConfiguration());
    if (targetShards && !mongosHostNames.isEmpty()) {
        throw new SplitFailedException("Setting both mongo.input.split.read_from_shards and mongo.input.mongos_hosts" + " does not make sense. ");
    }
    Map<String, String> mongosMap = null;
    if (!mongosHostNames.isEmpty()) {
        // Build a map of host -> mongos host string (incl. port)
        mongosMap = new HashMap<String, String>();
        for (String mongosHostName : mongosHostNames) {
            String[] hostAndPort = mongosHostName.split(":");
            mongosMap.put(hostAndPort[0], mongosHostName);
        }
    }
    List<InputSplit> splits = new ArrayList<InputSplit>(chunks.size());
    for (DBObject chunk : chunks) {
        BasicDBObject chunkLowerBound = (BasicDBObject) chunk.get("min");
        BasicDBObject chunkUpperBound = (BasicDBObject) chunk.get("max");
        MongoInputSplit chunkSplit = createSplitFromBounds(chunkLowerBound, chunkUpperBound);
        // Default: read through the configured input URI. The branches below only
        // override this when a better target is known.
        chunkSplit.setInputURI(inputURI);
        String shard = (String) chunk.get("shard");
        if (targetShards) {
            // The job is configured to target shards, so replace the
            // mongos hostname with the host of the shard's servers
            List<String> shardHosts = shardsMap.get(shard);
            if (shardHosts == null) {
                throw new SplitFailedException("Couldn't find shard ID: " + shard + " in config.shards.");
            }
            chunkSplit.setInputURI(rewriteURI(inputURI, shardHosts));
        } else if (mongosMap != null) {
            // Try to use a mongos collocated with one of the shard hosts for the input
            // split. If the user has their Hadoop/MongoDB clusters configured correctly,
            // this will allow for reading without having to transfer data over a network.
            // Note that MongoInputSplit.getLocations() just returns the hostnames from its
            // input URI.
            List<String> chunkHosts = shardsMap.get(shard);
            String mongosHost = null;
            // A shard missing from shardsMap simply means no collocated mongos can be
            // found; this is a best-effort optimisation, so do not fail with an NPE.
            if (chunkHosts != null) {
                for (String chunkHost : chunkHosts) {
                    String[] hostAndPort = chunkHost.split(":");
                    mongosHost = mongosMap.get(hostAndPort[0]);
                    if (mongosHost != null) {
                        break;
                    }
                }
            }
            if (mongosHost != null) {
                LOG.info("Will read split " + chunkSplit + " from mongos " + mongosHost);
                chunkSplit.setInputURI(rewriteURI(inputURI, mongosHost));
            }
            // Otherwise fall back to the given input URI, which is already set above.
        }
        // Add this split to the list for the current shard.
        chunkSplit.setKeyField(MongoConfigUtil.getInputKey(getConfiguration()));
        splits.add(chunkSplit);
    }
    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(splits);
    }
    return splits;
}
Also used : MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) MongoClientURI(com.mongodb.MongoClientURI) ArrayList(java.util.ArrayList) BasicDBObject(com.mongodb.BasicDBObject) DBObject(com.mongodb.DBObject) BasicDBObject(com.mongodb.BasicDBObject) ArrayList(java.util.ArrayList) List(java.util.List) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit)

Aggregations

MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit)21 Test (org.junit.Test)13 BasicDBObject (com.mongodb.BasicDBObject)12 Configuration (org.apache.hadoop.conf.Configuration)12 InputSplit (org.apache.hadoop.mapreduce.InputSplit)11 DBObject (com.mongodb.DBObject)7 MongoClientURI (com.mongodb.MongoClientURI)5 BasicDBList (com.mongodb.BasicDBList)3 BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest)3 DBCollection (com.mongodb.DBCollection)2 MongoClient (com.mongodb.MongoClient)2 MongoClientURIBuilder (com.mongodb.hadoop.util.MongoClientURIBuilder)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 BSONObject (org.bson.BSONObject)2 CommandResult (com.mongodb.CommandResult)1 DBCursor (com.mongodb.DBCursor)1 MongoException (com.mongodb.MongoException)1 MongoRecordReader (com.mongodb.hadoop.input.MongoRecordReader)1 MongoRecordReader (com.mongodb.hadoop.mapred.input.MongoRecordReader)1