Use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.
The class ShardChunkMongoSplitter, method calculateSplitsFromChunks.
/**
 * Get a list of InputSplits based on a list of MongoDB shard chunks, the shard key, and a
 * mapping of shard names to host names. This is used internally by {@link #calculateSplits()}.
 *
 * <p>One split is created per chunk, bounded by the chunk's "min" and "max" documents.
 * Depending on the job configuration, the split's input URI is either left as the configured
 * input URI, rewritten to point at the hosts of the shard owning the chunk, or rewritten to
 * point at a configured mongos whose hostname matches one of that shard's hosts.
 *
 * @param chunks Chunk documents from the config.chunks collection.
 * @param shardsMap A map of shard name -> an array of hostnames.
 * @return A list of InputSplits.
 * @throws SplitFailedException if both reading from shards and explicit mongos hosts are
 *         configured, or if a chunk names a shard that has no entry in {@code shardsMap}.
 */
List<InputSplit> calculateSplitsFromChunks(final List<DBObject> chunks, final Map<String, List<String>> shardsMap) throws SplitFailedException {
    boolean targetShards = MongoConfigUtil.canReadSplitsFromShards(getConfiguration());
    List<String> mongosHostNames = MongoConfigUtil.getInputMongosHosts(getConfiguration());
    // The two routing options are mutually exclusive; reject the combination before doing any work.
    if (targetShards && !mongosHostNames.isEmpty()) {
        throw new SplitFailedException("Setting both mongo.input.split.read_from_shards and mongo.input.mongos_hosts" + " does not make sense. ");
    }
    MongoClientURI inputURI = MongoConfigUtil.getInputURI(getConfiguration());

    // mongosMap stays null when no mongos hosts were configured; the per-chunk logic below
    // uses that to decide whether mongos routing applies at all.
    Map<String, String> mongosMap = null;
    if (!mongosHostNames.isEmpty()) {
        // Build a map of host -> mongos host string (incl. port)
        mongosMap = new HashMap<String, String>();
        for (String mongosHostName : mongosHostNames) {
            String[] hostAndPort = mongosHostName.split(":");
            mongosMap.put(hostAndPort[0], mongosHostName);
        }
    }

    List<InputSplit> splits = new ArrayList<InputSplit>(chunks.size());
    for (DBObject chunk : chunks) {
        BasicDBObject chunkLowerBound = (BasicDBObject) chunk.get("min");
        BasicDBObject chunkUpperBound = (BasicDBObject) chunk.get("max");
        MongoInputSplit chunkSplit = createSplitFromBounds(chunkLowerBound, chunkUpperBound);
        // Default to the configured input URI; the branches below may override it.
        chunkSplit.setInputURI(inputURI);
        String shard = (String) chunk.get("shard");
        if (targetShards) {
            // The job is configured to target shards, so replace the
            // mongos hostname with the host of the shard's servers
            List<String> shardHosts = shardsMap.get(shard);
            if (shardHosts == null) {
                throw new SplitFailedException("Couldn't find shard ID: " + shard + " in config.shards.");
            }
            MongoClientURI newURI = rewriteURI(inputURI, shardHosts);
            chunkSplit.setInputURI(newURI);
        } else if (mongosMap != null) {
            // Try to use a mongos collocated with one of the shard hosts for the input
            // split. If the user has their Hadoop/MongoDB clusters configured correctly,
            // this will allow for reading without having to transfer data over a network.
            // Note that MongoInputSplit.getLocations() just returns the hostnames from its
            // input URI.
            List<String> chunkHosts = shardsMap.get(shard);
            if (chunkHosts == null) {
                // Same failure mode as the shard-targeting branch: report the unknown shard
                // explicitly instead of failing with a NullPointerException in the loop below.
                throw new SplitFailedException("Couldn't find shard ID: " + shard + " in config.shards.");
            }
            String mongosHost = null;
            for (String chunkHost : chunkHosts) {
                // Match on hostname only; the port of the shard host is irrelevant here.
                String[] hostAndPort = chunkHost.split(":");
                mongosHost = mongosMap.get(hostAndPort[0]);
                if (mongosHost != null) {
                    break;
                }
            }
            if (null == mongosHost) {
                // Fall back just to using the given input URI.
                chunkSplit.setInputURI(inputURI);
            } else {
                LOG.info("Will read split " + chunkSplit + " from mongos " + mongosHost);
                chunkSplit.setInputURI(rewriteURI(inputURI, mongosHost));
            }
        }
        // Add this split to the list for the current shard.
        chunkSplit.setKeyField(MongoConfigUtil.getInputKey(getConfiguration()));
        splits.add(chunkSplit);
    }

    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(splits);
    }
    return splits;
}
Aggregations