use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.
the class SingleMongoSplitter method calculateSplits.
/**
 * Produces exactly one split covering the whole input, built from this
 * splitter's configuration.
 *
 * @return a single-element list holding one {@link MongoInputSplit}
 */
@Override
public List<InputSplit> calculateSplits() {
    if (LOG.isDebugEnabled()) {
        // Only resolve the input URI when the debug message will actually be emitted.
        final MongoClientURI source = MongoConfigUtil.getInputURI(getConfiguration());
        final String message = format("SingleMongoSplitter calculating splits for namespace: %s.%s; hosts: %s",
            source.getDatabase(), source.getCollection(), source.getHosts());
        LOG.debug(message);
    }
    final InputSplit onlySplit = new MongoInputSplit(getConfiguration());
    return Collections.singletonList(onlySplit);
}
use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.
the class StandaloneMongoSplitter method calculateSplits.
/**
 * Calculates input splits by running the {@code splitVector} command against
 * the input collection and turning each returned boundary key into a split.
 *
 * <p>If the first attempt reports failure (or the server does not recognize
 * the command), and the collection stats contain a {@code primary} field, the
 * matching document in {@code config.shards} is looked up and the command is
 * retried directly against that shard's host.
 *
 * @return the calculated splits; when empty-split filtering is enabled in the
 *         configuration, the result of {@code filterEmptySplits}
 * @throws SplitFailedException if the command result contains {@code $err},
 *         reports a non-ok status after the fallback, or no command result
 *         could be obtained at all
 */
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    final DBObject splitKey = MongoConfigUtil.getInputSplitKey(getConfiguration());
    final DBObject splitKeyMax = MongoConfigUtil.getMaxSplitKey(getConfiguration());
    final DBObject splitKeyMin = MongoConfigUtil.getMinSplitKey(getConfiguration());
    final int splitSize = MongoConfigUtil.getSplitSize(getConfiguration());
    final MongoClientURI inputURI;
    DBCollection inputCollection = null;
    final ArrayList<InputSplit> returnVal;
    try {
        inputURI = MongoConfigUtil.getInputURI(getConfiguration());
        MongoClientURI authURI = MongoConfigUtil.getAuthURI(getConfiguration());
        if (authURI != null) {
            inputCollection = MongoConfigUtil.getCollectionWithAuth(inputURI, authURI);
        } else {
            inputCollection = MongoConfigUtil.getCollection(inputURI);
        }
        returnVal = new ArrayList<InputSplit>();
        final String ns = inputCollection.getFullName();
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Running splitVector on namespace: %s.%s; hosts: %s", inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
        }
        final DBObject cmd = BasicDBObjectBuilder.start("splitVector", ns).add("keyPattern", splitKey).add("min", splitKeyMin).add("max", splitKeyMax).add("force", false).add("maxChunkSize", splitSize).get();
        CommandResult data;
        boolean ok = true;
        try {
            data = inputCollection.getDB().getSisterDB(inputURI.getDatabase()).command(cmd, ReadPreference.primary());
        } catch (final MongoException e) {
            // 2.0 servers throw exceptions rather than info in a CommandResult
            data = null;
            LOG.info(e.getMessage(), e);
            // Guard against a message-less exception so the original
            // MongoException is rethrown instead of a NullPointerException.
            final String message = e.getMessage();
            if (message != null && message.contains("unrecognized command: splitVector")) {
                ok = false;
            } else {
                throw e;
            }
        }
        if (data != null) {
            if (data.containsField("$err")) {
                throw new SplitFailedException("Error calculating splits: " + data);
            } else if (!data.get("ok").equals(1.0)) {
                ok = false;
            }
        }
        if (!ok) {
            // Fallback: retry the command directly on the host recorded in
            // config.shards for the shard named by the stats' "primary" field.
            final CommandResult stats = inputCollection.getStats();
            if (stats.containsField("primary")) {
                final DBCursor shards = inputCollection.getDB().getSisterDB("config").getCollection("shards").find(new BasicDBObject("_id", stats.getString("primary")));
                try {
                    if (shards.hasNext()) {
                        final DBObject shard = shards.next();
                        // Strip the leading "<shard id>/" prefix from the host string.
                        final String host = ((String) shard.get("host")).replace(shard.get("_id") + "/", "");
                        final MongoClientURI shardHost;
                        if (authURI != null) {
                            shardHost = new MongoClientURIBuilder(authURI).host(host).build();
                        } else {
                            shardHost = new MongoClientURIBuilder(inputURI).host(host).build();
                        }
                        MongoClient shardClient = null;
                        try {
                            shardClient = new MongoClient(shardHost);
                            data = shardClient.getDB(shardHost.getDatabase()).command(cmd, ReadPreference.primary());
                        } catch (final Exception e) {
                            // Best effort: the failure is logged and the earlier
                            // value of data (possibly null) is checked below.
                            LOG.error(e.getMessage(), e);
                        } finally {
                            if (shardClient != null) {
                                shardClient.close();
                            }
                        }
                    }
                } finally {
                    shards.close();
                }
            }
            if (data != null && !data.get("ok").equals(1.0)) {
                throw new SplitFailedException("Unable to calculate input splits: " + data.get("errmsg"));
            }
        }
        // data is still null when the server rejected splitVector and the shard
        // fallback produced nothing; fail with the declared exception rather
        // than dereferencing null below.
        if (data == null) {
            throw new SplitFailedException("Unable to calculate input splits: splitVector returned no result for namespace " + ns);
        }
        // Comes in a format where "min" and "max" are implicit
        // and each entry is just a boundary key; not ranged
        final BasicDBList splitData = (BasicDBList) data.get("splitKeys");
        if (splitData.size() == 0) {
            LOG.warn("WARNING: No Input Splits were calculated by the split code. Proceeding with a *single* split. Data may be too" + " small, try lowering 'mongo.input.split_size' if this is undesirable.");
        }
        // Lower boundary of the first min split
        BasicDBObject lastKey = null;
        // If splitKeyMin was given, use it as first boundary.
        if (!splitKeyMin.toMap().isEmpty()) {
            lastKey = new BasicDBObject(splitKeyMin.toMap());
        }
        // Each boundary key closes the current split and opens the next one.
        for (final Object aSplitData : splitData) {
            final BasicDBObject currentKey = (BasicDBObject) aSplitData;
            returnVal.add(createSplitFromBounds(lastKey, currentKey));
            lastKey = currentKey;
        }
        BasicDBObject maxKey = null;
        // If splitKeyMax was given, use it as last boundary.
        if (!splitKeyMax.toMap().isEmpty()) {
            maxKey = new BasicDBObject(splitKeyMax.toMap());
        }
        // Last max split
        final MongoInputSplit lastSplit = createSplitFromBounds(lastKey, maxKey);
        returnVal.add(lastSplit);
    } finally {
        // Always release the client that backs the input collection.
        if (inputCollection != null) {
            MongoConfigUtil.close(inputCollection.getDB().getMongo());
        }
    }
    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(returnVal);
    }
    return returnVal;
}
use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.
the class MongoCollectionSplitter method createRangeQuerySplit.
/**
 * Creates an instance of {@link MongoInputSplit} whose upper and lower
 * bounds are restricted by adding $gte/$lt clauses to the query
 * filter. This requires that the boundaries are not compound keys, and that
 * the query does not contain any keys used in the split key.
 *
 * <p>A {@code null} bound, or a bound with no fields, is treated as absent:
 * no clause is added for that side. If both bounds are {@code null} the split
 * carries only a copy of {@code query}.
 *
 * @param chunkLowerBound the lower bound of the chunk (min), or {@code null}
 * @param chunkUpperBound the upper bound of the chunk (max), or {@code null}
 * @param query a query filtering the documents within the split
 * @return a MongoInputSplit from a range query
 * @throws IllegalArgumentException if the query conflicts with the chunk
 *         bounds, if either of the bounds is a compound key, or if neither
 *         bound supplies a usable single-field key
 */
public MongoInputSplit createRangeQuerySplit(final BasicDBObject chunkLowerBound, final BasicDBObject chunkUpperBound, final BSONObject query) {
    // A split without boundaries: the query alone selects its documents.
    if (chunkLowerBound == null && chunkUpperBound == null) {
        DBObject splitQuery = new BasicDBObject();
        splitQuery.putAll(query);
        MongoInputSplit split = new MongoInputSplit(getConfiguration());
        split.setQuery(splitQuery);
        return split;
    }
    // The boundaries are not empty, so try to build a split using $gte/$lt.
    // First check that the split contains no compound keys.
    // e.g. this is valid: { _id : "foo" }
    // but this is not {_id : "foo", name : "bar"}
    // Each side is checked on its own: otherwise a compound bound paired with
    // a single-field bound would be dropped silently, widening the split.
    final boolean lowerIsCompound = chunkLowerBound != null && chunkLowerBound.keySet().size() > 1;
    final boolean upperIsCompound = chunkUpperBound != null && chunkUpperBound.keySet().size() > 1;
    Entry<String, Object> minKey = chunkLowerBound != null && chunkLowerBound.keySet().size() == 1 ? chunkLowerBound.entrySet().iterator().next() : null;
    Entry<String, Object> maxKey = chunkUpperBound != null && chunkUpperBound.keySet().size() == 1 ? chunkUpperBound.entrySet().iterator().next() : null;
    if (lowerIsCompound || upperIsCompound || (minKey == null && maxKey == null)) {
        throw new IllegalArgumentException("Range query is enabled but one or more split boundaries contains a compound key:\n" + "min: " + chunkLowerBound + "\nmax: " + chunkUpperBound);
    }
    // Reject a query that already constrains the split key: the range clause
    // written below would overwrite the caller's condition on that field.
    if (minKey != null && query.containsField(minKey.getKey()) || maxKey != null && query.containsField(maxKey.getKey())) {
        throw new IllegalArgumentException("Range query is enabled but split key conflicts with query filter:\n" + "min: " + chunkLowerBound + "\nmax: " + chunkUpperBound + "\nquery: " + query);
    }
    String key = null;
    BasicDBObject rangeObj = new BasicDBObject();
    if (minKey != null) {
        key = minKey.getKey();
        rangeObj.put("$gte", minKey.getValue());
    }
    if (maxKey != null) {
        // When both bounds are present the upper bound's field name is the one used.
        key = maxKey.getKey();
        rangeObj.put("$lt", maxKey.getValue());
    }
    DBObject splitQuery = new BasicDBObject();
    splitQuery.putAll(query);
    splitQuery.put(key, rangeObj);
    MongoInputSplit split = new MongoInputSplit(getConfiguration());
    split.setQuery(splitQuery);
    return split;
}
use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.
the class ShardMongoSplitter method calculateSplits.
/**
 * Builds one unbounded split per entry of the shards map, pointing each
 * split's input URI at that shard's hosts.
 *
 * @return one split per shard; when empty-split filtering is enabled in the
 *         configuration, the result of {@code filterEmptySplits}
 * @throws SplitFailedException declared by the splitter contract
 */
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    final ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    final MongoClientURI baseURI = MongoConfigUtil.getInputURI(getConfiguration());
    try {
        // Only the host lists matter here; the shard names (map keys) are unused.
        for (final List<String> hosts : getShardsMap().values()) {
            final MongoInputSplit shardSplit = createSplitFromBounds(null, null);
            shardSplit.setInputURI(rewriteURI(baseURI, hosts));
            splits.add(shardSplit);
        }
    } finally {
        // getShardsMap() creates a client to a config server. Close it now.
        MongoConfigUtil.close(getConfigDB().getMongo());
    }
    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(splits);
    }
    return splits;
}
use of com.mongodb.hadoop.input.MongoInputSplit in project mongo-hadoop by mongodb.
the class MongoPaginatingSplitterTest method testQuery.
/**
 * Checks the splits produced by the paginating splitter when a query
 * filter is configured together with range queries and a minimum of 5000
 * documents per split.
 */
@Test
public void testQuery() throws SplitFailedException {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, uri);
    MongoConfigUtil.setRangeQueryEnabled(conf, true);
    MongoConfigUtil.setInputSplitMinDocs(conf, 5000);

    // Match documents whose "value" is below 25000 or at least 31000.
    final BasicDBObject belowGap = new BasicDBObject("value", new BasicDBObject("$lt", 25000));
    final BasicDBObject aboveGap = new BasicDBObject("value", new BasicDBObject("$gte", 31000));
    final DBObject query = new BasicDBObject("$or", new BasicDBObject[] { belowGap, aboveGap });
    MongoConfigUtil.setQuery(conf, query);

    final MongoPaginatingSplitter splitter = new MongoPaginatingSplitter(conf);
    final List<InputSplit> splits = splitter.calculateSplits();
    assertEquals(7, splits.size());

    // Consecutive entries are the expected lower/upper bound of each split;
    // null marks an open end.
    final Integer[] boundaries = { null, 5000, 10000, 15000, 20000, 31000, 36000, null };
    for (int i = 0; i < boundaries.length - 1; i++) {
        assertSplitRange((MongoInputSplit) splits.get(i), boundaries[i], boundaries[i + 1]);
    }

    // 6000 documents excluded by query.
    assertSplitsCount(collection.count() - 6000, splits);
}
Aggregations