Use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.
From the class GridFSInputFormatTest, method testReadWholeFile:
@Test
public void testReadWholeFile() throws IOException, InterruptedException {
    Configuration conf = getConfiguration();
    MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
    JobContext jobContext = mockJobContext(conf);
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    // Split each file into sections on runs of '#' characters.
    MongoConfigUtil.setGridFSDelimiterPattern(conf, "#+");
    TaskAttemptContext context = mockTaskAttemptContext(conf);
    assertEquals(1, splits.size());
    List<String> sections = new ArrayList<String>();
    for (InputSplit split : splits) {
        GridFSInputFormat.GridFSTextRecordReader reader =
            new GridFSInputFormat.GridFSTextRecordReader();
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            sections.add(reader.getCurrentValue().toString());
        }
    }
    assertEquals(Arrays.asList(readmeSections), sections);
}
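The two GridFS options exercised above compose naturally in a job driver. Below is a minimal sketch, assuming a placeholder input URI and the usual mongo-hadoop package locations for GridFSInputFormat and MongoConfigUtil; only the two setters the test itself calls are taken from the snippet above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import com.mongodb.hadoop.GridFSInputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class GridFSSectionsDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder URI; point this at the database holding the GridFS files.
        MongoConfigUtil.setInputURI(conf, "mongodb://localhost:27017/db.fs");
        // One input split per GridFS file instead of one per chunk.
        MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
        // One record per section, splitting on runs of '#' characters.
        MongoConfigUtil.setGridFSDelimiterPattern(conf, "#+");
        Job job = Job.getInstance(conf, "gridfs-sections");
        job.setInputFormatClass(GridFSInputFormat.class);
        // ... configure mapper/reducer/output as usual, then:
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}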
Use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.
From the class GridFSInputFormatTest, method testReadWholeFileNoDelimiter:
@Test
public void testReadWholeFileNoDelimiter() throws IOException, InterruptedException {
    Configuration conf = getConfiguration();
    MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
    JobContext jobContext = mockJobContext(conf);
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    // Empty delimiter == no delimiter.
    MongoConfigUtil.setGridFSDelimiterPattern(conf, "");
    TaskAttemptContext context = mockTaskAttemptContext(conf);
    assertEquals(1, splits.size());
    String fileText = null;
    for (InputSplit split : splits) {
        GridFSInputFormat.GridFSTextRecordReader reader =
            new GridFSInputFormat.GridFSTextRecordReader();
        reader.initialize(split, context);
        int i;
        for (i = 0; reader.nextKeyValue(); ++i) {
            fileText = reader.getCurrentValue().toString();
        }
        // With no delimiter, the whole file should arrive as a single record.
        assertEquals(1, i);
    }
    assertEquals(fileContents.toString(), fileText);
}
Use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.
From the class GridFSInputFormatTest, method testRecordReaderNoDelimiter:
@Test
public void testRecordReaderNoDelimiter() throws IOException, InterruptedException {
    List<InputSplit> splits = getSplits();
    Configuration conf = getConfiguration();
    // Empty delimiter == no delimiter.
    MongoConfigUtil.setGridFSDelimiterPattern(conf, "");
    TaskAttemptContext context = mockTaskAttemptContext(conf);
    StringBuilder fileText = new StringBuilder();
    for (InputSplit split : splits) {
        GridFSInputFormat.GridFSTextRecordReader reader =
            new GridFSInputFormat.GridFSTextRecordReader();
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            fileText.append(reader.getCurrentValue().toString());
        }
    }
    assertEquals(fileContents.toString(), fileText.toString());
}
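Taken together, the three tests pin down the delimiter semantics: a non-empty pattern with whole-file splits yields one record per delimiter-separated section; an empty pattern yields the entire file as a single record; and without whole-file splits, concatenating the records from all splits reproduces the file. A mapper consuming these records might look like the following sketch; the NullWritable/Text key and value types are an assumption inferred from getCurrentValue() above, not confirmed by these snippets.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative only: emits the length of each delimiter-separated section.
public class SectionLengthMapper extends Mapper<NullWritable, Text, Text, IntWritable> {
    @Override
    protected void map(NullWritable key, Text section, Context context)
            throws IOException, InterruptedException {
        // Each call receives one section, or the whole file when the
        // delimiter pattern is empty and whole-file splitting is on.
        context.write(new Text("section-length"), new IntWritable(section.getLength()));
    }
}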
Use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.
From the class MultiMongoCollectionSplitter, method calculateSplits:
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    List<MongoClientURI> inputURIs =
        MongoConfigUtil.getMongoURIs(this.getConfiguration(), MongoConfigUtil.INPUT_URI);
    List<InputSplit> returnVal = new LinkedList<InputSplit>();
    List<MongoSplitter> splitters = new LinkedList<MongoSplitter>();
    // If multiple URIs were given under the main input URI key, build a
    // splitter for each one, choosing the splitter implementation by
    // collection stats.
    if (inputURIs.size() > 0) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Using global split settings for multiple URIs specified.");
        }
        // There are no per-collection options in this mode; each splitter
        // reads its settings from a copy of the global configuration instead.
        for (MongoClientURI uri : inputURIs) {
            MongoCollectionSplitter splitter;
            Configuration confForThisUri = new Configuration(getConfiguration());
            MongoConfigUtil.setInputURI(confForThisUri, uri);
            // Clear any custom splitter class so stats-based selection is used.
            confForThisUri.set(MongoConfigUtil.MONGO_SPLITTER_CLASS, "");
            splitter = MongoSplitterFactory.getSplitterByStats(uri, confForThisUri);
            splitters.add(splitter);
        }
    } else {
        // Otherwise the user has set options per-collection.
        if (LOG.isDebugEnabled()) {
            LOG.debug("Loading multiple input URIs from JSON stored in " + MULTI_COLLECTION_CONF_KEY);
        }
        DBObject multiUriConfig =
            MongoConfigUtil.getDBObject(this.getConfiguration(), MULTI_COLLECTION_CONF_KEY);
        if (!(multiUriConfig instanceof List)) {
            throw new IllegalArgumentException(
                "Invalid JSON format in multi uri config key: Must be an array where each element"
                + " is an object describing the URI and config options for each split.");
        }
        for (Object obj : (List) multiUriConfig) {
            Map<String, Object> configMap;
            MongoClientURI inputURI;
            Configuration confForThisUri;
            try {
                configMap = (Map<String, Object>) obj;
                if (LOG.isDebugEnabled()) {
                    LOG.debug("building config from " + configMap.toString());
                }
                confForThisUri = MongoConfigUtil.buildConfiguration(configMap);
                inputURI = MongoConfigUtil.getInputURI(confForThisUri);
            } catch (ClassCastException e) {
                throw new IllegalArgumentException(
                    "Invalid JSON format in multi uri config key: each config item must be an"
                    + " object with keys/values describing options for each URI.");
            }
            MongoSplitter splitter;
            Class<? extends MongoSplitter> splitterClass =
                MongoConfigUtil.getSplitterClass(confForThisUri);
            if (splitterClass != null) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug(format("Using custom Splitter class for namespace: %s.%s; hosts: %s",
                        inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
                }
                // Make sure that the custom class isn't this one.
                if (splitterClass == MultiMongoCollectionSplitter.class) {
                    throw new IllegalArgumentException("Can't nest uses of MultiMongoCollectionSplitter");
                }
                // All clear.
                MongoCollectionSplitter collectionSplitter;
                collectionSplitter =
                    (MongoCollectionSplitter) ReflectionUtils.newInstance(splitterClass, confForThisUri);
                // Since we use the no-arg constructor, we need to inject the
                // configuration (which also carries the input URI).
                collectionSplitter.setConfiguration(confForThisUri);
                splitter = collectionSplitter;
            } else {
                if (LOG.isDebugEnabled()) {
                    LOG.debug(format("Fetching collection stats on namespace: %s.%s; hosts: %s"
                        + " to choose splitter implementation.",
                        inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
                }
                // No class was specified, so choose one by looking at collection stats.
                splitter = MongoSplitterFactory.getSplitterByStats(inputURI, confForThisUri);
            }
            splitters.add(splitter);
        }
    }
    // Run every per-collection splitter and compile the results into one big list.
    for (MongoSplitter splitter : splitters) {
        returnVal.addAll(splitter.calculateSplits());
    }
    return returnVal;
}
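calculateSplits therefore expects the value stored under MULTI_COLLECTION_CONF_KEY to parse into a JSON array whose elements each describe one input URI plus its per-collection options. A minimal sketch of building such a configuration follows, assuming MULTI_COLLECTION_CONF_KEY is publicly accessible and that the conventional mongo-hadoop key strings apply; the hosts and the per-collection splitter choice are placeholders.

import org.apache.hadoop.conf.Configuration;
import com.mongodb.hadoop.splitter.MultiMongoCollectionSplitter;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class MultiCollectionConfigExample {
    public static Configuration build() {
        Configuration conf = new Configuration();
        // One JSON object per collection: its input URI plus any
        // per-collection overrides (here, a custom splitter for the second).
        String multiUriJson = "["
            + "{\"mongo.input.uri\": \"mongodb://host1:27017/db.collectionA\"},"
            + "{\"mongo.input.uri\": \"mongodb://host2:27017/db.collectionB\","
            + " \"mongo.splitter.class\": \"com.mongodb.hadoop.splitter.SingleMongoSplitter\"}"
            + "]";
        conf.set(MultiMongoCollectionSplitter.MULTI_COLLECTION_CONF_KEY, multiUriJson);
        // Route split calculation through MultiMongoCollectionSplitter itself.
        conf.set(MongoConfigUtil.MONGO_SPLITTER_CLASS, MultiMongoCollectionSplitter.class.getName());
        return conf;
    }
}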
Use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.
From the class ShardMongoSplitter, method calculateSplits:
// Treat each shard as one split.
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    final ArrayList<InputSplit> returnVal = new ArrayList<InputSplit>();
    MongoClientURI inputURI = MongoConfigUtil.getInputURI(getConfiguration());
    Map<String, List<String>> shardsMap;
    try {
        shardsMap = getShardsMap();
        for (Entry<String, List<String>> entry : shardsMap.entrySet()) {
            List<String> shardHosts = entry.getValue();
            MongoInputSplit chunkSplit = createSplitFromBounds(null, null);
            chunkSplit.setInputURI(rewriteURI(inputURI, shardHosts));
            returnVal.add(chunkSplit);
        }
    } finally {
        // getShardsMap() creates a client to a config server. Close it now.
        MongoConfigUtil.close(getConfigDB().getMongo());
    }
    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(returnVal);
    }
    return returnVal;
}
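Because each split's URI is rewritten to the hosts of a single shard, this splitter reads from the shards directly rather than through mongos. A minimal sketch of opting into it explicitly, reusing the MONGO_SPLITTER_CLASS key seen in the MultiMongoCollectionSplitter snippet above; the URI is a placeholder, and the String overload of setInputURI is an assumption.

import org.apache.hadoop.conf.Configuration;
import com.mongodb.hadoop.splitter.ShardMongoSplitter;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class ShardSplitterConfig {
    public static Configuration build() {
        Configuration conf = new Configuration();
        // Placeholder URI; for a sharded cluster this would name the mongos router.
        MongoConfigUtil.setInputURI(conf, "mongodb://mongos-host:27017/db.collection");
        // Force one split per shard rather than stats-based splitter selection.
        conf.set(MongoConfigUtil.MONGO_SPLITTER_CLASS, ShardMongoSplitter.class.getName());
        return conf;
    }
}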