Search in sources:

Example 46 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

From the class GridFSInputFormatTest, method testReadWholeFile:

/**
 * Verifies that with whole-file splitting enabled, a single split is produced
 * and the record reader breaks the file into sections on the "#+" delimiter
 * pattern (runs of '#' characters), matching the expected README sections.
 */
@Test
public void testReadWholeFile() throws IOException, InterruptedException {
    Configuration conf = getConfiguration();
    MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
    JobContext jobContext = mockJobContext(conf);
    // Splits are computed before the delimiter is set; whole-file mode
    // should yield exactly one split regardless of delimiter.
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    // Delimit records on runs of '#' characters within the single split.
    MongoConfigUtil.setGridFSDelimiterPattern(conf, "#+");
    TaskAttemptContext context = mockTaskAttemptContext(conf);
    assertEquals(1, splits.size());
    List<String> sections = new ArrayList<String>();
    for (InputSplit split : splits) {
        GridFSInputFormat.GridFSTextRecordReader reader = new GridFSInputFormat.GridFSTextRecordReader();
        reader.initialize(split, context);
        // Collect every delimited section the reader produces.
        while (reader.nextKeyValue()) {
            sections.add(reader.getCurrentValue().toString());
        }
    }
    assertEquals(Arrays.asList(readmeSections), sections);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) JobContext(org.apache.hadoop.mapreduce.JobContext) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test) BaseHadoopTest(com.mongodb.hadoop.testutils.BaseHadoopTest)

Example 47 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

From the class GridFSInputFormatTest, method testReadWholeFileNoDelimiter:

/**
 * Verifies that with whole-file splitting enabled and an empty (i.e. absent)
 * delimiter pattern, the reader returns the entire file as a single record.
 */
@Test
public void testReadWholeFileNoDelimiter() throws IOException, InterruptedException {
    Configuration conf = getConfiguration();
    MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
    JobContext jobContext = mockJobContext(conf);
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    // Empty delimiter == no delimiter.
    MongoConfigUtil.setGridFSDelimiterPattern(conf, "");
    TaskAttemptContext context = mockTaskAttemptContext(conf);
    assertEquals(1, splits.size());
    String fileText = null;
    for (InputSplit split : splits) {
        GridFSInputFormat.GridFSTextRecordReader reader =
            new GridFSInputFormat.GridFSTextRecordReader();
        reader.initialize(split, context);
        int recordCount = 0;
        while (reader.nextKeyValue()) {
            fileText = reader.getCurrentValue().toString();
            recordCount++;
        }
        // With no delimiter, the whole file must come back as one record.
        assertEquals(1, recordCount);
    }
    assertEquals(fileContents.toString(), fileText);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) JobContext(org.apache.hadoop.mapreduce.JobContext) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test) BaseHadoopTest(com.mongodb.hadoop.testutils.BaseHadoopTest)

Example 48 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

From the class GridFSInputFormatTest, method testRecordReaderNoDelimiter:

/**
 * Verifies that with an empty (i.e. absent) delimiter pattern, concatenating
 * every record across all splits reproduces the complete file contents.
 */
@Test
public void testRecordReaderNoDelimiter() throws IOException, InterruptedException {
    List<InputSplit> splits = getSplits();
    Configuration conf = getConfiguration();
    // Empty delimiter == no delimiter.
    MongoConfigUtil.setGridFSDelimiterPattern(conf, "");
    TaskAttemptContext context = mockTaskAttemptContext(conf);
    StringBuilder assembled = new StringBuilder();
    for (InputSplit split : splits) {
        GridFSInputFormat.GridFSTextRecordReader reader =
            new GridFSInputFormat.GridFSTextRecordReader();
        reader.initialize(split, context);
        // Append each record in order; together they should form the file.
        while (reader.nextKeyValue()) {
            assembled.append(reader.getCurrentValue().toString());
        }
    }
    assertEquals(fileContents.toString(), assembled.toString());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test) BaseHadoopTest(com.mongodb.hadoop.testutils.BaseHadoopTest)

Example 49 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

From the class MultiMongoCollectionSplitter, method calculateSplits:

/**
 * Calculates input splits across multiple MongoDB collections by building one
 * MongoSplitter per configured input URI and concatenating their splits.
 *
 * Two configuration styles are supported: a plain list of URIs under
 * INPUT_URI (global split settings apply to all of them), or a JSON array
 * under MULTI_COLLECTION_CONF_KEY where each element carries per-collection
 * options.
 *
 * @return the combined list of splits from every collection's splitter
 * @throws SplitFailedException if any delegate splitter fails
 * @throws IllegalArgumentException if the multi-URI JSON config is malformed
 */
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    List<MongoClientURI> inputURIs = MongoConfigUtil.getMongoURIs(this.getConfiguration(), MongoConfigUtil.INPUT_URI);
    List<InputSplit> returnVal = new LinkedList<InputSplit>();
    List<MongoSplitter> splitters = new LinkedList<MongoSplitter>();
    // Build one splitter per collection; their splits are merged at the end.
    if (inputURIs.size() > 0) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Using global split settings for multiple URIs specified.");
        }
        // Plain URI list: each URI gets a copy of the global configuration.
        // The splitter-class setting is cleared so an implementation is
        // chosen from collection stats instead.
        for (MongoClientURI uri : inputURIs) {
            MongoCollectionSplitter splitter;
            Configuration confForThisUri = new Configuration(getConfiguration());
            MongoConfigUtil.setInputURI(confForThisUri, uri);
            confForThisUri.set(MongoConfigUtil.MONGO_SPLITTER_CLASS, "");
            splitter = MongoSplitterFactory.getSplitterByStats(uri, confForThisUri);
            splitters.add(splitter);
        }
    } else {
        //Otherwise the user has set options per-collection.
        if (LOG.isDebugEnabled()) {
            LOG.debug("Loading multiple input URIs from JSON stored in " + MULTI_COLLECTION_CONF_KEY);
        }
        DBObject multiUriConfig = MongoConfigUtil.getDBObject(this.getConfiguration(), MULTI_COLLECTION_CONF_KEY);
        if (!(multiUriConfig instanceof List)) {
            throw new IllegalArgumentException("Invalid JSON format in multi uri config key: Must be an array where each element " + "is an object describing the URI and config options for each split.");
        }
        for (Object obj : (List) multiUriConfig) {
            Map<String, Object> configMap;
            MongoClientURI inputURI;
            Configuration confForThisUri;
            try {
                // Each array element is a map of config options; a failed cast
                // anywhere in here means the JSON shape is wrong.
                configMap = (Map<String, Object>) obj;
                if (LOG.isDebugEnabled()) {
                    LOG.debug("building config from " + configMap.toString());
                }
                confForThisUri = MongoConfigUtil.buildConfiguration(configMap);
                inputURI = MongoConfigUtil.getInputURI(confForThisUri);
            } catch (ClassCastException e) {
                throw new IllegalArgumentException("Invalid JSON format in multi uri config key: each config item must be an " + "object with keys/values describing options for each URI.");
            }
            MongoSplitter splitter;
            Class<? extends MongoSplitter> splitterClass = MongoConfigUtil.getSplitterClass(confForThisUri);
            if (splitterClass != null) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug(format("Using custom Splitter class for namespace: %s.%s; hosts: %s", inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
                }
                //Make sure that the custom class isn't this one
                if (splitterClass == MultiMongoCollectionSplitter.class) {
                    throw new IllegalArgumentException("Can't nest uses of MultiMongoCollectionSplitter");
                }
                //All clear.
                MongoCollectionSplitter collectionSplitter;
                collectionSplitter = (MongoCollectionSplitter) ReflectionUtils.newInstance(splitterClass, confForThisUri);
                //Since we use no-arg constructor, need to inject
                //configuration and input URI.
                collectionSplitter.setConfiguration(confForThisUri);
                splitter = collectionSplitter;
            } else {
                if (LOG.isDebugEnabled()) {
                    LOG.debug(format("Fetching collection stats on namespace: %s.%s; hosts: %s to choose splitter implementation.", inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
                }
                //No class was specified, so choose one by looking at
                //collection stats.
                splitter = MongoSplitterFactory.getSplitterByStats(inputURI, confForThisUri);
            }
            splitters.add(splitter);
        }
    }
    //compile them into one big ol' list.
    for (MongoSplitter splitter : splitters) {
        returnVal.addAll(splitter.calculateSplits());
    }
    return returnVal;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) MongoClientURI(com.mongodb.MongoClientURI) DBObject(com.mongodb.DBObject) LinkedList(java.util.LinkedList) List(java.util.List) LinkedList(java.util.LinkedList) DBObject(com.mongodb.DBObject) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 50 with InputSplit

use of org.apache.hadoop.mapreduce.InputSplit in project mongo-hadoop by mongodb.

From the class ShardMongoSplitter, method calculateSplits:

/**
 * Calculates input splits by treating each shard as one split, pointing each
 * split's input URI directly at that shard's hosts.
 *
 * @return one split per shard (optionally with empty splits filtered out)
 * @throws SplitFailedException if split calculation fails
 */
// Treat each shard as one split.
@Override
public List<InputSplit> calculateSplits() throws SplitFailedException {
    final ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    MongoClientURI originalUri = MongoConfigUtil.getInputURI(getConfiguration());
    try {
        // One unbounded split per shard; only the host lists are needed here.
        for (List<String> shardHosts : getShardsMap().values()) {
            MongoInputSplit shardSplit = createSplitFromBounds(null, null);
            shardSplit.setInputURI(rewriteURI(originalUri, shardHosts));
            splits.add(shardSplit);
        }
    } finally {
        // getShardsMap() creates a client to a config server. Close it now.
        MongoConfigUtil.close(getConfigDB().getMongo());
    }
    if (MongoConfigUtil.isFilterEmptySplitsEnabled(getConfiguration())) {
        return filterEmptySplits(splits);
    }
    return splits;
}
Also used : MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit) MongoClientURI(com.mongodb.MongoClientURI) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapreduce.InputSplit) MongoInputSplit(com.mongodb.hadoop.input.MongoInputSplit)

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit)160 Configuration (org.apache.hadoop.conf.Configuration)70 Test (org.junit.Test)68 ArrayList (java.util.ArrayList)51 Path (org.apache.hadoop.fs.Path)43 Job (org.apache.hadoop.mapreduce.Job)42 TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)38 IOException (java.io.IOException)33 JobContext (org.apache.hadoop.mapreduce.JobContext)20 LongWritable (org.apache.hadoop.io.LongWritable)19 FileSystem (org.apache.hadoop.fs.FileSystem)16 MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl)14 MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit)13 List (java.util.List)13 Text (org.apache.hadoop.io.Text)13 FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit)13 DBObject (com.mongodb.DBObject)10 File (java.io.File)10 TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl)10 BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest)9