Example 96 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in the mongo-hadoop project by mongodb.

The getSplits method of the class BSONFileInputFormat:

@Override
public FileSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
    BSONSplitter splitter = new BSONSplitter();
    splitter.setConf(job);
    FileStatus[] inputFiles = listStatus(job);
    List<FileSplit> results = new ArrayList<FileSplit>();
    for (FileStatus file : inputFiles) {
        FileSystem fs = FileSystem.get(file.getPath().toUri(), job);
        if (!isSplitable(fs, file.getPath())) {
            LOG.info("File " + file.getPath() + " is compressed so " + "cannot be split.");
            org.apache.hadoop.mapreduce.lib.input.FileSplit delegate = splitter.createFileSplit(file, fs, 0L, file.getLen());
            results.add(new BSONFileSplit(delegate.getPath(), delegate.getStart(), delegate.getLength(), delegate.getLocations()));
            continue;
        }
        splitter.setInputPath(file.getPath());
        Path splitFilePath = getSplitsFilePath(file.getPath(), job);
        try {
            splitter.loadSplitsFromSplitFile(file, splitFilePath);
        } catch (BSONSplitter.NoSplitFileException nsfe) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(format("No split file for %s; building split file", file.getPath()));
            }
            splitter.readSplitsForFile(file);
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(format("BSONSplitter found %d splits.", splitter.getAllSplits().size()));
        }
        for (org.apache.hadoop.mapreduce.lib.input.FileSplit split : splitter.getAllSplits()) {
            BSONFileSplit fsplit = new BSONFileSplit(split.getPath(), split.getStart(), split.getLength(), split.getLocations());
            fsplit.setKeyField(MongoConfigUtil.getInputKey(job));
            results.add(fsplit);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(format("Total of %d found.", results.size()));
    }
    return results.toArray(new BSONFileSplit[results.size()]);
}
Also used : BSONSplitter.getSplitsFilePath (com.mongodb.hadoop.splitter.BSONSplitter.getSplitsFilePath), Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), ArrayList (java.util.ArrayList), BSONFileSplit (com.mongodb.hadoop.mapred.input.BSONFileSplit), BSONSplitter (com.mongodb.hadoop.splitter.BSONSplitter), FileSplit (org.apache.hadoop.mapred.FileSplit), FileSystem (org.apache.hadoop.fs.FileSystem)
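
The notable move in getSplits above is bridging Hadoop's two split APIs: BSONSplitter produces new-API (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits, and each one is copied field by field into an old-API BSONFileSplit. A minimal sketch of the same conversion for the plain Hadoop classes (the helper name is hypothetical, not part of mongo-hadoop):

import java.io.IOException;

import org.apache.hadoop.mapred.FileSplit;

// Hypothetical helper: copies a new-API (mapreduce) FileSplit into an
// old-API (mapred) one. A FileSplit's state is just the path, start
// offset, length, and host locations, so copying those four fields is
// a complete conversion.
public final class SplitConverter {

    private SplitConverter() {
    }

    public static FileSplit toOldApi(final org.apache.hadoop.mapreduce.lib.input.FileSplit split) throws IOException {
        return new FileSplit(split.getPath(), split.getStart(), split.getLength(), split.getLocations());
    }
}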

Example 97 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in the mongo-hadoop project by mongodb.

The getSplits method of the class HiveMongoInputFormat:

@Override
public FileSplit[] getSplits(final JobConf conf, final int numSplits) throws IOException {
    try {
        MongoSplitter splitterImpl = MongoSplitterFactory.getSplitter(conf);
        final List<org.apache.hadoop.mapreduce.InputSplit> splits = splitterImpl.calculateSplits();
        InputSplit[] splitIns = splits.toArray(new InputSplit[splits.size()]);
        // wrap InputSplits in FileSplits so that 'getPath'
        // doesn't produce an error (Hive bug)
        FileSplit[] wrappers = new FileSplit[splitIns.length];
        Path path = new Path(conf.get(MongoStorageHandler.TABLE_LOCATION));
        for (int i = 0; i < wrappers.length; i++) {
            wrappers[i] = new MongoHiveInputSplit(splitIns[i], path);
        }
        return wrappers;
    } catch (SplitFailedException spfe) {
        // split failed because no namespace found
        // (so the corresponding collection doesn't exist)
        LOG.error(spfe.getMessage(), spfe);
        throw new IOException(spfe.getMessage(), spfe);
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : Path (org.apache.hadoop.fs.Path), MongoSplitter (com.mongodb.hadoop.splitter.MongoSplitter), IOException (java.io.IOException), FileSplit (org.apache.hadoop.mapred.FileSplit), SplitFailedException (com.mongodb.hadoop.splitter.SplitFailedException), MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit), InputSplit (org.apache.hadoop.mapred.InputSplit)
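
Hive assumes every split is a FileSplit and calls getPath() on it unconditionally, so non-file splits have to be wrapped before being handed back. A minimal sketch of that wrapper pattern, with hypothetical names (the real MongoHiveInputSplit also implements Writable serialization for the wrapped split, omitted here):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;

// Wraps an arbitrary InputSplit in a FileSplit so Hive's getPath()
// calls succeed even though the split is not file-based.
public class FileSplitWrapper extends FileSplit {

    private final InputSplit delegate;

    public FileSplitWrapper(final InputSplit delegate, final Path tableLocation) {
        // Report a zero-length range at the table location; Hive only
        // needs getPath() to return something sensible.
        super(tableLocation, 0L, 0L, (String[]) null);
        this.delegate = delegate;
    }

    public InputSplit getDelegate() {
        return delegate;
    }

    @Override
    public long getLength() {
        try {
            return delegate.getLength();
        } catch (IOException e) {
            return 0L;
        }
    }

    @Override
    public String[] getLocations() throws IOException {
        return delegate.getLocations();
    }
}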

Example 98 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in the mongo-hadoop project by mongodb.

The enronEmails method of the class BSONFileInputFormatTest:

@Test
public void enronEmails() throws IOException {
    BSONFileInputFormat inputFormat = new BSONFileInputFormat();
    JobConf job = new JobConf();
    String inputDirectory = new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson").getAbsoluteFile().toURI().toString();
    // Hadoop 2.X
    job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory);
    // Hadoop 1.2.X
    job.set("mapred.input.dir", inputDirectory);
    FileSplit[] splits = inputFormat.getSplits(job, 5);
    int count = 0;
    BSONWritable writable = new BSONWritable();
    for (FileSplit split : splits) {
        RecordReader<NullWritable, BSONWritable> recordReader = inputFormat.getRecordReader(split, job, null);
        while (recordReader.next(null, writable)) {
            count++;
        }
    }
    assertEquals("There are 501513 messages in the enron corpus", 501513, count);
}
Also used : BSONWritable (com.mongodb.hadoop.io.BSONWritable), BSONFileInputFormat (com.mongodb.hadoop.mapred.BSONFileInputFormat), FileSplit (org.apache.hadoop.mapred.FileSplit), JobConf (org.apache.hadoop.mapred.JobConf), File (java.io.File), NullWritable (org.apache.hadoop.io.NullWritable), Test (org.junit.Test)
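
Setting both property names by hand, as the test does, covers Hadoop 2.x and 1.x respectively. A sketch of the alternative (the path is illustrative): the old-API FileInputFormat helper stores the value under mapred.input.dir, and Hadoop 2.x's key deprecation mapping exposes it under the new name as well.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputDirExample {

    public static void main(final String[] args) {
        JobConf job = new JobConf();
        // Stores the input path under whichever property name the
        // Hadoop version on the classpath expects.
        FileInputFormat.setInputPaths(job, new Path("/dump/enron_mail/messages.bson"));
        System.out.println(job.get("mapred.input.dir"));
    }
}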

Example 100 with FileSplit

Use of org.apache.hadoop.mapred.FileSplit in the drill project by apache.

The transformFileSplits method of the class HiveMetadataProvider:

/**
 * <p>
 * Groups input splits by file path. Each input split group is ordered by starting byte
 * offset so that the parts of each file are processed in the correct order.
 * </p>
 * <p>
 * Example:
 * <pre>
 * hdfs:///tmp/table/file_1.txt  -> hdfs:///tmp/table/file_1.txt:0+10000
 *                                  hdfs:///tmp/table/file_1.txt:10001+20000
 * hdfs:///tmp/table/file_2.txt  -> hdfs:///tmp/table/file_2.txt:0+10000
 * </pre>
 * </p>
 * @param inputSplits input splits
 * @return multimap where key is file path and value is group of ordered file splits
 */
private Multimap<Path, FileSplit> transformFileSplits(InputSplit[] inputSplits) {
    Multimap<Path, FileSplit> inputSplitGroups = TreeMultimap.create(Ordering.natural(), Comparator.comparingLong(FileSplit::getStart));
    for (InputSplit inputSplit : inputSplits) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        inputSplitGroups.put(fileSplit.getPath(), fileSplit);
    }
    return inputSplitGroups;
}
Also used : Path (org.apache.hadoop.fs.Path), FileSplit (org.apache.hadoop.mapred.FileSplit), InputSplit (org.apache.hadoop.mapred.InputSplit)
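
The TreeMultimap does the real work here: keys (paths) sort naturally and the values under each key sort by start offset, so the parts of each file come back in on-disk order no matter how the splits arrive. A small standalone sketch of the same grouping, with made-up paths and offsets:

import java.util.Comparator;

import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.TreeMultimap;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class SplitGroupingExample {

    public static void main(final String[] args) {
        Multimap<Path, FileSplit> groups =
                TreeMultimap.create(Ordering.natural(), Comparator.comparingLong(FileSplit::getStart));
        Path f1 = new Path("hdfs:///tmp/table/file_1.txt");
        Path f2 = new Path("hdfs:///tmp/table/file_2.txt");
        // Insert out of order; the multimap re-sorts per file by start offset.
        groups.put(f1, new FileSplit(f1, 10001L, 20000L, new String[0]));
        groups.put(f2, new FileSplit(f2, 0L, 10000L, new String[0]));
        groups.put(f1, new FileSplit(f1, 0L, 10000L, new String[0]));
        // Prints file_1's two parts in order, then file_2's single part.
        for (FileSplit split : groups.values()) {
            System.out.println(split.getPath() + ":" + split.getStart() + "+" + split.getLength());
        }
    }
}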

Aggregations

FileSplit (org.apache.hadoop.mapred.FileSplit): 101
Path (org.apache.hadoop.fs.Path): 57
InputSplit (org.apache.hadoop.mapred.InputSplit): 34
JobConf (org.apache.hadoop.mapred.JobConf): 25
IOException (java.io.IOException): 19
Configuration (org.apache.hadoop.conf.Configuration): 17
File (java.io.File): 16
FileStatus (org.apache.hadoop.fs.FileStatus): 13
FileSystem (org.apache.hadoop.fs.FileSystem): 13
Test (org.junit.Test): 12
ArrayList (java.util.ArrayList): 10
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 10
Properties (java.util.Properties): 9
RecordReader (org.apache.hadoop.mapred.RecordReader): 9
Test (org.testng.annotations.Test): 9
List (java.util.List): 8
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 8
RecordCursor (com.facebook.presto.spi.RecordCursor): 6
ImmutableList (com.google.common.collect.ImmutableList): 6
Iterables.filter (com.google.common.collect.Iterables.filter): 6