Example 96 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The getSplits method of the class DelegatingInputFormat.

@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = Job.getInstance(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job);
    Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job);
    Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<Class<? extends InputFormat>, List<Path>>();
    // First, build a map of InputFormats to Paths
    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
        if (!formatPaths.containsKey(entry.getValue().getClass())) {
            formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
        }
        formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
    }
    for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        List<Path> paths = formatEntry.getValue();
        Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<Class<? extends Mapper>, List<Path>>();
        // Now, for each set of paths that have a common InputFormat, build
        // a map of Mappers to the paths they're used for
        for (Path path : paths) {
            Class<? extends Mapper> mapperClass = mapperMap.get(path);
            if (!mapperPaths.containsKey(mapperClass)) {
                mapperPaths.put(mapperClass, new LinkedList<Path>());
            }
            mapperPaths.get(mapperClass).add(path);
        }
        // Now, each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
            paths = mapEntry.getValue();
            Class<? extends Mapper> mapperClass = mapEntry.getKey();
            if (mapperClass == null) {
                try {
                    mapperClass = job.getMapperClass();
                } catch (ClassNotFoundException e) {
                    throw new IOException("Mapper class is not found", e);
                }
            }
            FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            List<InputSplit> pathSplits = format.getSplits(jobCopy);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
            }
        }
    }
    return splits;
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), Mapper (org.apache.hadoop.mapreduce.Mapper), List (java.util.List), LinkedList (java.util.LinkedList), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Path (org.apache.hadoop.fs.Path), IOException (java.io.IOException), InputFormat (org.apache.hadoop.mapreduce.InputFormat)
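
For context, a minimal driver-side sketch of how the maps consumed by this method get populated: MultipleInputs.addInputPath records the path-to-InputFormat and path-to-Mapper bindings (and sets DelegatingInputFormat as the job's input format), which getSplits later reads back through getInputFormatMap and getMapperTypeMap. The mapper classes TextLineMapper and KeyValueMapper below are hypothetical placeholders, not part of the Hadoop example above.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public static Job configureMultipleInputs(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf);
    // Each call records path -> InputFormat and path -> Mapper; DelegatingInputFormat
    // later groups the paths by those classes and wraps every split in a TaggedInputSplit.
    MultipleInputs.addInputPath(job, new Path("/data/plain"),
        TextInputFormat.class, TextLineMapper.class); // hypothetical mapper class
    MultipleInputs.addInputPath(job, new Path("/data/keyvalue"),
        KeyValueTextInputFormat.class, KeyValueMapper.class); // hypothetical mapper class
    return job;
}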

Example 97 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The split method of the class BigDecimalSplitter.

public List<InputSplit> split(Configuration conf, ResultSet results, String colName) throws SQLException {
    BigDecimal minVal = results.getBigDecimal(1);
    BigDecimal maxVal = results.getBigDecimal(2);
    String lowClausePrefix = colName + " >= ";
    String highClausePrefix = colName + " < ";
    BigDecimal numSplits = new BigDecimal(conf.getInt(MRJobConfig.NUM_MAPS, 1));
    if (minVal == null && maxVal == null) {
        // Range is null to null. Return a null split accordingly.
        List<InputSplit> splits = new ArrayList<InputSplit>();
        splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(colName + " IS NULL", colName + " IS NULL"));
        return splits;
    }
    if (minVal == null || maxVal == null) {
        // Don't know what is a reasonable min/max value for interpolation. Fail.
        LOG.error("Cannot find a range for NUMERIC or DECIMAL fields with one end NULL.");
        return null;
    }
    // Get all the split points together.
    List<BigDecimal> splitPoints = split(numSplits, minVal, maxVal);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    // Turn the split points into a set of intervals.
    BigDecimal start = splitPoints.get(0);
    for (int i = 1; i < splitPoints.size(); i++) {
        BigDecimal end = splitPoints.get(i);
        if (i == splitPoints.size() - 1) {
            // This is the last one; use a closed interval.
            splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(lowClausePrefix + start.toString(), colName + " <= " + end.toString()));
        } else {
            // Normal open-interval case.
            splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(lowClausePrefix + start.toString(), highClausePrefix + end.toString()));
        }
        start = end;
    }
    return splits;
}
Also used: ArrayList (java.util.ArrayList), InputSplit (org.apache.hadoop.mapreduce.InputSplit), BigDecimal (java.math.BigDecimal)
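
The split(numSplits, minVal, maxVal) call above interpolates evenly spaced BigDecimal split points between the two bounds. A simplified sketch of that idea (not the actual Hadoop implementation, which additionally enforces a minimum step size):

import java.math.BigDecimal;
import java.math.MathContext;
import java.util.ArrayList;
import java.util.List;

static List<BigDecimal> interpolateSplitPoints(BigDecimal numSplits, BigDecimal minVal, BigDecimal maxVal) {
    // Step size: (max - min) / numSplits, computed with a bounded precision.
    BigDecimal step = maxVal.subtract(minVal).divide(numSplits, MathContext.DECIMAL64);
    List<BigDecimal> points = new ArrayList<BigDecimal>();
    BigDecimal cur = minVal;
    while (cur.compareTo(maxVal) < 0) {
        points.add(cur);
        cur = cur.add(step);
    }
    // End exactly on the maximum so the caller's last interval can be closed.
    points.add(maxVal);
    return points;
}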

Example 98 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The getSplits method of the class DBInputFormat.

/** {@inheritDoc} */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    ResultSet results = null;
    Statement statement = null;
    try {
        statement = connection.createStatement();
        results = statement.executeQuery(getCountQuery());
        results.next();
        long count = results.getLong(1);
        int chunks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
        long chunkSize = (count / chunks);
        results.close();
        statement.close();
        List<InputSplit> splits = new ArrayList<InputSplit>();
        // Split the rows into n-number of chunks and adjust the last chunk
        // accordingly
        for (int i = 0; i < chunks; i++) {
            DBInputSplit split;
            if ((i + 1) == chunks)
                split = new DBInputSplit(i * chunkSize, count);
            else
                split = new DBInputSplit(i * chunkSize, (i * chunkSize) + chunkSize);
            splits.add(split);
        }
        connection.commit();
        return splits;
    } catch (SQLException e) {
        throw new IOException("Got SQLException", e);
    } finally {
        try {
            if (results != null) {
                results.close();
            }
        } catch (SQLException e1) {
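            // Ignored: best-effort cleanup of the ResultSet.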
        }
        try {
            if (statement != null) {
                statement.close();
            }
        } catch (SQLException e1) {
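            // Ignored: best-effort cleanup of the Statement.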
        }
        closeConnection();
    }
}
Also used: SQLException (java.sql.SQLException), PreparedStatement (java.sql.PreparedStatement), Statement (java.sql.Statement), ResultSet (java.sql.ResultSet), ArrayList (java.util.ArrayList), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
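
For reference, one common way to configure DBInputFormat on the driver side, which is where the connection and count query used by this getSplits come from. MyRecord (a DBWritable implementation), the MySQL URL, and the table and column names are hypothetical placeholders, not taken from the example above.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;

static Job configureDbJob() throws IOException {
    Configuration conf = new Configuration();
    // JDBC driver class, connection URL, user, and password for the source database.
    DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/mydb", "user", "password");
    Job job = Job.getInstance(conf);
    // Table name, optional WHERE conditions, ORDER BY column, and the fields to read.
    DBInputFormat.setInput(job, MyRecord.class, "employees", null, "id", "id", "name");
    return job;
}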

Example 99 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The getSplits method of the class DataDrivenDBInputFormat.

/** {@inheritDoc} */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    int targetNumTasks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
    if (1 == targetNumTasks) {
        // There's no need to run a bounding vals query; just return a split
        // that separates nothing. This can be considerably more optimal for a
        // large table with no index.
        List<InputSplit> singletonSplit = new ArrayList<InputSplit>();
        singletonSplit.add(new DataDrivenDBInputSplit("1=1", "1=1"));
        return singletonSplit;
    }
    ResultSet results = null;
    Statement statement = null;
    try {
        statement = connection.createStatement();
        results = statement.executeQuery(getBoundingValsQuery());
        results.next();
        // Based on the type of the results, use a different mechanism
        // for interpolating split points (i.e., numeric splits, text splits,
        // dates, etc.)
        int sqlDataType = results.getMetaData().getColumnType(1);
        DBSplitter splitter = getSplitter(sqlDataType);
        if (null == splitter) {
            throw new IOException("Unknown SQL data type: " + sqlDataType);
        }
        return splitter.split(job.getConfiguration(), results, getDBConf().getInputOrderBy());
    } catch (SQLException e) {
        throw new IOException(e.getMessage());
    } finally {
        // More-or-less ignore SQL exceptions here, but log in case we need it.
        try {
            if (null != results) {
                results.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing resultset: " + se.toString());
        }
        try {
            if (null != statement) {
                statement.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing statement: " + se.toString());
        }
        try {
            connection.commit();
            closeConnection();
        } catch (SQLException se) {
            LOG.debug("SQLException committing split transaction: " + se.toString());
        }
    }
}
Also used: SQLException (java.sql.SQLException), PreparedStatement (java.sql.PreparedStatement), Statement (java.sql.Statement), ArrayList (java.util.ArrayList), ResultSet (java.sql.ResultSet), IOException (java.io.IOException), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
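
A sketch of the driver-side setup that pairs with this getSplits, assuming a hypothetical MyRecord DBWritable and table: setInput supplies a free-form query containing the $CONDITIONS token, which each generated split replaces with its own bounds (for example "id >= 0 AND id < 1000"), together with the MIN/MAX bounding query that the code above executes.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DataDrivenDBInputFormat;

static Job configureDataDrivenJob() throws IOException {
    Configuration conf = new Configuration();
    DBConfiguration.configureDB(conf, "org.postgresql.Driver",
        "jdbc:postgresql://localhost/mydb", "user", "password");
    Job job = Job.getInstance(conf);
    // The query's $CONDITIONS token is rewritten per split with the interpolated bounds.
    DataDrivenDBInputFormat.setInput(job, MyRecord.class,
        "SELECT id, name FROM employees WHERE " + DataDrivenDBInputFormat.SUBSTITUTE_TOKEN,
        "SELECT MIN(id), MAX(id) FROM employees");
    return job;
}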

Example 100 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The testNumInputFilesRecursively method of the class TestFileInputFormat.

@Test
public void testNumInputFilesRecursively() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 3, splits.size());
    verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);
    // Using the deprecated configuration
    conf = getConfiguration();
    conf.set("mapred.input.dir.recursive", "true");
    job = Job.getInstance(conf);
    splits = fileInputFormat.getSplits(job);
    verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
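
The test drives recursion through the raw configuration keys; in application code the same behavior is usually requested through the typed setter, as in this brief sketch (the input path is purely illustrative):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

static Job configureRecursiveInput() throws IOException {
    Job job = Job.getInstance(new Configuration());
    job.setInputFormatClass(TextInputFormat.class);
    // Recurse into subdirectories when listing input files, as the test above verifies.
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.addInputPath(job, new Path("/a1")); // illustrative path
    return job;
}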

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160
Configuration (org.apache.hadoop.conf.Configuration): 70
Test (org.junit.Test): 68
ArrayList (java.util.ArrayList): 51
Path (org.apache.hadoop.fs.Path): 43
Job (org.apache.hadoop.mapreduce.Job): 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38
IOException (java.io.IOException): 33
JobContext (org.apache.hadoop.mapreduce.JobContext): 20
LongWritable (org.apache.hadoop.io.LongWritable): 19
FileSystem (org.apache.hadoop.fs.FileSystem): 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13
List (java.util.List): 13
Text (org.apache.hadoop.io.Text): 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13
DBObject (com.mongodb.DBObject): 10
File (java.io.File): 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9