Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class DelegatingInputFormat, method getSplits.
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
Job jobCopy = Job.getInstance(conf);
List<InputSplit> splits = new ArrayList<InputSplit>();
Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job);
Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job);
Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<Class<? extends InputFormat>, List<Path>>();
// First, build a map of InputFormats to Paths
for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
if (!formatPaths.containsKey(entry.getValue().getClass())) {
formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
}
formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
}
for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
Class<? extends InputFormat> formatClass = formatEntry.getKey();
InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
List<Path> paths = formatEntry.getValue();
Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<Class<? extends Mapper>, List<Path>>();
// a map of Mappers to the paths they're used for
for (Path path : paths) {
Class<? extends Mapper> mapperClass = mapperMap.get(path);
if (!mapperPaths.containsKey(mapperClass)) {
mapperPaths.put(mapperClass, new LinkedList<Path>());
}
mapperPaths.get(mapperClass).add(path);
}
// be added to the same job, and split together.
for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
paths = mapEntry.getValue();
Class<? extends Mapper> mapperClass = mapEntry.getKey();
if (mapperClass == null) {
try {
mapperClass = job.getMapperClass();
} catch (ClassNotFoundException e) {
throw new IOException("Mapper class is not found", e);
}
}
FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
// Get splits for each input path and tag with InputFormat
// and Mapper types by wrapping in a TaggedInputSplit.
List<InputSplit> pathSplits = format.getSplits(jobCopy);
for (InputSplit pathSplit : pathSplits) {
splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
}
}
}
return splits;
}
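For context, jobs normally reach DelegatingInputFormat through MultipleInputs, which records the per-path InputFormat and Mapper that getSplits reads back via getInputFormatMap and getMapperTypeMap. A minimal sketch, with hypothetical input directories and hypothetical mapper classes (TextSideMapper, SeqSideMapper):

Job job = Job.getInstance(new Configuration(), "multi-input-example");
// Each call records path -> (InputFormat, Mapper) in the configuration and
// installs DelegatingInputFormat as the job's input format.
MultipleInputs.addInputPath(job, new Path("/data/text"),
    TextInputFormat.class, TextSideMapper.class);   // hypothetical mapper
MultipleInputs.addInputPath(job, new Path("/data/seq"),
    SequenceFileInputFormat.class, SeqSideMapper.class);   // hypothetical mapper
// getSplits above then groups the paths by (InputFormat, Mapper) and wraps each
// underlying split in a TaggedInputSplit carrying those classes.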
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class BigDecimalSplitter, method split.
public List<InputSplit> split(Configuration conf, ResultSet results, String colName) throws SQLException {
  BigDecimal minVal = results.getBigDecimal(1);
  BigDecimal maxVal = results.getBigDecimal(2);
  String lowClausePrefix = colName + " >= ";
  String highClausePrefix = colName + " < ";
  BigDecimal numSplits = new BigDecimal(conf.getInt(MRJobConfig.NUM_MAPS, 1));
  if (minVal == null && maxVal == null) {
    // Range is null to null. Return a null split accordingly.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(colName + " IS NULL", colName + " IS NULL"));
    return splits;
  }
  if (minVal == null || maxVal == null) {
    // Don't know what is a reasonable min/max value for interpolation. Fail.
    LOG.error("Cannot find a range for NUMERIC or DECIMAL fields with one end NULL.");
    return null;
  }
  // Get all the split points together.
  List<BigDecimal> splitPoints = split(numSplits, minVal, maxVal);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  // Turn the split points into a set of intervals.
  BigDecimal start = splitPoints.get(0);
  for (int i = 1; i < splitPoints.size(); i++) {
    BigDecimal end = splitPoints.get(i);
    if (i == splitPoints.size() - 1) {
      // This is the last one; use a closed interval.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(lowClausePrefix + start.toString(), colName + " <= " + end.toString()));
    } else {
      // Normal open-interval case.
      splits.add(new DataDrivenDBInputFormat.DataDrivenDBInputSplit(lowClausePrefix + start.toString(), highClausePrefix + end.toString()));
    }
    start = end;
  }
  return splits;
}
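To illustrate the interval logic above, here is a small self-contained sketch using hypothetical values: column "id", range 0 to 100, and the split points 0, 25, 50, 75, 100 that four map tasks would produce. It mirrors the loop that turns split points into lower/upper bound clauses:

import java.math.BigDecimal;
import java.util.Arrays;
import java.util.List;

public class IntervalSketch {
  public static void main(String[] args) {
    // Hypothetical split points for min = 0, max = 100 and four map tasks.
    List<BigDecimal> points = Arrays.asList(
        new BigDecimal(0), new BigDecimal(25), new BigDecimal(50),
        new BigDecimal(75), new BigDecimal(100));
    BigDecimal start = points.get(0);
    for (int i = 1; i < points.size(); i++) {
      BigDecimal end = points.get(i);
      // The last interval is closed so the maximum value is not dropped.
      String upper = (i == points.size() - 1) ? "id <= " + end : "id < " + end;
      System.out.println("id >= " + start + "  AND  " + upper);
      start = end;
    }
  }
}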
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class DBInputFormat, method getSplits.
/** {@inheritDoc} */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  ResultSet results = null;
  Statement statement = null;
  try {
    statement = connection.createStatement();
    results = statement.executeQuery(getCountQuery());
    results.next();
    long count = results.getLong(1);
    int chunks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
    long chunkSize = (count / chunks);
    results.close();
    statement.close();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    // Split the rows into n-number of chunks and adjust the last chunk
    // accordingly
    for (int i = 0; i < chunks; i++) {
      DBInputSplit split;
      if ((i + 1) == chunks)
        split = new DBInputSplit(i * chunkSize, count);
      else
        split = new DBInputSplit(i * chunkSize, (i * chunkSize) + chunkSize);
      splits.add(split);
    }
    connection.commit();
    return splits;
  } catch (SQLException e) {
    throw new IOException("Got SQLException", e);
  } finally {
    try {
      if (results != null) {
        results.close();
      }
    } catch (SQLException e1) {
      // ignored; best-effort cleanup
    }
    try {
      if (statement != null) {
        statement.close();
      }
    } catch (SQLException e1) {
      // ignored; best-effort cleanup
    }
    closeConnection();
  }
}
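For context, a job is pointed at DBInputFormat roughly as follows; the number of row-range chunks built above comes from mapreduce.job.maps (MRJobConfig.NUM_MAPS). This is a sketch only: the connection details, the "employees" table, and the EmployeeWritable class (a DBWritable implementation) are hypothetical.

Configuration conf = new Configuration();
conf.setInt(MRJobConfig.NUM_MAPS, 4);   // getSplits above will build 4 row-range chunks
DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
    "jdbc:mysql://localhost/mydb", "user", "password");
Job job = Job.getInstance(conf, "db-input-example");
DBInputFormat.setInput(job, EmployeeWritable.class,
    "employees",             // table name
    null,                    // optional WHERE conditions
    "id",                    // ORDER BY column, so each split reads a stable row range
    "id", "name", "salary"); // columns to read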
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class DataDrivenDBInputFormat, method getSplits.
/** {@inheritDoc} */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  int targetNumTasks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
  if (1 == targetNumTasks) {
    // There's no need to run a bounding vals query; just return a split
    // that separates nothing. This can be considerably more optimal for a
    // large table with no index.
    List<InputSplit> singletonSplit = new ArrayList<InputSplit>();
    singletonSplit.add(new DataDrivenDBInputSplit("1=1", "1=1"));
    return singletonSplit;
  }
  ResultSet results = null;
  Statement statement = null;
  try {
    statement = connection.createStatement();
    results = statement.executeQuery(getBoundingValsQuery());
    results.next();
    // Based on the type of the results, use a different mechanism
    // for interpolating split points (i.e., numeric splits, text splits,
    // dates, etc.)
    int sqlDataType = results.getMetaData().getColumnType(1);
    DBSplitter splitter = getSplitter(sqlDataType);
    if (null == splitter) {
      throw new IOException("Unknown SQL data type: " + sqlDataType);
    }
    return splitter.split(job.getConfiguration(), results, getDBConf().getInputOrderBy());
  } catch (SQLException e) {
    throw new IOException(e.getMessage());
  } finally {
    // More-or-less ignore SQL exceptions here, but log in case we need it.
    try {
      if (null != results) {
        results.close();
      }
    } catch (SQLException se) {
      LOG.debug("SQLException closing resultset: " + se.toString());
    }
    try {
      if (null != statement) {
        statement.close();
      }
    } catch (SQLException se) {
      LOG.debug("SQLException closing statement: " + se.toString());
    }
    try {
      connection.commit();
      closeConnection();
    } catch (SQLException se) {
      LOG.debug("SQLException committing split transaction: " + se.toString());
    }
  }
}
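A short sketch of how these splits are typically set up and consumed: each DataDrivenDBInputSplit carries a lower-bound and upper-bound clause, which the record reader substitutes for the $CONDITIONS token in the input query. The "orders" table, the queries, and the OrderWritable class below are hypothetical.

Configuration conf = new Configuration();
conf.setInt(MRJobConfig.NUM_MAPS, 4);
DBConfiguration.configureDB(conf, "org.postgresql.Driver",
    "jdbc:postgresql://localhost/shop", "user", "password");
Job job = Job.getInstance(conf, "data-driven-db-example");
DataDrivenDBInputFormat.setInput(job, OrderWritable.class,
    "SELECT id, total FROM orders WHERE $CONDITIONS",  // per-split query
    "SELECT MIN(id), MAX(id) FROM orders");            // bounding values query
// With ids 0..1000 and four map tasks, the splits would carry bound clauses
// roughly like ("id >= 0", "id < 250") ... ("id >= 750", "id <= 1000").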
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestFileInputFormat, method testNumInputFilesRecursively.
@Test
public void testNumInputFilesRecursively() throws Exception {
  Configuration conf = getConfiguration();
  conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  Job job = Job.getInstance(conf);
  FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
  List<InputSplit> splits = fileInputFormat.getSplits(job);
  Assert.assertEquals("Input splits are not correct", 3, splits.size());
  verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);
  // Using the deprecated configuration
  conf = getConfiguration();
  conf.set("mapred.input.dir.recursive", "true");
  job = Job.getInstance(conf);
  splits = fileInputFormat.getSplits(job);
  verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);
}
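The equivalent settings on a user job, as a minimal sketch with hypothetical paths:

Job job = Job.getInstance(new Configuration(), "recursive-input-example");
// Same switch the test sets via FileInputFormat.INPUT_DIR_RECURSIVE
// ("mapreduce.input.fileinputformat.input.dir.recursive").
FileInputFormat.setInputDirRecursive(job, true);
FileInputFormat.addInputPath(job, new Path("/data/a1"));
// Optional: list large input directories with multiple threads.
job.getConfiguration().setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, 4);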