Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
Class DelegatingInputFormat, method getRecordReader().
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
    // Find the InputFormat and then the RecordReader from the
    // TaggedInputSplit.
    TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
    InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(taggedInputSplit.getInputFormatClass(), conf);
    InputSplit inputSplit = taggedInputSplit.getInputSplit();
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
    }
    return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
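The MR_MAP_INPUT_FILE/START/LENGTH properties set above exist so that a mapper can discover which file split it is processing. A minimal sketch of reading them back inside a mapper, assuming the standard Hadoop 2 property names that MRConfigurationNames typically resolves to (the literal keys below are an assumption, not taken from the SystemML source):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class InputAwareMapperBase extends MapReduceBase {
    protected String inputFile;
    protected long inputStart;
    protected long inputLength;

    @Override
    public void configure(JobConf job) {
        // assumed key names; SystemML resolves them through MRConfigurationNames instead
        inputFile = job.get("mapreduce.map.input.file");
        inputStart = job.getLong("mapreduce.map.input.start", 0L);
        inputLength = job.getLong("mapreduce.map.input.length", -1L);
    }
}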
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
Class DelegatingInputFormat, method getSplits().
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    JobConf confCopy = new JobConf(conf);
    List<InputSplit> splits = new ArrayList<>();
    Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(conf);
    Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(conf);
    Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<>();
    // First, build a map of InputFormats to Paths
    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
        if (!formatPaths.containsKey(entry.getValue().getClass())) {
            formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
        }
        formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
    }
    for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        List<Path> paths = formatEntry.getValue();
        Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<>();
        // For each set of paths that share this InputFormat, build
        // a map of Mappers to the paths they're used for
        for (Path path : paths) {
            Class<? extends Mapper> mapperClass = mapperMap.get(path);
            if (!mapperPaths.containsKey(mapperClass)) {
                mapperPaths.put(mapperClass, new LinkedList<Path>());
            }
            mapperPaths.get(mapperClass).add(path);
        }
        // Each set of paths that shares the same InputFormat and Mapper can
        // be added to the same job, and split together.
        for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
            paths = mapEntry.getValue();
            Class<? extends Mapper> mapperClass = mapEntry.getKey();
            if (mapperClass == null) {
                mapperClass = conf.getMapperClass();
            }
            FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
            }
        }
    }
    return splits.toArray(new InputSplit[splits.size()]);
}
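For context, getSplits() consumes the per-path InputFormat and Mapper maps that MultipleInputs records in the job configuration. A minimal driver sketch using the stock org.apache.hadoop.mapred.lib.MultipleInputs API (SystemML carries a modified copy of these classes, so package names there may differ; the input paths below are placeholders):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultiInputJobSketch {
    public static void configure(JobConf conf) {
        // Each call records a path -> (InputFormat, Mapper) binding in the conf; these are the
        // maps that getSplits() reads back via getInputFormatMap()/getMapperTypeMap(). The call
        // also switches the job to DelegatingInputFormat, which emits the TaggedInputSplits
        // consumed by getRecordReader() above.
        MultipleInputs.addInputPath(conf, new Path("in/text"), TextInputFormat.class, IdentityMapper.class);
        MultipleInputs.addInputPath(conf, new Path("in/binary"), SequenceFileInputFormat.class, IdentityMapper.class);
    }
}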
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
Class SamplingSortMRInputFormat, method writePartitionFile().
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
*
* @param conf the job to sample
* @param partFile where to write the output file to
* @return index of the first partition key greater than or equal to zero, or partitions-1 if all sampled keys are negative
* @throws IOException if something goes wrong
* @throws InstantiationException if InstantiationException occurs
* @throws IllegalAccessException if IllegalAccessException occurs
*/
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile) throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();
    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
    // get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
    // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }
    // empty input files
    if (totalcount == 0)
        sampler.addValue(new DoubleWritable(0));
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
    return index0;
}
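The core of the sampling strategy (sort the sampled keys, then choose N-1 evenly spaced values as partition boundaries) can be shown in isolation. This is an illustrative stand-alone sketch, not SystemML's Sampler class:

import java.util.Arrays;

public class PartitionBoundarySketch {
    // Pick partitions-1 boundaries from a sample so that each partition receives
    // roughly the same number of keys, mirroring the javadoc description above.
    public static double[] createPartitions(double[] samples, int partitions) {
        double[] sorted = samples.clone();
        Arrays.sort(sorted);
        double[] boundaries = new double[partitions - 1];
        float step = sorted.length / (float) partitions;
        for (int i = 1; i < partitions; i++) {
            // clamp so rounding can never step past the last sample
            int idx = Math.min(sorted.length - 1, Math.round(step * i));
            boundaries[i - 1] = sorted[idx];
        }
        return boundaries;
    }
}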
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class TestOrcSplitElimination, method testSplitEliminationComplexExpr().
@Test
public void testSplitEliminationComplexExpr() throws Exception {
    ObjectInspector inspector = createIO();
    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
    writeData(writer);
    writer.close();
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 150000);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    // predicate expression: userid <= 100 and subtype <= 1000.0
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = Lists.newArrayList();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(Long.class, "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.add(col);
    childExpr.add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    GenericUDF udf1 = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr1 = Lists.newArrayList();
    ExprNodeColumnDesc col1 = new ExprNodeColumnDesc(Double.class, "subtype", "T", false);
    ExprNodeConstantDesc con1 = new ExprNodeConstantDesc(1000.0);
    childExpr1.add(col1);
    childExpr1.add(con1);
    ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    GenericUDF udf2 = new GenericUDFOPAnd();
    List<ExprNodeDesc> childExpr2 = Lists.newArrayList();
    childExpr2.add(en);
    childExpr2.add(en1);
    ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    String sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(2);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(0.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // no stripe will satisfy the predicate
    assertEquals(0, splits.length);
    con = new ExprNodeConstantDesc(2);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(1.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // only the first stripe will satisfy the condition, hence a single split
    assertEquals(1, splits.length);
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // the first two stripes will satisfy the condition, hence two splits
    assertEquals(2, splits.length);
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    udf1 = new GenericUDFOPEqual();
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // only the second stripe will satisfy the condition, hence a single split
    assertEquals(1, splits.length);
}
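The test repeats the same four-step cycle: rebuild the predicate, serialize it, push it into the conf under hive.io.filter.expr.serialized, and recompute the splits. A hypothetical helper (not part of the Hive test class, shown only to make that cycle explicit) would use exactly the calls that appear above, assuming it lives inside the same test class so the types are in scope:

private int splitCountFor(ExprNodeGenericFuncDesc predicate, InputFormat<?, ?> in, JobConf conf) throws IOException {
    String sargStr = SerializationUtilities.serializeExpression(predicate);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    return in.getSplits(conf, 1).length;
}

With such a helper, each block above reduces to rebuilding the predicate and asserting, e.g. assertEquals(2, splitCountFor(en2, in, conf)).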
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class TestOrcSplitElimination, method testSplitEliminationSmallMaxSplit().
@Test
public void testSplitEliminationSmallMaxSplit() throws Exception {
    ObjectInspector inspector = createIO();
    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
    writeData(writer);
    writer.close();
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 5000);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = Lists.newArrayList();
    ExprNodeConstantDesc con;
    ExprNodeGenericFuncDesc en;
    String sargStr;
    createTestSarg(inspector, udf, childExpr);
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(5, splits.length);
    con = new ExprNodeConstantDesc(1);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(0, splits.length);
    con = new ExprNodeConstantDesc(2);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    con = new ExprNodeConstantDesc(5);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(13);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(3, splits.length);
    con = new ExprNodeConstantDesc(29);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(4, splits.length);
    con = new ExprNodeConstantDesc(70);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(5, splits.length);
}