Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
Class DelegatingInputFormat, method getRecordReader().
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
    // Find the InputFormat and then the RecordReader from the
    // TaggedInputSplit.
    TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
    InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(taggedInputSplit.getInputFormatClass(), conf);
    InputSplit inputSplit = taggedInputSplit.getInputSplit();
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
    }
    return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
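The MR_MAP_INPUT_FILE/START/LENGTH properties set above exist so that a mapper can discover which file split it is processing. A minimal sketch of reading them back inside a mapper, assuming the standard Hadoop 2 property names that MRConfigurationNames typically resolves to (the literal keys below are an assumption, not taken from the SystemML source):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class InputAwareMapperBase extends MapReduceBase {
    protected String inputFile;
    protected long inputStart;
    protected long inputLength;

    @Override
    public void configure(JobConf job) {
        // assumed key names; SystemML resolves them through MRConfigurationNames instead
        inputFile = job.get("mapreduce.map.input.file");
        inputStart = job.getLong("mapreduce.map.input.start", 0L);
        inputLength = job.getLong("mapreduce.map.input.length", -1L);
    }
}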
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
Class DelegatingInputFormat, method getSplits().
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    JobConf confCopy = new JobConf(conf);
    List<InputSplit> splits = new ArrayList<>();
    Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(conf);
    Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(conf);
    Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<>();
    // First, build a map of InputFormats to Paths
    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
        if (!formatPaths.containsKey(entry.getValue().getClass())) {
            formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
        }
        formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
    }
    for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        List<Path> paths = formatEntry.getValue();
        Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<>();
        // For each set of paths that share this InputFormat, build
        // a map of Mappers to the paths they're used for
        for (Path path : paths) {
            Class<? extends Mapper> mapperClass = mapperMap.get(path);
            if (!mapperPaths.containsKey(mapperClass)) {
                mapperPaths.put(mapperClass, new LinkedList<Path>());
            }
            mapperPaths.get(mapperClass).add(path);
        }
        // Each set of paths that shares the same InputFormat and Mapper can
        // be added to the same job, and split together.
        for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
            paths = mapEntry.getValue();
            Class<? extends Mapper> mapperClass = mapEntry.getKey();
            if (mapperClass == null) {
                mapperClass = conf.getMapperClass();
            }
            FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
            }
        }
    }
    return splits.toArray(new InputSplit[splits.size()]);
}
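For context, getSplits() consumes the per-path InputFormat and Mapper maps that MultipleInputs records in the job configuration. A minimal driver sketch using the stock org.apache.hadoop.mapred.lib.MultipleInputs API (SystemML carries a modified copy of these classes, so package names there may differ; the input paths below are placeholders):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultiInputJobSketch {
    public static void configure(JobConf conf) {
        // Each call records a path -> (InputFormat, Mapper) binding in the conf; these are the
        // maps that getSplits() reads back via getInputFormatMap()/getMapperTypeMap(). The call
        // also switches the job to DelegatingInputFormat, which emits the TaggedInputSplits
        // consumed by getRecordReader() above.
        MultipleInputs.addInputPath(conf, new Path("in/text"), TextInputFormat.class, IdentityMapper.class);
        MultipleInputs.addInputPath(conf, new Path("in/binary"), SequenceFileInputFormat.class, IdentityMapper.class);
    }
}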
Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.
Class SamplingSortMRInputFormat, method writePartitionFile().
/**
* Use the input splits to take samples of the input and generate sample
* keys. By default reads 100,000 keys from 10 locations in the input, sorts
* them and picks N-1 keys to generate N equally sized partitions.
*
* @param conf the job to sample
* @param partFile where to write the output file to
* @return index of the first partition key greater than or equal to zero, or partitions-1 if all sampled keys are negative
* @throws IOException if something goes wrong
* @throws InstantiationException if InstantiationException occurs
* @throws IllegalAccessException if IllegalAccessException occurs
*/
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile) throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();
    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS, WritableComparable.class);
    // get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);
    // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;
    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; i++) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat.getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: " + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }
    // empty input files
    if (totalcount == 0)
        sampler.addValue(new DoubleWritable(0));
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }
    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = null;
    int index0 = -1;
    try {
        writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        int i = 0;
        boolean lessthan0 = true;
        for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
            writer.append(splitValue, nullValue);
            if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
                index0 = i;
                lessthan0 = false;
            }
            i++;
        }
        if (lessthan0)
            index0 = partitions - 1;
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
    return index0;
}
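The core of the sampling strategy (sort the sampled keys, then choose N-1 evenly spaced values as partition boundaries) can be shown in isolation. This is an illustrative stand-alone sketch, not SystemML's Sampler class:

import java.util.Arrays;

public class PartitionBoundarySketch {
    // Pick partitions-1 boundaries from a sample so that each partition receives
    // roughly the same number of keys, mirroring the javadoc description above.
    public static double[] createPartitions(double[] samples, int partitions) {
        double[] sorted = samples.clone();
        Arrays.sort(sorted);
        double[] boundaries = new double[partitions - 1];
        float step = sorted.length / (float) partitions;
        for (int i = 1; i < partitions; i++) {
            // clamp so rounding can never step past the last sample
            int idx = Math.min(sorted.length - 1, Math.round(step * i));
            boundaries[i - 1] = sorted[idx];
        }
        return boundaries;
    }
}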
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class TestOrcSplitElimination, method testSplitEliminationComplexExpr().
@Test
public void testSplitEliminationComplexExpr() throws Exception {
    ObjectInspector inspector = createIO();
    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
    writeData(writer);
    writer.close();
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 150000);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    // predicate expression: userid <= 100 and subtype <= 1000.0
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = Lists.newArrayList();
    ExprNodeColumnDesc col = new ExprNodeColumnDesc(Long.class, "userid", "T", false);
    ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
    childExpr.add(col);
    childExpr.add(con);
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    GenericUDF udf1 = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr1 = Lists.newArrayList();
    ExprNodeColumnDesc col1 = new ExprNodeColumnDesc(Double.class, "subtype", "T", false);
    ExprNodeConstantDesc con1 = new ExprNodeConstantDesc(1000.0);
    childExpr1.add(col1);
    childExpr1.add(con1);
    ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    GenericUDF udf2 = new GenericUDFOPAnd();
    List<ExprNodeDesc> childExpr2 = Lists.newArrayList();
    childExpr2.add(en);
    childExpr2.add(en1);
    ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    String sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(2);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(0.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // no stripe will satisfy the predicate
    assertEquals(0, splits.length);
    con = new ExprNodeConstantDesc(2);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(1.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // only the first stripe will satisfy the condition, hence a single split
    assertEquals(1, splits.length);
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // the first two stripes will satisfy the condition, hence two splits
    assertEquals(2, splits.length);
    udf = new GenericUDFOPEqual();
    con = new ExprNodeConstantDesc(13);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    udf1 = new GenericUDFOPEqual();
    con1 = new ExprNodeConstantDesc(80.0);
    childExpr1.set(1, con1);
    en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
    childExpr2.set(0, en);
    childExpr2.set(1, en1);
    en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
    sargStr = SerializationUtilities.serializeExpression(en2);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // only the second stripe will satisfy the condition, hence a single split
    assertEquals(1, splits.length);
}
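The test repeats the same four-step cycle: rebuild the predicate, serialize it, push it into the conf under hive.io.filter.expr.serialized, and recompute the splits. A hypothetical helper (not part of the Hive test class, shown only to make that cycle explicit) would use exactly the calls that appear above, assuming it lives inside the same test class so the types are in scope:

private int splitCountFor(ExprNodeGenericFuncDesc predicate, InputFormat<?, ?> in, JobConf conf) throws IOException {
    String sargStr = SerializationUtilities.serializeExpression(predicate);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    return in.getSplits(conf, 1).length;
}

With such a helper, each block above reduces to rebuilding the predicate and asserting, e.g. assertEquals(2, splitCountFor(en2, in, conf)).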
Use of org.apache.hadoop.mapred.InputSplit in project hive by apache.
Class TestOrcSplitElimination, method testSplitEliminationSmallMaxSplit().
@Test
public void testSplitEliminationSmallMaxSplit() throws Exception {
    ObjectInspector inspector = createIO();
    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
    writeData(writer);
    writer.close();
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 5000);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = Lists.newArrayList();
    ExprNodeConstantDesc con;
    ExprNodeGenericFuncDesc en;
    String sargStr;
    createTestSarg(inspector, udf, childExpr);
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(5, splits.length);
    con = new ExprNodeConstantDesc(1);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(0, splits.length);
    con = new ExprNodeConstantDesc(2);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);
    con = new ExprNodeConstantDesc(5);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(13);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(3, splits.length);
    con = new ExprNodeConstantDesc(29);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(4, splits.length);
    con = new ExprNodeConstantDesc(70);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    assertEquals(5, splits.length);
}