
Example 31 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project incubator-systemml by apache.

The class DelegatingInputFormat, method getRecordReader.

@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
    // Find the InputFormat and then the RecordReader from the
    // TaggedInputSplit.
    TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
    InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(taggedInputSplit.getInputFormatClass(), conf);
    InputSplit inputSplit = taggedInputSplit.getInputSplit();
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
        conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
    }
    return inputFormat.getRecordReader(inputSplit, conf, reporter);
}
Also used : FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
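
The three properties set above let a mapper discover which file, offset, and length its split covers. A minimal sketch of reading them back, assuming the Hadoop-2 property name "mapreduce.map.input.file" ("map.input.file" on Hadoop 1; MRConfigurationNames abstracts this difference), with InputAwareMapper as a hypothetical class:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical mapper reading the per-split metadata that
// DelegatingInputFormat#getRecordReader publishes into the JobConf.
public class InputAwareMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, LongWritable> {

    private String inputFile;

    @Override
    public void configure(JobConf conf) {
        // Set once per split before the mapper runs.
        inputFile = conf.get("mapreduce.map.input.file");
    }

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<Text, LongWritable> out, Reporter reporter)
            throws IOException {
        // Tag each record with the file it came from.
        out.collect(new Text(inputFile), new LongWritable(1L));
    }
}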

Example 32 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project incubator-systemml by apache.

The class DelegatingInputFormat, method getSplits.

public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    JobConf confCopy = new JobConf(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(conf);
    Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(conf);
    Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<Class<? extends InputFormat>, List<Path>>();
    // First, build a map of InputFormats to Paths
    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
        if (!formatPaths.containsKey(entry.getValue().getClass())) {
            formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
        }
        formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
    }
    for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        List<Path> paths = formatEntry.getValue();
        Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<Class<? extends Mapper>, List<Path>>();
        // Now, for each set of paths that share an InputFormat, build
        // a map of Mappers to the paths they're used for
        for (Path path : paths) {
            Class<? extends Mapper> mapperClass = mapperMap.get(path);
            if (!mapperPaths.containsKey(mapperClass)) {
                mapperPaths.put(mapperClass, new LinkedList<Path>());
            }
            mapperPaths.get(mapperClass).add(path);
        }
        // Paths that share both an InputFormat and a Mapper will
        // be added to the same job, and split together.
        for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
            paths = mapEntry.getValue();
            Class<? extends Mapper> mapperClass = mapEntry.getKey();
            if (mapperClass == null) {
                mapperClass = conf.getMapperClass();
            }
            FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
            }
        }
    }
    return splits.toArray(new InputSplit[splits.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Mapper(org.apache.hadoop.mapred.Mapper) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) ArrayList(java.util.ArrayList) List(java.util.List) LinkedList(java.util.LinkedList) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
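
For context, the formatMap and mapperMap consumed by getSplits are populated on the driver side through MultipleInputs.addInputPath. A minimal sketch using the stock org.apache.hadoop.mapred.lib classes (the SystemML MultipleInputs in Example 33 mirrors this API); the paths and the assumption that the sequence file stores (LongWritable, Text) records are illustrative:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(MultipleInputsDriver.class);
        // Each call records a (path, InputFormat, Mapper) triple in the
        // conf and installs DelegatingInputFormat as the job's InputFormat.
        MultipleInputs.addInputPath(conf, new Path("/data/text"),
                TextInputFormat.class, IdentityMapper.class);
        MultipleInputs.addInputPath(conf, new Path("/data/seq"),
                SequenceFileInputFormat.class, IdentityMapper.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(conf, new Path("/out"));
        // At submission, getSplits groups the two paths by their
        // (InputFormat, Mapper) pair and tags every split accordingly.
        JobClient.runJob(conf);
    }
}

Because both paths use IdentityMapper here, the grouping differs only by InputFormat; supplying distinct mapper classes would split the groups further, one per (InputFormat, Mapper) pair.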

Example 33 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project incubator-systemml by apache.

The class MultipleInputs, method getInputFormatMap.

/**
   * Retrieves a map of {@link Path}s to the {@link InputFormat} class
   * that should be used for them.
   * 
   * @param conf The configuration of the job
   * @see #addInputPath(JobConf, Path, Class)
   * @return A map of paths to InputFormats for the job
   */
static Map<Path, InputFormat> getInputFormatMap(JobConf conf) {
    Map<Path, InputFormat> m = new HashMap<Path, InputFormat>();
    String[] pathMappings = conf.get(MRConfigurationNames.MR_INPUT_MULTIPLEINPUTS_DIR_FORMATS).split(",");
    for (String pathMapping : pathMappings) {
        String[] split = pathMapping.split(";");
        InputFormat inputFormat;
        try {
            inputFormat = (InputFormat) ReflectionUtils.newInstance(conf.getClassByName(split[1]), conf);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
        m.put(new Path(split[0]), inputFormat);
    }
    return m;
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) InputFormat(org.apache.hadoop.mapred.InputFormat)
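
The serialized value parsed above is a comma-separated list of "path;formatClass" pairs. A sketch of the round trip, assuming MR_INPUT_MULTIPLEINPUTS_DIR_FORMATS resolves to the Hadoop-2 name "mapreduce.input.multipleinputs.dir.formats" ("mapred.input.dir.formats" on Hadoop 1); since getInputFormatMap is package-private, the parse step is replicated inline:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public class InputFormatMapDemo {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // The wire format written by MultipleInputs.addInputPath.
        conf.set("mapreduce.input.multipleinputs.dir.formats",
                "/data/a;org.apache.hadoop.mapred.TextInputFormat,"
              + "/data/b;org.apache.hadoop.mapred.SequenceFileInputFormat");
        // Same parse as getInputFormatMap: split pairs on ',', then
        // separate path from format class on ';'.
        for (String pathMapping : conf
                .get("mapreduce.input.multipleinputs.dir.formats").split(",")) {
            String[] split = pathMapping.split(";");
            InputFormat format = (InputFormat) ReflectionUtils
                    .newInstance(conf.getClassByName(split[1]), conf);
            System.out.println(new Path(split[0]) + " -> "
                    + format.getClass().getName());
        }
    }
}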

Example 34 with InputFormat

Use of org.apache.hadoop.mapred.InputFormat in project ignite by apache.

The class HadoopV1MapTask, method run.

/** {@inheritDoc} */
@SuppressWarnings("unchecked")
@Override
public void run(HadoopTaskContext taskCtx) throws IgniteCheckedException {
    HadoopJobEx job = taskCtx.job();
    HadoopV2TaskContext taskCtx0 = (HadoopV2TaskContext) taskCtx;
    if (taskCtx.taskInfo().hasMapperIndex())
        HadoopMapperUtils.mapperIndex(taskCtx.taskInfo().mapperIndex());
    else
        HadoopMapperUtils.clearMapperIndex();
    try {
        JobConf jobConf = taskCtx0.jobConf();
        InputFormat inFormat = jobConf.getInputFormat();
        HadoopInputSplit split = info().inputSplit();
        InputSplit nativeSplit;
        if (split instanceof HadoopFileBlock) {
            HadoopFileBlock block = (HadoopFileBlock) split;
            nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(), EMPTY_HOSTS);
        } else
            nativeSplit = (InputSplit) taskCtx0.getNativeSplit(split);
        assert nativeSplit != null;
        Reporter reporter = new HadoopV1Reporter(taskCtx);
        HadoopV1OutputCollector collector = null;
        try {
            collector = collector(jobConf, taskCtx0, !job.info().hasCombiner() && !job.info().hasReducer(), fileName(), taskCtx0.attemptId());
            RecordReader reader = inFormat.getRecordReader(nativeSplit, jobConf, reporter);
            Mapper mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(), jobConf);
            Object key = reader.createKey();
            Object val = reader.createValue();
            assert mapper != null;
            try {
                try {
                    while (reader.next(key, val)) {
                        if (isCancelled())
                            throw new HadoopTaskCancelledException("Map task cancelled.");
                        mapper.map(key, val, collector, reporter);
                    }
                    taskCtx.onMapperFinished();
                } finally {
                    mapper.close();
                }
            } finally {
                collector.closeWriter();
            }
            collector.commit();
        } catch (Exception e) {
            if (collector != null)
                collector.abort();
            throw new IgniteCheckedException(e);
        }
    } finally {
        HadoopMapperUtils.clearMapperIndex();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Reporter(org.apache.hadoop.mapred.Reporter) RecordReader(org.apache.hadoop.mapred.RecordReader) HadoopInputSplit(org.apache.ignite.hadoop.HadoopInputSplit) HadoopFileBlock(org.apache.ignite.internal.processors.hadoop.HadoopFileBlock) FileSplit(org.apache.hadoop.mapred.FileSplit) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) HadoopTaskCancelledException(org.apache.ignite.internal.processors.hadoop.HadoopTaskCancelledException) Mapper(org.apache.hadoop.mapred.Mapper) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) HadoopJobEx(org.apache.ignite.internal.processors.hadoop.HadoopJobEx) InputFormat(org.apache.hadoop.mapred.InputFormat) HadoopTaskCancelledException(org.apache.ignite.internal.processors.hadoop.HadoopTaskCancelledException) JobConf(org.apache.hadoop.mapred.JobConf) HadoopInputSplit(org.apache.ignite.hadoop.HadoopInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) HadoopV2TaskContext(org.apache.ignite.internal.processors.hadoop.impl.v2.HadoopV2TaskContext)
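
The inner while loop above is the canonical old-API (mapred) read pattern: createKey/createValue allocate holder objects once, and next(key, val) refills them in place for every record. A standalone sketch of just that pattern, assuming a TextInputFormat split; all classes are stock Hadoop:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class ReadLoopSketch {
    // Reads one split the same way HadoopV1MapTask feeds its Mapper.
    static long countRecords(TextInputFormat format, InputSplit split,
            JobConf conf) throws IOException {
        RecordReader<LongWritable, Text> reader =
                format.getRecordReader(split, conf, Reporter.NULL);
        try {
            LongWritable key = reader.createKey(); // reused across calls
            Text val = reader.createValue();
            long count = 0;
            while (reader.next(key, val)) // fills key/val in place
                count++;
            return count;
        } finally {
            reader.close();
        }
    }
}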

Aggregations

InputFormat (org.apache.hadoop.mapred.InputFormat): 34 uses
Path (org.apache.hadoop.fs.Path): 23 uses
JobConf (org.apache.hadoop.mapred.JobConf): 20 uses
InputSplit (org.apache.hadoop.mapred.InputSplit): 19 uses
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 15 uses
IOException (java.io.IOException): 11 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 8 uses
ArrayList (java.util.ArrayList): 7 uses
HashMap (java.util.HashMap): 7 uses
Configuration (org.apache.hadoop.conf.Configuration): 6 uses
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 6 uses
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 6 uses
HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat): 5 uses
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 5 uses
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 5 uses
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 5 uses
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 5 uses
NullWritable (org.apache.hadoop.io.NullWritable): 5 uses
Mapper (org.apache.hadoop.mapred.Mapper): 5 uses
Test (org.junit.Test): 5 uses