Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
The class Submitter, method run.
@Override
public int run(String[] args) throws Exception {
  CommandLineParser cli = new CommandLineParser();
  if (args.length == 0) {
    cli.printUsage();
    return 1;
  }
  cli.addOption("input", false, "input path to the maps", "path");
  cli.addOption("output", false, "output path from the reduces", "path");
  cli.addOption("jar", false, "job jar file", "path");
  cli.addOption("inputformat", false, "java classname of InputFormat", "class");
  //cli.addArgument("javareader", false, "is the RecordReader in Java");
  cli.addOption("map", false, "java classname of Mapper", "class");
  cli.addOption("partitioner", false, "java classname of Partitioner", "class");
  cli.addOption("reduce", false, "java classname of Reducer", "class");
  cli.addOption("writer", false, "java classname of OutputFormat", "class");
  cli.addOption("program", false, "URI to application executable", "class");
  cli.addOption("reduces", false, "number of reduces", "num");
  cli.addOption("jobconf", false, "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
  cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
  Parser parser = cli.createParser();
  try {
    GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
    CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());
    JobConf job = new JobConf(getConf());
    if (results.hasOption("input")) {
      FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
    }
    if (results.hasOption("output")) {
      FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
    }
    if (results.hasOption("jar")) {
      job.setJar(results.getOptionValue("jar"));
    }
    if (results.hasOption("inputformat")) {
      setIsJavaRecordReader(job, true);
      job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
    }
    if (results.hasOption("javareader")) {
      setIsJavaRecordReader(job, true);
    }
    if (results.hasOption("map")) {
      setIsJavaMapper(job, true);
      job.setMapperClass(getClass(results, "map", job, Mapper.class));
    }
    if (results.hasOption("partitioner")) {
      job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
    }
    if (results.hasOption("reduce")) {
      setIsJavaReducer(job, true);
      job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
    }
    if (results.hasOption("reduces")) {
      job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
    }
    if (results.hasOption("writer")) {
      setIsJavaRecordWriter(job, true);
      job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
    }
    if (results.hasOption("lazyOutput")) {
      if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
        LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
      }
    }
    if (results.hasOption("program")) {
      setExecutable(job, results.getOptionValue("program"));
    }
    if (results.hasOption("jobconf")) {
      LOG.warn("-jobconf option is deprecated, please use -D instead.");
      String options = results.getOptionValue("jobconf");
      StringTokenizer tokenizer = new StringTokenizer(options, ",");
      while (tokenizer.hasMoreTokens()) {
        String keyVal = tokenizer.nextToken().trim();
        String[] keyValSplit = keyVal.split("=");
        job.set(keyValSplit[0], keyValSplit[1]);
      }
    }
    // if they gave us a jar file, include it into the class path
    String jarFile = job.getJar();
    if (jarFile != null) {
      final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
      //FindBugs complains that creating a URLClassLoader should be
      //in a doPrivileged() block.
      ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
        public ClassLoader run() {
          return new URLClassLoader(urls);
        }
      });
      job.setClassLoader(loader);
    }
    runJob(job);
    return 0;
  } catch (ParseException pe) {
    LOG.info("Error : " + pe);
    cli.printUsage();
    return 1;
  }
}
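For context, a minimal sketch of how this run() method is typically reached: Submitter is a Tool, so it can be driven through ToolRunner with the options parsed above. The paths, executable URI, and the properties passed via -jobconf below are illustrative placeholders, not values taken from the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.pipes.Submitter;
import org.apache.hadoop.util.ToolRunner;

public class PipesSubmitExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical input/output paths and executable URI; -jobconf exercises the
    // deprecation warning and the "n1=v1,n2=v2" parsing shown in run() above.
    String[] pipesArgs = {
        "-input", "/user/demo/in",
        "-output", "/user/demo/out",
        "-program", "hdfs:///apps/wordcount-pipes",
        "-reduces", "2",
        "-jobconf", "mapreduce.job.name=pipes-demo,mapreduce.task.timeout=600000"
    };
    int exitCode = ToolRunner.run(new Configuration(), new Submitter(), pipesArgs);
    System.exit(exitCode);
  }
}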
Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
The class MultipleInputs, method getInputFormatMap.
/**
 * Retrieves a map of {@link Path}s to the {@link InputFormat} class
 * that should be used for them.
 *
 * @param conf The configuration of the job
 * @see #addInputPath(JobConf, Path, Class)
 * @return A map of paths to inputformats for the job
 */
static Map<Path, InputFormat> getInputFormatMap(JobConf conf) {
  Map<Path, InputFormat> m = new HashMap<Path, InputFormat>();
  String[] pathMappings = conf.get("mapreduce.input.multipleinputs.dir.formats").split(",");
  for (String pathMapping : pathMappings) {
    String[] split = pathMapping.split(";");
    InputFormat inputFormat;
    try {
      inputFormat = (InputFormat) ReflectionUtils.newInstance(conf.getClassByName(split[1]), conf);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    m.put(new Path(split[0]), inputFormat);
  }
  return m;
}
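As a rough companion to the parsing above, a sketch of how the property it reads gets populated. Judging from the split logic in getInputFormatMap(), each entry has the form path;InputFormatClassName, with entries joined by commas; the paths and formats here are purely illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class InputFormatMapExample {
  public static void main(String[] args) {
    JobConf conf = new JobConf();
    // Register two illustrative paths, each with its own input format.
    MultipleInputs.addInputPath(conf, new Path("/logs/text"), TextInputFormat.class);
    MultipleInputs.addInputPath(conf, new Path("/logs/seq"), SequenceFileInputFormat.class);
    // Given the parsing in getInputFormatMap(), the stored value should look
    // roughly like:
    //   /logs/text;org.apache.hadoop.mapred.TextInputFormat,/logs/seq;org.apache.hadoop.mapred.SequenceFileInputFormat
    System.out.println(conf.get("mapreduce.input.multipleinputs.dir.formats"));
  }
}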
Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
The class TestMultipleInputs, method testAddInputPathWithMapper.
@Test
public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class, MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs.getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs.getMapperTypeMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")).getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
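A minimal driver sketch built on the same addInputPath calls the test exercises, showing where they sit in a complete job setup. The input/output paths are hypothetical and the two mappers are trivial stand-ins, not classes from the test above.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsDriver {

  // Mapper for plain text input: key is a byte offset, value is the line.
  public static class TextSourceMapper extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(new Text("text"), value);
    }
  }

  // Mapper for key/value text input: key and value are both Text.
  public static class KeyValueSourceMapper extends MapReduceBase
      implements Mapper<Text, Text, Text, Text> {
    public void map(Text key, Text value,
        OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
      out.collect(key, value);
    }
  }

  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(MultipleInputsDriver.class);
    // Hypothetical input directories, each read with its own format and mapper.
    MultipleInputs.addInputPath(conf, new Path("/data/foo"), TextInputFormat.class,
        TextSourceMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/data/bar"), KeyValueTextInputFormat.class,
        KeyValueSourceMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
    JobClient.runJob(conf);
  }
}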
Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class TestOrcRawRecordMerger, method testRecordReaderIncompleteDelta.
/**
 * @param use130Format true means use delta_0001_0001_0000 format, else delta_0001_00001
 */
private void testRecordReaderIncompleteDelta(boolean use130Format) throws Exception {
  final int BUCKET = 1;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf).getRaw();
  Path root = new Path(tmpDir, "testRecordReaderIncompleteDelta").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  // write a base
  AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
      .writingBase(true).minimumTransactionId(0).maximumTransactionId(0)
      .bucket(BUCKET).inspector(inspector).filesystem(fs).finalDestination(root);
  if (!use130Format) {
    options.statementId(-1);
  }
  RecordUpdater ru = of.getRecordUpdater(root, options);
  String[] values = new String[] { "1", "2", "3", "4", "5" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(0, new MyRow(values[i]));
  }
  ru.close(false);
  // write a delta
  options.writingBase(false).minimumTransactionId(10).maximumTransactionId(19);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { "6", "7", "8" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(1, new MyRow(values[i]));
  }
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", root.toString());
  job.set("bucket_count", "2");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
  HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  // read the keys before the delta is flushed
  InputSplit[] splits = inf.getSplits(job, 1);
  assertEquals(2, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
      inf.getRecordReader(splits[0], job, Reporter.NULL);
  NullWritable key = rr.createKey();
  OrcStruct value = rr.createValue();
  System.out.println("Looking at split " + splits[0]);
  for (int i = 1; i < 6; ++i) {
    System.out.println("Checking row " + i);
    assertEquals(true, rr.next(key, value));
    assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
  }
  assertEquals(false, rr.next(key, value));
  ru.flush();
  ru.flush();
  values = new String[] { "9", "10" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(3, new MyRow(values[i]));
  }
  ru.flush();
  splits = inf.getSplits(job, 1);
  assertEquals(2, splits.length);
  rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
  Path sideFile = new Path(root + "/" +
      (use130Format ? AcidUtils.deltaSubdir(10, 19, 0) : AcidUtils.deltaSubdir(10, 19)) +
      "/bucket_00001_flush_length");
  assertEquals(true, fs.exists(sideFile));
  assertEquals(24, fs.getFileStatus(sideFile).getLen());
  for (int i = 1; i < 11; ++i) {
    assertEquals(true, rr.next(key, value));
    assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
  }
  assertEquals(false, rr.next(key, value));
}
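Distilled from the test above, a sketch of just the reader-side setup: the JobConf properties the test gives OrcInputFormat before calling getSplits() and getRecordReader() on an ACID table directory. The directory path and the column name/type lists are placeholders, and raw types are used as in the test itself.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class OrcAcidReadSketch {
  @SuppressWarnings({"rawtypes", "unchecked"})
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf();
    job.set("mapred.input.dir", "/tmp/orc-acid-table");            // placeholder table root
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "mytext");       // placeholder column names
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "string"); // placeholder column types
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    // Each value returned by the reader is an ORC row struct whose columns are
    // accessed with getFieldValue(i), as the assertions in the test show.
    InputFormat inf = new OrcInputFormat();
    for (InputSplit split : inf.getSplits(job, 1)) {
      RecordReader rr = inf.getRecordReader(split, job, Reporter.NULL);
      Object key = rr.createKey();
      Object value = rr.createValue();
      while (rr.next(key, value)) {
        System.out.println(value);
      }
      rr.close();
    }
  }
}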
Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class TestOrcRawRecordMerger, method testRecordReaderNewBaseAndDelta.
/**
* Test the RecordReader when there is a new base and a delta.
* @throws Exception
*/
@Test
public void testRecordReaderNewBaseAndDelta() throws Exception {
  final int BUCKET = 11;
  Configuration conf = new Configuration();
  OrcOutputFormat of = new OrcOutputFormat();
  FileSystem fs = FileSystem.getLocal(conf);
  Path root = new Path(tmpDir, "testRecordReaderNewBaseAndDelta").makeQualified(fs);
  fs.delete(root, true);
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class,
        ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  // write the base
  MemoryManager mgr = new MemoryManager(conf) {

    int rowsAddedSinceCheck = 0;

    @Override
    public synchronized void addedRow(int rows) throws IOException {
      rowsAddedSinceCheck += rows;
      if (rowsAddedSinceCheck >= 2) {
        notifyWriters();
        rowsAddedSinceCheck = 0;
      }
    }
  };
  // make 5 stripes with 2 rows each
  OrcRecordUpdater.OrcOptions options = (OrcRecordUpdater.OrcOptions) new OrcRecordUpdater.OrcOptions(conf)
      .writingBase(true).minimumTransactionId(0).maximumTransactionId(0)
      .bucket(BUCKET).inspector(inspector).filesystem(fs);
  options.orcOptions(OrcFile.writerOptions(conf).stripeSize(1).blockPadding(false)
      .compress(CompressionKind.NONE).memory(mgr).batchSize(2));
  options.finalDestination(root);
  RecordUpdater ru = of.getRecordUpdater(root, options);
  String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1", "3.0", "ignore.4", "ignore.5", "ignore.6" };
  for (int i = 0; i < values.length; ++i) {
    ru.insert(0, new BigRow(i, i, values[i], i, i));
  }
  ru.close(false);
  // write a delta
  options.writingBase(false).minimumTransactionId(1).maximumTransactionId(1).recordIdColumn(5);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
  for (int i = 0; i < values.length; ++i) {
    if (values[i] != null) {
      ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
    }
  }
  ru.delete(100, new BigRow(9, 0, BUCKET));
  ru.close(false);
  // write a delta
  options.minimumTransactionId(2).maximumTransactionId(2);
  ru = of.getRecordUpdater(root, options);
  values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
  for (int i = 0; i < values.length; ++i) {
    if (values[i] != null) {
      ru.update(2, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
    }
  }
  ru.delete(100, new BigRow(8, 0, BUCKET));
  ru.close(false);
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.min.split.size", "1");
  job.set("mapred.max.split.size", "2");
  job.set("mapred.input.dir", root.toString());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
  HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
  InputSplit[] splits = inf.getSplits(job, 5);
  assertEquals(5, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
  // loop through the first 4 splits and read each; the last split is checked below
  for (int i = 0; i < 4; ++i) {
    System.out.println("starting split " + i + " = " + splits[i]);
    rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    // there should be exactly two rows per split
    for (int j = 0; j < 2; ++j) {
      System.out.println("i = " + i + ", j = " + j);
      assertEquals(true, rr.next(key, value));
      System.out.println("record = " + value);
      assertEquals(i + "." + j, value.getFieldValue(2).toString());
    }
    assertEquals(false, rr.next(key, value));
  }
  rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
  assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
}