
Example 1 with SequenceFileInputFormat

Use of org.apache.hadoop.mapred.SequenceFileInputFormat in the Apache Hadoop project.

Class DataJoinJob, method createDataJoinJob:

public static JobConf createDataJoinJob(String[] args) throws IOException {
    String inputDir = args[0];
    String outputDir = args[1];
    // Input format defaults to SequenceFileInputFormat; passing "text" as args[2] switches to TextInputFormat.
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    // Output defaults to TextOutputFormat with Text values; any other args[7] selects
    // SequenceFileOutputFormat and uses args[7] as the output value class.
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    // Defaults for the optional trailing arguments: args[8] (max values per group) and args[9] (job name).
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);
    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir), true);
    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormat(inputFormat);
    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);
    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat), TextOutputFormat (org.apache.hadoop.mapred.TextOutputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), Text (org.apache.hadoop.io.Text), JobConf (org.apache.hadoop.mapred.JobConf)
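
createDataJoinJob only assembles the JobConf; submitting the job is left to the caller. Below is a minimal driver sketch, not taken from Hadoop: the input/output paths and the mapper, reducer and value class names are placeholders, and the DataJoinJob import assumes the hadoop-datajoin contrib package. Only createDataJoinJob and JobClient.runJob are part of the API shown above.

import java.io.IOException;

import org.apache.hadoop.contrib.utils.join.DataJoinJob;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class DataJoinDriver {
    public static void main(String[] unused) throws IOException {
        String[] args = {
            // args[0], args[1]: input and output directories (placeholders)
            "/user/me/join-input", "/user/me/join-output",
            // args[2]: any value other than "text" keeps SequenceFileInputFormat
            "seq",
            // args[3]: number of reducers
            "4",
            // args[4]..args[6]: mapper, reducer and map-output value classes (placeholders)
            "com.example.JoinMapper", "com.example.JoinReducer", "com.example.TaggedRecord",
            // args[7]: "text" selects TextOutputFormat with Text output values
            "text"
        };
        JobConf job = DataJoinJob.createDataJoinJob(args);
        // Submit the configured job and block until it completes.
        JobClient.runJob(job);
    }
}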

Example 2 with SequenceFileInputFormat

Use of org.apache.hadoop.mapred.SequenceFileInputFormat in the Apache Tez project.

Class TestMultiMRInput, method testSingleSplit:

@Test(timeout = 5000)
public void testSingleSplit() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);
    InputContext inputContext = createTezInputContext(jobConf);
    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    AtomicLong inputLength = new AtomicLong();
    LinkedHashMap<LongWritable, Text> data = createSplits(1, workDir, jobConf, inputLength);
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);
    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(0, splitProto.toByteString().asReadOnlyByteBuffer());
    List<Event> eventList = new ArrayList<Event>();
    eventList.add(event);
    input.handleEvents(eventList);
    assertReaders(input, data, 1, inputLength.get());
}
Also used: Path (org.apache.hadoop.fs.Path), SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat), InputContext (org.apache.tez.runtime.api.InputContext), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), AtomicLong (java.util.concurrent.atomic.AtomicLong), Event (org.apache.tez.runtime.api.Event), InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto), Test (org.junit.Test)
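
The assertReaders helper called at the end of the test is not shown on this page. As a rough illustration of what it verifies, the records delivered by MultiMRInput can be drained through its key-value readers. This is a hedged sketch assuming Tez's MultiMRInput.getKeyValueReaders() and KeyValueReader API; it is not the test's actual helper.

import java.util.LinkedHashMap;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.tez.mapreduce.input.MultiMRInput;
import org.apache.tez.runtime.library.api.KeyValueReader;

class ReaderDump {
    // Illustrative only: collects every record produced by the input's readers.
    static LinkedHashMap<LongWritable, Text> readAll(MultiMRInput input) throws Exception {
        LinkedHashMap<LongWritable, Text> read = new LinkedHashMap<LongWritable, Text>();
        // One KeyValueReader per physical input (split) handed to MultiMRInput.
        for (KeyValueReader reader : input.getKeyValueReaders()) {
            while (reader.next()) {
                // The reader may reuse its Writable instances, so copy before storing.
                LongWritable key = new LongWritable(((LongWritable) reader.getCurrentKey()).get());
                Text value = new Text((Text) reader.getCurrentValue());
                read.put(key, value);
            }
        }
        return read;
    }
}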

Example 3 with SequenceFileInputFormat

Use of org.apache.hadoop.mapred.SequenceFileInputFormat in the Apache Tez project.

Class TestMultiMRInput, method testMultipleSplits:

@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);
    InputContext inputContext = createTezInputContext(jobConf);
    MultiMRInput input = new MultiMRInput(inputContext, 2);
    input.initialize();
    AtomicLong inputLength = new AtomicLong();
    LinkedHashMap<LongWritable, Text> data = createSplits(2, workDir, jobConf, inputLength);
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 2);
    assertEquals(2, splits.length);
    MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0, splitProto1.toByteString().asReadOnlyByteBuffer());
    MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(0, splitProto2.toByteString().asReadOnlyByteBuffer());
    List<Event> eventList = new ArrayList<Event>();
    eventList.add(event1);
    eventList.add(event2);
    input.handleEvents(eventList);
    assertReaders(input, data, 2, inputLength.get());
}
Also used: Path (org.apache.hadoop.fs.Path), SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat), InputContext (org.apache.tez.runtime.api.InputContext), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), AtomicLong (java.util.concurrent.atomic.AtomicLong), Event (org.apache.tez.runtime.api.Event), InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto), Test (org.junit.Test)

Example 4 with SequenceFileInputFormat

Use of org.apache.hadoop.mapred.SequenceFileInputFormat in the Apache Tez project.

Class MapUtils, method createInputSplit:

private static InputSplit createInputSplit(FileSystem fs, Path workDir, JobConf job, Path file, int numKVs) throws IOException {
    FileInputFormat.setInputPaths(job, workDir);
    LOG.info("Generating data at path: " + file);
    // create a file with length entries
    @SuppressWarnings("deprecation") SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, file, LongWritable.class, Text.class);
    try {
        Random r = new Random(System.currentTimeMillis());
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i = numKVs; i > 0; i--) {
            key.set(r.nextInt(1000));
            value.set(Integer.toString(i));
            writer.append(key, value);
            LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
        }
    } finally {
        writer.close();
    }
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(job, 1);
    System.err.println("#split = " + splits.length + " ; " + "#locs = " + splits[0].getLocations().length + "; " + "loc = " + splits[0].getLocations()[0] + "; " + "off = " + splits[0].getLength() + "; " + "file = " + ((FileSplit) splits[0]).getPath());
    return splits[0];
}
Also used: SequenceFile (org.apache.hadoop.io.SequenceFile), Random (java.util.Random), SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat), Text (org.apache.hadoop.io.Text), LongWritable (org.apache.hadoop.io.LongWritable), FileSplit (org.apache.hadoop.mapred.FileSplit), InputSplit (org.apache.hadoop.mapred.InputSplit)
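
The split returned above can be read back with the same input format. A minimal sketch using the classic org.apache.hadoop.mapred RecordReader API; the wrapper class and method names are illustrative only.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

class SplitReadBack {
    static void readBack(JobConf job, InputSplit split) throws IOException {
        SequenceFileInputFormat<LongWritable, Text> format =
                new SequenceFileInputFormat<LongWritable, Text>();
        // Reporter.NULL is sufficient outside a running task.
        RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        try {
            // next() fills key and value in place and returns false at the end of the split.
            while (reader.next(key, value)) {
                System.out.println(key.get() + " -> " + value);
            }
        } finally {
            reader.close();
        }
    }
}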

Example 5 with SequenceFileInputFormat

Use of org.apache.hadoop.mapred.SequenceFileInputFormat in the Apache Tez project.

Class TestMultiMRInput, method testExtraEvents:

@Test(timeout = 5000)
public void testExtraEvents() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testExtraEvents");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);
    InputContext inputContext = createTezInputContext(jobConf);
    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    createSplits(1, workDir, jobConf, new AtomicLong());
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);
    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0, splitProto.toByteString().asReadOnlyByteBuffer());
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(1, splitProto.toByteString().asReadOnlyByteBuffer());
    List<Event> eventList = new ArrayList<Event>();
    eventList.add(event1);
    eventList.add(event2);
    try {
        input.handleEvents(eventList);
        fail("Expecting Exception due to too many events");
    } catch (Exception e) {
        assertTrue(e.getMessage().contains("Unexpected event. All physical sources already initialized"));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat), InputContext (org.apache.tez.runtime.api.InputContext), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), AtomicLong (java.util.concurrent.atomic.AtomicLong), Event (org.apache.tez.runtime.api.Event), InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent), LongWritable (org.apache.hadoop.io.LongWritable), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto), Test (org.junit.Test)

Aggregations

Classes used across the examples above, with the number of examples each appears in:

Text (org.apache.hadoop.io.Text): 5
SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat): 5
Path (org.apache.hadoop.fs.Path): 4
LongWritable (org.apache.hadoop.io.LongWritable): 4
InputSplit (org.apache.hadoop.mapred.InputSplit): 4
JobConf (org.apache.hadoop.mapred.JobConf): 4
ArrayList (java.util.ArrayList): 3
AtomicLong (java.util.concurrent.atomic.AtomicLong): 3
MRSplitProto (org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto): 3
Event (org.apache.tez.runtime.api.Event): 3
InputContext (org.apache.tez.runtime.api.InputContext): 3
InputDataInformationEvent (org.apache.tez.runtime.api.events.InputDataInformationEvent): 3
Test (org.junit.Test): 3
IOException (java.io.IOException): 1
Random (java.util.Random): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
SequenceFile (org.apache.hadoop.io.SequenceFile): 1
FileSplit (org.apache.hadoop.mapred.FileSplit): 1
TextOutputFormat (org.apache.hadoop.mapred.TextOutputFormat): 1