
Example 1 with JobConfigurer

Use of org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer in project hive by apache.
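
MultiOutputFormat allows one MapReduce job to write to several outputs, each registered under an alias. The pattern in all three examples below is the same: obtain a JobConfigurer with MultiOutputFormat.createConfigurer(job), register each alias with addOutputFormat(alias, outputFormatClass, keyClass, valueClass), configure the alias-specific Job returned by getJob(alias), and finally call configure() to merge the per-alias settings into the parent job. Inside the mapper or reducer, records are routed to an alias with the static MultiOutputFormat.write(alias, key, value, context) helper.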

From the class TestHCatMultiOutputFormat, method testOutputFormat:

/**
 * Simple test case.
 * <ol>
 * <li>Submits a mapred job which writes out one fixed line to each of the tables</li>
 * <li>Uses a Hive fetch task to read the data back and verify it matches what was written</li>
 * </ol>
 *
 * @throws Exception if any error occurs
 */
@Test
public void testOutputFormat() throws Throwable {
    HashMap<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("ds", "1");
    partitionValues.put("cluster", "ag");
    ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
    infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));
    Job job = new Job(hiveConf, "SampleJob");
    job.setMapperClass(MyMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);
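    // Register each table as its own output alias; getJob(alias) returns a
    // per-alias Job view that can be configured independently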
    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
    for (int i = 0; i < tableNames.length; i++) {
        configurer.addOutputFormat(tableNames[i], HCatOutputFormat.class, BytesWritable.class, HCatRecord.class);
        HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
        HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]), schemaMap.get(tableNames[i]));
    }
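    // Merge the per-alias configurations into the parent job's configuration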
    configurer.configure();
    Path filePath = createInputFile();
    FileInputFormat.addInputPath(job, filePath);
    Assert.assertTrue(job.waitForCompletion(true));
    ArrayList<String> outputs = new ArrayList<String>();
    for (String tbl : tableNames) {
        outputs.add(getTableData(tbl, "default").get(0));
    }
    Assert.assertEquals("Comparing output of table " + tableNames[0] + " is not correct", outputs.get(0), "a,a,1,ag");
    Assert.assertEquals("Comparing output of table " + tableNames[1] + " is not correct", outputs.get(1), "a,1,ag");
    Assert.assertEquals("Comparing output of table " + tableNames[2] + " is not correct", outputs.get(2), "a,a,extra,1,ag");
    // Check permissions on the partition directories and files created
    for (int i = 0; i < tableNames.length; i++) {
        Path partitionFile = new Path(warehousedir + "/" + tableNames[i] + "/ds=1/cluster=ag/part-m-00000");
        FileSystem fs = partitionFile.getFileSystem(mrConf);
        Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct", fs.getFileStatus(partitionFile).getPermission(), new FsPermission(tablePerms[i]));
        Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct", fs.getFileStatus(partitionFile.getParent()).getPermission(), new FsPermission(tablePerms[i]));
        Assert.assertEquals("File permissions of table " + tableNames[i] + " is not correct", fs.getFileStatus(partitionFile.getParent().getParent()).getPermission(), new FsPermission(tablePerms[i]));
    }
    LOG.info("File permissions verified");
}
Also used: Path (org.apache.hadoop.fs.Path), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), JobConfigurer (org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer), FileSystem (org.apache.hadoop.fs.FileSystem), FsPermission (org.apache.hadoop.fs.permission.FsPermission), Job (org.apache.hadoop.mapreduce.Job), Test (org.junit.Test)
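
A note on MyMapper, which is referenced above but not shown on this page: below is a minimal sketch of the shape such a mapper can take, not the actual implementation from TestHCatMultiOutputFormat. It assumes the input line is comma-separated and that schemaMap exposes each table's HCatSchema, and it uses DefaultHCatRecord (org.apache.hive.hcatalog.data) plus MultiOutputFormat's static write(alias, key, value, context) helper to route one record to every registered table alias.

private static class MyMapper extends Mapper<LongWritable, Text, BytesWritable, HCatRecord> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Hypothetical record construction: the real test builds a specific
        // record per table schema (note the literal "extra" column asserted
        // for the third table above)
        String[] fields = value.toString().split(",");
        for (String table : tableNames) {
            HCatRecord record = new DefaultHCatRecord(schemaMap.get(table).size());
            for (int col = 0; col < schemaMap.get(table).size(); col++) {
                record.set(col, fields[col]);
            }
            // The alias passed to write() selects which configured output
            // receives the record; the key is ignored by HCatOutputFormat
            MultiOutputFormat.write(table, null, record, context);
        }
    }
}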

Example 2 with JobConfigurer

Use of org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer in project hive by apache.

From the class TestMultiOutputFormat, method testMultiOutputFormatWithoutReduce:

/**
 * A test job that reads an input file and writes each word and the index
 * at which it was encountered to both a text file and a sequence file,
 * with the key and value types swapped between the two outputs.
 */
@Test
public void testMultiOutputFormatWithoutReduce() throws Throwable {
    Job job = new Job(mrConf, "MultiOutNoReduce");
    job.setMapperClass(MultiOutWordIndexMapper.class);
    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);
    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
    configurer.addOutputFormat("out1", TextOutputFormat.class, IntWritable.class, Text.class);
    configurer.addOutputFormat("out2", SequenceFileOutputFormat.class, Text.class, IntWritable.class);
    Path outDir = new Path(workDir.getPath(), job.getJobName());
    FileOutputFormat.setOutputPath(configurer.getJob("out1"), new Path(outDir, "out1"));
    FileOutputFormat.setOutputPath(configurer.getJob("out2"), new Path(outDir, "out2"));
    String fileContent = "Hello World";
    String inputFile = createInputFile(fileContent);
    FileInputFormat.setInputPaths(job, new Path(inputFile));
    // Test for merging of configs
    DistributedCache.addFileToClassPath(new Path(inputFile), job.getConfiguration(), fs);
    String dummyFile = createInputFile("dummy file");
    DistributedCache.addFileToClassPath(new Path(dummyFile), configurer.getJob("out1").getConfiguration(), fs);
    // Add a duplicate of the same path; merging should remove duplicates
    DistributedCache.addFileToClassPath(new Path(inputFile), configurer.getJob("out2").getConfiguration(), fs);
    configurer.configure();
    // Verify that the configs were merged
    Path[] fileClassPaths = DistributedCache.getFileClassPaths(job.getConfiguration());
    List<Path> fileClassPathsList = Arrays.asList(fileClassPaths);
    Assert.assertTrue("Cannot find " + (new Path(inputFile)) + " in " + fileClassPathsList, fileClassPathsList.contains(new Path(inputFile)));
    Assert.assertTrue("Cannot find " + (new Path(dummyFile)) + " in " + fileClassPathsList, fileClassPathsList.contains(new Path(dummyFile)));
    URI[] cacheFiles = DistributedCache.getCacheFiles(job.getConfiguration());
    List<URI> cacheFilesList = Arrays.asList(cacheFiles);
    URI inputFileURI = new Path(inputFile).makeQualified(fs).toUri();
    Assert.assertTrue("Cannot find " + inputFileURI + " in " + cacheFilesList, cacheFilesList.contains(inputFileURI));
    URI dummyFileURI = new Path(dummyFile).makeQualified(fs).toUri();
    Assert.assertTrue("Cannot find " + dummyFileURI + " in " + cacheFilesList, cacheFilesList.contains(dummyFileURI));
    Assert.assertTrue(job.waitForCompletion(true));
    Path textOutPath = new Path(outDir, "out1/part-m-00000");
    String[] textOutput = readFully(textOutPath).split("\n");
    Path seqOutPath = new Path(outDir, "out2/part-m-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqOutPath, mrConf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    String[] words = fileContent.split(" ");
    Assert.assertEquals(words.length, textOutput.length);
    LOG.info("Verifying file contents");
    for (int i = 0; i < words.length; i++) {
        Assert.assertEquals((i + 1) + "\t" + words[i], textOutput[i]);
        reader.next(key, value);
        Assert.assertEquals(words[i], key.toString());
        Assert.assertEquals((i + 1), value.get());
    }
    Assert.assertFalse(reader.next(key, value));
}
Also used: Path (org.apache.hadoop.fs.Path), JobConfigurer (org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer), Text (org.apache.hadoop.io.Text), URI (java.net.URI), SequenceFile (org.apache.hadoop.io.SequenceFile), Job (org.apache.hadoop.mapreduce.Job), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
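
MultiOutWordIndexMapper is referenced above but not shown on this page. The following is a minimal sketch consistent with the assertions: indices start at 1, "out1" receives (index, word) pairs for the text file, and "out2" receives (word, index) pairs for the sequence file. It assumes MultiOutputFormat's static write(alias, key, value, context) helper and java.util.StringTokenizer.

private static class MultiOutWordIndexMapper extends Mapper<LongWritable, Text, Writable, Writable> {
    private final IntWritable index = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            // The same pair is routed to both aliases with key and value
            // swapped, matching the (IntWritable, Text) and (Text, IntWritable)
            // types registered with addOutputFormat above
            MultiOutputFormat.write("out1", index, word, context);
            MultiOutputFormat.write("out2", word, index, context);
            index.set(index.get() + 1);
        }
    }
}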

Example 3 with JobConfigurer

Use of org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer in project hive by apache.

From the class TestMultiOutputFormat, method testMultiOutputFormatWithReduce:

/**
 * A word-count test job that reads an input file and writes the count of
 * each word to a text file and a sequence file with different key/value types.
 */
@Test
public void testMultiOutputFormatWithReduce() throws Throwable {
    Job job = new Job(mrConf, "MultiOutWithReduce");
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(MultiOutWordCountReducer.class);
    job.setJarByClass(this.getClass());
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
    configurer.addOutputFormat("out1", TextOutputFormat.class, IntWritable.class, Text.class);
    configurer.addOutputFormat("out2", SequenceFileOutputFormat.class, Text.class, IntWritable.class);
    configurer.addOutputFormat("out3", NullOutputFormat.class, Text.class, IntWritable.class);
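    // NullOutputFormat discards records, so no output path is set for "out3" below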
    Path outDir = new Path(workDir.getPath(), job.getJobName());
    FileOutputFormat.setOutputPath(configurer.getJob("out1"), new Path(outDir, "out1"));
    FileOutputFormat.setOutputPath(configurer.getJob("out2"), new Path(outDir, "out2"));
    configurer.configure();
    String fileContent = "Hello World Hello World World";
    String inputFile = createInputFile(fileContent);
    FileInputFormat.setInputPaths(job, new Path(inputFile));
    Assert.assertTrue(job.waitForCompletion(true));
    Path textOutPath = new Path(outDir, "out1/part-r-00000");
    String[] textOutput = readFully(textOutPath).split("\n");
    Path seqOutPath = new Path(outDir, "out2/part-r-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqOutPath, mrConf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    String[] words = "Hello World".split(" ");
    Assert.assertEquals(words.length, textOutput.length);
    for (int i = 0; i < words.length; i++) {
        Assert.assertEquals((i + 2) + "\t" + words[i], textOutput[i]);
        reader.next(key, value);
        Assert.assertEquals(words[i], key.toString());
        Assert.assertEquals((i + 2), value.get());
    }
    Assert.assertFalse(reader.next(key, value));
}
Also used: Path (org.apache.hadoop.fs.Path), SequenceFile (org.apache.hadoop.io.SequenceFile), JobConfigurer (org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer), Text (org.apache.hadoop.io.Text), Job (org.apache.hadoop.mapreduce.Job), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
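
MultiOutWordCountReducer is referenced above but not shown on this page (WordCountMapper is presumably a standard tokenize-and-emit-(word, 1) mapper). Below is a minimal sketch consistent with the assertions: each word's counts are summed, then the result is fanned out to all three aliases, with "out3" silently discarded by NullOutputFormat.

private static class MultiOutWordCountReducer extends Reducer<Text, IntWritable, Writable, Writable> {
    private final IntWritable count = new IntWritable();

    @Override
    protected void reduce(Text word, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        count.set(sum);
        MultiOutputFormat.write("out1", count, word, context);  // text file line: count TAB word
        MultiOutputFormat.write("out2", word, count, context);  // sequence file: word -> count
        MultiOutputFormat.write("out3", word, count, context);  // dropped by NullOutputFormat
    }
}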

Aggregations

Path (org.apache.hadoop.fs.Path): 3 examples
Job (org.apache.hadoop.mapreduce.Job): 3 examples
JobConfigurer (org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer): 3 examples
Test (org.junit.Test): 3 examples
IntWritable (org.apache.hadoop.io.IntWritable): 2 examples
SequenceFile (org.apache.hadoop.io.SequenceFile): 2 examples
Text (org.apache.hadoop.io.Text): 2 examples
URI (java.net.URI): 1 example
ArrayList (java.util.ArrayList): 1 example
HashMap (java.util.HashMap): 1 example
FileSystem (org.apache.hadoop.fs.FileSystem): 1 example
FsPermission (org.apache.hadoop.fs.permission.FsPermission): 1 example