Use of org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer in project hive by apache.
From class TestHCatMultiOutputFormat, method testOutputFormat:
/**
 * Simple test case.
 * <ol>
 * <li>Submits a mapred job which writes one fixed line to each of the tables</li>
 * <li>Uses a Hive fetch task to read the data back and verify it matches what was written</li>
 * </ol>
 *
 * @throws Exception if any error occurs
 */
@Test
public void testOutputFormat() throws Throwable {
  HashMap<String, String> partitionValues = new HashMap<String, String>();
  partitionValues.put("ds", "1");
  partitionValues.put("cluster", "ag");
  ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
  infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
  infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
  infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));
  Job job = new Job(hiveConf, "SampleJob");
  job.setMapperClass(MyMapper.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(MultiOutputFormat.class);
  job.setNumReduceTasks(0);
  JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
  for (int i = 0; i < tableNames.length; i++) {
    configurer.addOutputFormat(tableNames[i], HCatOutputFormat.class,
        BytesWritable.class, HCatRecord.class);
    HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
    HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]), schemaMap.get(tableNames[i]));
  }
  configurer.configure();
  Path filePath = createInputFile();
  FileInputFormat.addInputPath(job, filePath);
  Assert.assertTrue(job.waitForCompletion(true));
  ArrayList<String> outputs = new ArrayList<String>();
  for (String tbl : tableNames) {
    outputs.add(getTableData(tbl, "default").get(0));
  }
  Assert.assertEquals("Output of table " + tableNames[0] + " is not correct",
      "a,a,1,ag", outputs.get(0));
  Assert.assertEquals("Output of table " + tableNames[1] + " is not correct",
      "a,1,ag", outputs.get(1));
  Assert.assertEquals("Output of table " + tableNames[2] + " is not correct",
      "a,a,extra,1,ag", outputs.get(2));
  // Check permissions on the partition dirs and files created
  for (int i = 0; i < tableNames.length; i++) {
    Path partitionFile = new Path(warehousedir + "/" + tableNames[i] + "/ds=1/cluster=ag/part-m-00000");
    FileSystem fs = partitionFile.getFileSystem(mrConf);
    Assert.assertEquals("File permissions of table " + tableNames[i] + " are not correct",
        new FsPermission(tablePerms[i]), fs.getFileStatus(partitionFile).getPermission());
    Assert.assertEquals("File permissions of table " + tableNames[i] + " are not correct",
        new FsPermission(tablePerms[i]), fs.getFileStatus(partitionFile.getParent()).getPermission());
    Assert.assertEquals("File permissions of table " + tableNames[i] + " are not correct",
        new FsPermission(tablePerms[i]), fs.getFileStatus(partitionFile.getParent().getParent()).getPermission());
  }
  LOG.info("File permissions verified");
}
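MyMapper is defined elsewhere in TestHCatMultiOutputFormat and is not shown on this page. As a hedged illustration only, a map-only task feeding MultiOutputFormat routes records through the static MultiOutputFormat.write(alias, key, value, context) call; the alias, record shape, and field values below are illustrative guesses, not the test's actual fixed line:

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.mapreduce.MultiOutputFormat;

// Hypothetical sketch of a map-only task writing HCatRecords through
// MultiOutputFormat; the real MyMapper writes one fixed line per table.
public class SketchMapper extends Mapper<LongWritable, Text, BytesWritable, HCatRecord> {
  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // Illustrative two-field record; partition values (ds, cluster) come from
    // the OutputJobInfo, not from the record itself.
    HCatRecord record = new DefaultHCatRecord(2);
    record.set(0, "a");
    record.set(1, "a");
    // "test_table1" is an illustrative alias; the real test registers its
    // tableNames entries with the JobConfigurer. HCatOutputFormat ignores the key.
    MultiOutputFormat.write("test_table1", null, record, context);
  }
}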
Use of org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer in project hive by apache.
From class TestMultiOutputFormat, method testMultiOutputFormatWithoutReduce:
/**
 * A test job that reads an input file and writes each word, together with the
 * index at which it was encountered, to both a text file and a sequence file,
 * using different key/value types for each output.
 */
@Test
public void testMultiOutputFormatWithoutReduce() throws Throwable {
  Job job = new Job(mrConf, "MultiOutNoReduce");
  job.setMapperClass(MultiOutWordIndexMapper.class);
  job.setJarByClass(this.getClass());
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(MultiOutputFormat.class);
  job.setNumReduceTasks(0);
  JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
  configurer.addOutputFormat("out1", TextOutputFormat.class, IntWritable.class, Text.class);
  configurer.addOutputFormat("out2", SequenceFileOutputFormat.class, Text.class, IntWritable.class);
  Path outDir = new Path(workDir.getPath(), job.getJobName());
  FileOutputFormat.setOutputPath(configurer.getJob("out1"), new Path(outDir, "out1"));
  FileOutputFormat.setOutputPath(configurer.getJob("out2"), new Path(outDir, "out2"));
  String fileContent = "Hello World";
  String inputFile = createInputFile(fileContent);
  FileInputFormat.setInputPaths(job, new Path(inputFile));
  // Test merging of configs
  DistributedCache.addFileToClassPath(new Path(inputFile), job.getConfiguration(), fs);
  String dummyFile = createInputFile("dummy file");
  DistributedCache.addFileToClassPath(new Path(dummyFile),
      configurer.getJob("out1").getConfiguration(), fs);
  // Duplicate of the value above; merging should remove duplicates
  DistributedCache.addFileToClassPath(new Path(inputFile),
      configurer.getJob("out2").getConfiguration(), fs);
  configurer.configure();
  // Verify that the configs were merged
  Path[] fileClassPaths = DistributedCache.getFileClassPaths(job.getConfiguration());
  List<Path> fileClassPathsList = Arrays.asList(fileClassPaths);
  Assert.assertTrue("Cannot find " + (new Path(inputFile)) + " in " + fileClassPathsList,
      fileClassPathsList.contains(new Path(inputFile)));
  Assert.assertTrue("Cannot find " + (new Path(dummyFile)) + " in " + fileClassPathsList,
      fileClassPathsList.contains(new Path(dummyFile)));
  URI[] cacheFiles = DistributedCache.getCacheFiles(job.getConfiguration());
  List<URI> cacheFilesList = Arrays.asList(cacheFiles);
  URI inputFileURI = new Path(inputFile).makeQualified(fs).toUri();
  Assert.assertTrue("Cannot find " + inputFileURI + " in " + cacheFilesList,
      cacheFilesList.contains(inputFileURI));
  URI dummyFileURI = new Path(dummyFile).makeQualified(fs).toUri();
  Assert.assertTrue("Cannot find " + dummyFileURI + " in " + cacheFilesList,
      cacheFilesList.contains(dummyFileURI));
  Assert.assertTrue(job.waitForCompletion(true));
  Path textOutPath = new Path(outDir, "out1/part-m-00000");
  String[] textOutput = readFully(textOutPath).split("\n");
  Path seqOutPath = new Path(outDir, "out2/part-m-00000");
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqOutPath, mrConf);
  Text key = new Text();
  IntWritable value = new IntWritable();
  String[] words = fileContent.split(" ");
  Assert.assertEquals(words.length, textOutput.length);
  LOG.info("Verifying file contents");
  for (int i = 0; i < words.length; i++) {
    Assert.assertEquals((i + 1) + "\t" + words[i], textOutput[i]);
    reader.next(key, value);
    Assert.assertEquals(words[i], key.toString());
    Assert.assertEquals(i + 1, value.get());
  }
  Assert.assertFalse(reader.next(key, value));
  reader.close();
}
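MultiOutWordIndexMapper is also defined elsewhere in TestMultiOutputFormat. Below is a sketch reconstructed from the assertions above, not the class's verbatim source: "out1" receives (index, word) pairs and "out2" receives (word, index) pairs, with a 1-based running word index.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.mapreduce.MultiOutputFormat;

// Hypothetical reconstruction of MultiOutWordIndexMapper from the expected
// outputs: "1\tHello", "2\tWorld" in out1 and (Hello,1), (World,2) in out2.
public class SketchWordIndexMapper extends Mapper<LongWritable, Text, Writable, Writable> {
  private int index = 0;

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    for (String word : value.toString().split(" ")) {
      index++;
      // Same word goes to both outputs, with key and value types swapped.
      MultiOutputFormat.write("out1", new IntWritable(index), new Text(word), context);
      MultiOutputFormat.write("out2", new Text(word), new IntWritable(index), context);
    }
  }
}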
Use of org.apache.hive.hcatalog.mapreduce.MultiOutputFormat.JobConfigurer in project hive by apache.
From class TestMultiOutputFormat, method testMultiOutputFormatWithReduce:
/**
 * A word-count test job that reads an input file and writes the count of each
 * word to both a text file and a sequence file, using different key/value
 * types for each output.
 */
@Test
public void testMultiOutputFormatWithReduce() throws Throwable {
  Job job = new Job(mrConf, "MultiOutWithReduce");
  job.setMapperClass(WordCountMapper.class);
  job.setReducerClass(MultiOutWordCountReducer.class);
  job.setJarByClass(this.getClass());
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(MultiOutputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
  configurer.addOutputFormat("out1", TextOutputFormat.class, IntWritable.class, Text.class);
  configurer.addOutputFormat("out2", SequenceFileOutputFormat.class, Text.class, IntWritable.class);
  configurer.addOutputFormat("out3", NullOutputFormat.class, Text.class, IntWritable.class);
  Path outDir = new Path(workDir.getPath(), job.getJobName());
  FileOutputFormat.setOutputPath(configurer.getJob("out1"), new Path(outDir, "out1"));
  FileOutputFormat.setOutputPath(configurer.getJob("out2"), new Path(outDir, "out2"));
  configurer.configure();
  String fileContent = "Hello World Hello World World";
  String inputFile = createInputFile(fileContent);
  FileInputFormat.setInputPaths(job, new Path(inputFile));
  Assert.assertTrue(job.waitForCompletion(true));
  Path textOutPath = new Path(outDir, "out1/part-r-00000");
  String[] textOutput = readFully(textOutPath).split("\n");
  Path seqOutPath = new Path(outDir, "out2/part-r-00000");
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqOutPath, mrConf);
  Text key = new Text();
  IntWritable value = new IntWritable();
  String[] words = "Hello World".split(" ");
  Assert.assertEquals(words.length, textOutput.length);
  for (int i = 0; i < words.length; i++) {
    // "Hello" occurs twice and "World" three times in the input, hence i + 2.
    Assert.assertEquals((i + 2) + "\t" + words[i], textOutput[i]);
    reader.next(key, value);
    Assert.assertEquals(words[i], key.toString());
    Assert.assertEquals(i + 2, value.get());
  }
  Assert.assertFalse(reader.next(key, value));
  reader.close();
}
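MultiOutWordCountReducer is likewise not shown on this page. Below is a sketch consistent with the assertions above, not the verbatim source: the summed count for each word goes to "out1" as (count, word), to "out2" as (word, count), and to the discarded null output "out3".

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hive.hcatalog.mapreduce.MultiOutputFormat;

// Hypothetical reconstruction of MultiOutWordCountReducer: a standard
// word-count reduce that fans the result out to three configured aliases.
public class SketchWordCountReducer extends Reducer<Text, IntWritable, Writable, Writable> {
  @Override
  protected void reduce(Text word, Iterable<IntWritable> counts, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable count : counts) {
      sum += count.get();
    }
    // out1 is a TextOutputFormat keyed by count; out2 a SequenceFileOutputFormat
    // keyed by word; out3 a NullOutputFormat that swallows its records.
    MultiOutputFormat.write("out1", new IntWritable(sum), word, context);
    MultiOutputFormat.write("out2", word, new IntWritable(sum), context);
    MultiOutputFormat.write("out3", word, new IntWritable(sum), context);
  }
}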