use of org.apache.hadoop.io.IntWritable in project hive by apache.
the class TestMultiOutputFormat method testMultiOutputFormatWithoutReduce.
/**
 * A test job that reads an input file and writes each word, together with the
 * index at which it was encountered, to both a text file and a sequence file,
 * with the key and value types swapped between the two outputs.
 */
@Test
public void testMultiOutputFormatWithoutReduce() throws Throwable {
  Job job = new Job(mrConf, "MultiOutNoReduce");
  job.setMapperClass(MultiOutWordIndexMapper.class);
  job.setJarByClass(this.getClass());
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(MultiOutputFormat.class);
  job.setNumReduceTasks(0);
  JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
  configurer.addOutputFormat("out1", TextOutputFormat.class, IntWritable.class, Text.class);
  configurer.addOutputFormat("out2", SequenceFileOutputFormat.class, Text.class, IntWritable.class);
  Path outDir = new Path(workDir.getPath(), job.getJobName());
  FileOutputFormat.setOutputPath(configurer.getJob("out1"), new Path(outDir, "out1"));
  FileOutputFormat.setOutputPath(configurer.getJob("out2"), new Path(outDir, "out2"));
  String fileContent = "Hello World";
  String inputFile = createInputFile(fileContent);
  FileInputFormat.setInputPaths(job, new Path(inputFile));
  // Test for merging of configs
  DistributedCache.addFileToClassPath(new Path(inputFile), job.getConfiguration(), fs);
  String dummyFile = createInputFile("dummy file");
  DistributedCache.addFileToClassPath(new Path(dummyFile), configurer.getJob("out1").getConfiguration(), fs);
  // Duplicate of the value; merging should remove duplicates
  DistributedCache.addFileToClassPath(new Path(inputFile), configurer.getJob("out2").getConfiguration(), fs);
  configurer.configure();
  // Verify that the configs are merged
  Path[] fileClassPaths = DistributedCache.getFileClassPaths(job.getConfiguration());
  List<Path> fileClassPathsList = Arrays.asList(fileClassPaths);
  Assert.assertTrue("Cannot find " + (new Path(inputFile)) + " in " + fileClassPathsList,
      fileClassPathsList.contains(new Path(inputFile)));
  Assert.assertTrue("Cannot find " + (new Path(dummyFile)) + " in " + fileClassPathsList,
      fileClassPathsList.contains(new Path(dummyFile)));
  URI[] cacheFiles = DistributedCache.getCacheFiles(job.getConfiguration());
  List<URI> cacheFilesList = Arrays.asList(cacheFiles);
  URI inputFileURI = new Path(inputFile).makeQualified(fs).toUri();
  Assert.assertTrue("Cannot find " + inputFileURI + " in " + cacheFilesList,
      cacheFilesList.contains(inputFileURI));
  URI dummyFileURI = new Path(dummyFile).makeQualified(fs).toUri();
  Assert.assertTrue("Cannot find " + dummyFileURI + " in " + cacheFilesList,
      cacheFilesList.contains(dummyFileURI));
  Assert.assertTrue(job.waitForCompletion(true));
  Path textOutPath = new Path(outDir, "out1/part-m-00000");
  String[] textOutput = readFully(textOutPath).split("\n");
  Path seqOutPath = new Path(outDir, "out2/part-m-00000");
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqOutPath, mrConf);
  Text key = new Text();
  IntWritable value = new IntWritable();
  String[] words = fileContent.split(" ");
  Assert.assertEquals(words.length, textOutput.length);
  LOG.info("Verifying file contents");
  for (int i = 0; i < words.length; i++) {
    Assert.assertEquals((i + 1) + "\t" + words[i], textOutput[i]);
    reader.next(key, value);
    Assert.assertEquals(words[i], key.toString());
    Assert.assertEquals((i + 1), value.get());
  }
  Assert.assertFalse(reader.next(key, value));
}
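The mapper wired in above, MultiOutWordIndexMapper, is not part of this snippet. A minimal sketch of what such a nested mapper might look like, assuming MultiOutputFormat exposes a static write(alias, key, value, context) helper and that the outputs match the test's assertions (out1: 1-based index as key, word as value; out2: word as key, index as value):

// Hypothetical sketch; names and the MultiOutputFormat.write(...) call are
// assumptions inferred from the assertions above, not the verbatim Hive source.
private static class MultiOutWordIndexMapper
    extends Mapper<LongWritable, Text, Writable, Writable> {

  private final IntWritable index = new IntWritable(1);
  private final Text word = new Text();

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      // "out1" is the text output: index (key), word (value)
      MultiOutputFormat.write("out1", index, word, context);
      // "out2" is the sequence file: word (key), index (value)
      MultiOutputFormat.write("out2", word, index, context);
      index.set(index.get() + 1);
    }
  }
}

With "Hello World" as input, this would produce "1\tHello" and "2\tWorld" in out1 and (Hello, 1), (World, 2) in out2, which is exactly what the verification loop checks.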
use of org.apache.hadoop.io.IntWritable in project crunch by cloudera.
the class TupleWritablePartitionerTest method testGetPartition.
@Test
public void testGetPartition() {
  IntWritable intWritable = new IntWritable(3);
  TupleWritable key = new TupleWritable(new Writable[] { intWritable });
  assertEquals(3, tupleWritableParitioner.getPartition(key, NullWritable.get(), 5));
  assertEquals(1, tupleWritableParitioner.getPartition(key, NullWritable.get(), 2));
}
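The assertions only make sense if the partitioner routes on the tuple's first element: IntWritable's hashCode() is the wrapped int, so 3 % 5 == 3 and 3 % 2 == 1. A sketch of a partitioner with that behavior, assuming TupleWritable exposes get(int); this is illustrative, not the actual Crunch class:

// Illustrative partitioner consistent with the assertions above.
public class FirstElementPartitioner extends Partitioner<TupleWritable, NullWritable> {
  @Override
  public int getPartition(TupleWritable key, NullWritable value, int numPartitions) {
    // Partition on the hash of the first field only; mask the sign bit so the
    // result stays non-negative before taking the modulus.
    return (key.get(0).hashCode() & Integer.MAX_VALUE) % numPartitions;
  }
}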
use of org.apache.hadoop.io.IntWritable in project learning-spark by databricks.
the class BasicLoadSequenceFile method main.
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]");
  }
  String master = args[0];
  String fileName = args[1];
  JavaSparkContext sc = new JavaSparkContext(master, "basicloadsequencefile",
      System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
  JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
  List<Tuple2<String, Integer>> resultList = result.collect();
  for (Tuple2<String, Integer> record : resultList) {
    System.out.println(record);
  }
}
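ConvertToNativeTypes is not shown in this snippet; a plausible sketch, assuming it is a PairFunction that unwraps the Hadoop Writables into plain Java types (the exact shape is an assumption):

// Hypothetical helper matching the usage above: converts Writable pairs read
// from the sequence file into native String/Integer pairs.
static class ConvertToNativeTypes
    implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
  @Override
  public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> record) {
    return new Tuple2<>(record._1().toString(), record._2().get());
  }
}

Converting to native types before collect() avoids shipping Text and IntWritable objects, which are not java.io.Serializable, back to the driver.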
use of org.apache.hadoop.io.IntWritable in project learning-spark by databricks.
the class BasicSaveSequenceFile method main.
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
  }
  String master = args[0];
  String fileName = args[1];
  JavaSparkContext sc = new JavaSparkContext(master, "basicloadsequencefile",
      System.getenv("SPARK_HOME"), System.getenv("JARS"));
  List<Tuple2<String, Integer>> input = new ArrayList<>();
  input.add(new Tuple2<>("coffee", 1));
  input.add(new Tuple2<>("coffee", 2));
  input.add(new Tuple2<>("pandas", 3));
  JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
  JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
  result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
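ConvertToWritableTypes is the mirror image of the load example's helper; a sketch of what it might look like, assuming a PairFunction that wraps native types into Writables (the exact shape is an assumption):

// Hypothetical helper matching the usage above: wraps String/Integer pairs in
// Text/IntWritable so saveAsHadoopFile can write them to a sequence file.
static class ConvertToWritableTypes
    implements PairFunction<Tuple2<String, Integer>, Text, IntWritable> {
  @Override
  public Tuple2<Text, IntWritable> call(Tuple2<String, Integer> record) {
    return new Tuple2<>(new Text(record._1()), new IntWritable(record._2()));
  }
}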
use of org.apache.hadoop.io.IntWritable in project hadoop-book by elephantscale.
the class Reduce method reduce.
@Override
public void reduce(Text key, Iterator<IntWritable> values,
    OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
  int sum = 0;
  while (values.hasNext()) {
    sum += values.next().get();
  }
  output.collect(key, new IntWritable(sum));
}
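This is the classic word-count reducer for the old org.apache.hadoop.mapred API. For context, a minimal mapper that would feed it could look like the sketch below; the class name and tokenization are assumptions, not the hadoop-book source:

// Illustrative old-API word-count mapper: emits (word, 1) for every token,
// which the reducer above sums per word.
public static class Map extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, IntWritable> {

  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  public void map(LongWritable key, Text value,
      OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
    StringTokenizer tokenizer = new StringTokenizer(value.toString());
    while (tokenizer.hasMoreTokens()) {
      word.set(tokenizer.nextToken());
      output.collect(word, ONE);
    }
  }
}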