Use of org.apache.hadoop.mapred.lib.HashPartitioner in project Cloud9 by lintool.
Class HubsAndAuthoritiesSchimmy, method HACalc.
/**
 * Configures and runs one HubsAndAuthorities MapReduce job that reads
 * {@code path/iter<iter>} and writes {@code path/iter<jter>t}.
 *
 * <p>Before submitting the job, every {@code part-} file in the input directory is
 * opened and its first key is run through the partitioner, so that the job can be
 * told (via the {@code "PartitionMapping"} configuration entry) which file on disk
 * holds which partition number.
 *
 * @param path             base directory containing the per-iteration directories
 * @param iter             iteration number used to locate the input directory
 * @param jter             iteration number used to name the output directory
 * @param nodeCount        value stored in the job configuration under {@code "NodeCount"}
 * @param useCombiner      not read by this method
 * @param useInmapCombiner if true, {@code HAMapperIMC} is used as the mapper instead of
 *                         {@code HAMapper}
 * @param useRange         if true, {@code RangePartitioner} is used both for the mapping
 *                         probe and as the job's partitioner; otherwise a
 *                         {@code HashPartitioner} is used for the probe and the job keeps
 *                         its default partitioner
 * @param mapTasks         number of map tasks requested for the job
 * @param reduceTasks      number of reduce tasks requested for the job
 * @return always 0
 * @throws IOException if the input directory cannot be listed, a part file cannot be
 *                     read, or the job fails
 */
public int HACalc(String path, int iter, int jter, int nodeCount, boolean useCombiner, boolean useInmapCombiner, boolean useRange, int mapTasks, int reduceTasks) throws IOException {
    JobConf conf = new JobConf(HubsAndAuthoritiesSchimmy.class);
    String inputPath = path + "/iter" + sFormat.format(iter);
    String outputPath = path + "/iter" + sFormat.format(jter) + "t";
    FileSystem fs = FileSystem.get(conf);

    // List the input directory once and reuse the listing for both passes below.
    FileStatus[] status = fs.listStatus(new Path(inputPath));
    if (status == null) {
        // Some FileSystem implementations return null instead of throwing for a missing path.
        throw new IOException("Input path does not exist or cannot be listed: " + inputPath);
    }

    // We need to actually count the part files to get the number of partitions,
    // because the directory might also contain entries such as _logs.
    int numPartitions = 0;
    for (FileStatus s : status) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    // Set before configuring the partitioner, which receives this configuration.
    conf.setInt("NodeCount", nodeCount);

    Partitioner p;
    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // The mapping between the partition numbers on disk (i.e., part-XXXX) and what
    // partition the file contains (i.e., key.hash % #reducer) is arbitrary, so we
    // need to open up each partition and peek inside to find out.
    IntWritable key = new IntWritable();
    HITSNode value = new HITSNode();
    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        // Use the same filter as the counting pass so that non-data entries
        // (_logs, _SUCCESS, ...) are never opened as SequenceFiles.
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        boolean hasRecord;
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        try {
            hasRecord = reader.next(key, value);
        } finally {
            // Always release the reader, even if reading the first record fails.
            reader.close();
        }

        if (!hasRecord) {
            // An empty file has no key to probe; mapping it using a stale or default
            // key would produce a bogus entry, so leave it out of the mapping.
            sLogger.warn(f.getPath() + " contains no records; omitted from partition mapping");
            continue;
        }

        @SuppressWarnings("unchecked") int np = p.getPartition(key, value, numPartitions);
        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np).append("=").append(f.getPath()).append("\t");
    }
    String partitionMapping = sb.toString().trim();

    sLogger.info(partitionMapping);
    sLogger.info("Tool: HubsAndAuthorities");
    sLogger.info(" - iteration: " + iter);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    conf.setJobName("Iter" + iter + "HubsAndAuthorities");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    if (useInmapCombiner) {
        conf.setMapperClass(HAMapperIMC.class);
    } else {
        conf.setMapperClass(HAMapper.class);
    }
    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }
    conf.setReducerClass(HAReducer.class);

    conf.setInt("jobIter", iter);
    conf.set("PartitionMapping", partitionMapping);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    fs.delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
Aggregations