use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.
the class ClueWarcForwardIndexBuilder method run.
/**
 * Runs this tool: launches a MapReduce job over the collection, then converts
 * the job's single text output file into a binary forward-index file.
 *
 * <p>The index file written to the {@code INDEX_OPTION} path has this layout:
 * the canonical class name of {@code ClueWarcForwardIndex} and the collection
 * path (both via {@code writeUTF}), the block count as an int, and then one
 * record per block consisting of docno (int), offset (int) and fileno (short).
 *
 * @param args command-line arguments; both the collection path and the index
 *     path options are required
 * @return 0 on success, -1 if the command line cannot be parsed or a required
 *     option is missing
 * @throws RuntimeException if the number of records written differs from the
 *     block count reported by the job's {@code Blocks.Total} counter
 * @throws Exception on any other job or file-system failure
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("(required) collection path (must be block-compressed SequenceFiles)")
      .create(COLLECTION_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("(required) output index path")
      .create(INDEX_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
  FileSystem fs = FileSystem.get(conf);

  String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
  String indexFile = cmdline.getOptionValue(INDEX_OPTION);

  LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
  LOG.info(" - collection path: " + collectionPath);
  LOG.info(" - index file: " + indexFile);
  LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

  // The job writes to a randomly named temporary directory, which is removed
  // again once the index file has been written successfully.
  Random random = new Random();
  Path outputPath = new Path("tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName()
      + "-" + random.nextInt(10000));

  conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
  conf.setNumMapTasks(100);
  // A single reducer, so that all output ends up in one file (part-00000).
  conf.setNumReduceTasks(1);

  // Each file under the collection path is added as an input individually.
  // NOTE(review): the original comment here was truncated to "thinks its a
  // MapFile." -- presumably adding the directory itself would make the input
  // format treat it as a MapFile; confirm before changing this loop.
  for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
    FileInputFormat.addInputPath(conf, status.getPath());
  }
  FileOutputFormat.setOutputPath(conf, outputPath);
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapRunnerClass(MyMapRunner.class);
  conf.setReducerClass(IdentityReducer.class);

  // delete the output directory if it exists already
  fs.delete(outputPath, true);

  RunningJob job = JobClient.runJob(conf);
  Counters counters = job.getCounters();
  int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
  LOG.info("number of blocks: " + blocks);

  LOG.info("Writing index file...");
  int cnt = 0;
  // Both streams are closed in finally blocks so that a malformed line or an
  // I/O error while converting does not leak open file-system handles.
  LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
  try {
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    try {
      out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
      out.writeUTF(collectionPath);
      out.writeInt(blocks);

      Text line = new Text();
      while (reader.readLine(line) > 0) {
        // Each line holds three whitespace-separated fields: docno, offset, fileno.
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;
        if (cnt % 100000 == 0) {
          LOG.info(cnt + " blocks written");
        }
      }
    } finally {
      out.close();
    }
  } finally {
    reader.close();
  }

  // The header already claims 'blocks' records; anything else means the index is bad.
  if (cnt != blocks) {
    throw new RuntimeException("Error: mismatch in block count!");
  }

  fs.delete(outputPath, true);
  return 0;
}
use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.
the class CountClueWarcRecords method run.
/**
 * Runs this tool: launches a map-only job that counts records in either the
 * original ClueWeb09 distribution or repacked SequenceFiles, logs the value of
 * the {@code Records.PAGES} counter, and optionally writes it to a file.
 *
 * @param args command-line arguments; one of the original/repacked flags, the
 *     path option and the mapping option are required, and the segment option
 *     is additionally required for the original distribution
 * @return 0 on success, -1 if the command line cannot be parsed or a required
 *     option is missing
 * @throws Exception on job or file-system failure, or if the segment number is
 *     not a valid integer
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
  options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("path: base path for 'original', actual path for 'repacked'")
      .create(PATH_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("DocnoMapping data path")
      .create(MAPPING_OPTION));
  options.addOption(OptionBuilder.withArgName("num").hasArg()
      .withDescription("segment number (required if 'original')")
      .create(SEGMENT_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("output file to write the number of records")
      .create(COUNT_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  // Exactly which input flavour to read; -repacked wins if both flags are given.
  boolean repacked;
  if (cmdline.hasOption(REPACKED_OPTION)) {
    repacked = true;
  } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
    repacked = false;
  } else {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    System.err.println("Expecting either -original or -repacked");
    return -1;
  }

  if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION)
      || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  String path = cmdline.getOptionValue(PATH_OPTION);
  String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);

  // The segment number is only read (and only used) for the original distribution.
  int segment = 1;
  if (!repacked) {
    segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
  }

  LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
  LOG.info(" - repacked: " + repacked);
  LOG.info(" - path: " + path);
  LOG.info(" - mapping file: " + mappingFile);
  if (!repacked) {
    LOG.info(" - segment number: " + segment);
  }

  FileSystem fs = FileSystem.get(getConf());
  int mapTasks = 10;

  JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
  conf.setJobName(CountClueWarcRecords.class.getSimpleName()
      + (repacked ? ":" + path : ":segment" + segment));
  conf.setNumMapTasks(mapTasks);
  // Map-only job: results are reported through counters, not reducer output.
  conf.setNumReduceTasks(0);

  if (repacked) {
    // Each file under the path is added as an input individually.
    // NOTE(review): the original comment here was truncated to "thinks its a
    // MapFile." -- presumably adding the directory itself would make
    // SequenceFileInputFormat treat it as a MapFile; confirm before changing.
    for (FileStatus status : fs.listStatus(new Path(path))) {
      FileInputFormat.addInputPath(conf, status.getPath());
    }
  } else {
    ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
  }

  DistributedCache.addCacheFile(new URI(mappingFile), conf);

  if (repacked) {
    conf.setInputFormat(SequenceFileInputFormat.class);
  } else {
    conf.setInputFormat(ClueWarcInputFormat.class);
  }
  conf.setOutputFormat(NullOutputFormat.class);
  conf.setMapperClass(MyMapper.class);

  RunningJob job = JobClient.runJob(conf);
  Counters counters = job.getCounters();
  int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
  LOG.info("Read " + numDocs + " docs.");

  if (cmdline.hasOption(COUNT_OPTION)) {
    String f = cmdline.getOptionValue(COUNT_OPTION);
    FSDataOutputStream out = fs.create(new Path(f));
    try {
      // Written as decimal text; the charset is explicit so the bytes do not
      // depend on the JVM's platform default encoding.
      out.write(Integer.toString(numDocs).getBytes("UTF-8"));
    } finally {
      // Close even if the write fails so the stream is not leaked.
      out.close();
    }
  }

  return 0;
}
use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.
the class M1ViterbiExtract method main.
/**
 * Runs the map-only "m1viterbi" job over the SequenceFile input at
 * {@code bitext}, writing job output to {@code somealigns.test}, and prints a
 * cross-entropy and perplexity derived from the job's
 * {@code CrossEntropyCounters.LOGPROB} and {@code CrossEntropyCounters.WORDCOUNT}
 * counters.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if the job fails to run
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException {
  final int numMappers = 15;

  JobConf jobConf = new JobConf(M1ViterbiMapper.class);
  jobConf.setJobName("m1viterbi");

  // Input side.
  jobConf.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.setInputPaths(jobConf, new Path(bitext));

  // Mapper only; no reduce phase.
  jobConf.setMapperClass(M1ViterbiMapper.class);
  jobConf.setNumMapTasks(numMappers);
  jobConf.setNumReduceTasks(0);

  // Output side.
  jobConf.setOutputKeyClass(LongWritable.class);
  jobConf.setOutputValueClass(Text.class);
  FileOutputFormat.setOutputPath(jobConf, new Path("somealigns.test"));

  Counters totals = JobClient.runJob(jobConf).getCounters();
  double logProb = totals.getCounter(CrossEntropyCounters.LOGPROB);
  double wordCount = totals.getCounter(CrossEntropyCounters.WORDCOUNT);

  // Per-word value divided by ln 2 (presumably converting nats to bits --
  // confirm against how the mapper accumulates LOGPROB).
  double crossEntropy = logProb / wordCount / Math.log(2.0);
  double perplexity = Math.pow(2.0, crossEntropy);

  System.out.println("Viterbi cross-entropy: " + crossEntropy + " perplexity: " + perplexity);
}
Aggregations