Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool: class ClueWarcForwardIndexBuilder, method run().
/**
* Runs this tool.
*/
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path (must be block-compressed SequenceFiles)").create(COLLECTION_OPTION));
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
FileSystem fs = FileSystem.get(conf);
String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
String indexFile = cmdline.getOptionValue(INDEX_OPTION);
LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
LOG.info(" - collection path: " + collectionPath);
LOG.info(" - index file: " + indexFile);
LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
Random random = new Random();
Path outputPath = new Path("tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000));
conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
conf.setNumMapTasks(100);
conf.setNumReduceTasks(1);
// Add the files in the directory one by one; passing the directory itself would make Hadoop treat it as a MapFile.
for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
FileInputFormat.addInputPath(conf, status.getPath());
}
FileOutputFormat.setOutputPath(conf, outputPath);
FileOutputFormat.setCompressOutput(conf, false);
conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapRunnerClass(MyMapRunner.class);
conf.setReducerClass(IdentityReducer.class);
// delete the output directory if it exists already
fs.delete(outputPath, true);
RunningJob job = JobClient.runJob(conf);
Counters counters = job.getCounters();
int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
LOG.info("number of blocks: " + blocks);
LOG.info("Writing index file...");
LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
FSDataOutputStream out = fs.create(new Path(indexFile), true);
out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
out.writeUTF(collectionPath);
out.writeInt(blocks);
int cnt = 0;
Text line = new Text();
while (reader.readLine(line) > 0) {
String[] arr = line.toString().split("\\s+");
int docno = Integer.parseInt(arr[0]);
int offset = Integer.parseInt(arr[1]);
short fileno = Short.parseShort(arr[2]);
out.writeInt(docno);
out.writeInt(offset);
out.writeShort(fileno);
cnt++;
if (cnt % 100000 == 0) {
LOG.info(cnt + " blocks written");
}
}
reader.close();
out.close();
if (cnt != blocks) {
throw new RuntimeException("Error: mismatch in block count!");
}
fs.delete(outputPath, true);
return 0;
}
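The index file written above has a simple layout: the index class name and collection path (writeUTF), the block count (writeInt), and then one (docno, offset, fileno) triple per block. The following is a minimal sketch, not the ClueWarcForwardIndex loader itself, showing how such a file could be dumped by mirroring those write calls; the class name DumpClueWarcIndex is hypothetical.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical utility: reads an index file produced by ClueWarcForwardIndexBuilder
// by mirroring its write sequence (writeUTF, writeUTF, writeInt, then int/int/short triples).
public class DumpClueWarcIndex {
  public static void main(String[] args) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FSDataInputStream in = fs.open(new Path(args[0]));
    String indexClass = in.readUTF();      // canonical class name written first
    String collectionPath = in.readUTF();  // collection path recorded by the builder
    int blocks = in.readInt();             // number of entries that follow
    System.out.println(indexClass + " over " + collectionPath + ": " + blocks + " blocks");
    for (int i = 0; i < blocks; i++) {
      int docno = in.readInt();
      int offset = in.readInt();
      short fileno = in.readShort();
      System.out.println(docno + "\t" + offset + "\t" + fileno);
    }
    in.close();
  }
}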
Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool: class CountClueWarcRecords, method run().
/**
* Runs this tool.
*/
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("DocnoMapping data path").create(MAPPING_OPTION));
options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file to write the number of records").create(COUNT_OPTION));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
boolean repacked;
if (cmdline.hasOption(REPACKED_OPTION)) {
repacked = true;
} else if (cmdline.hasOption(ORIGINAL_OPTION)) {
repacked = false;
} else {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
System.err.println("Expecting either -original or -repacked");
return -1;
}
if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION) || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
String path = cmdline.getOptionValue(PATH_OPTION);
String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
int segment = 1;
if (!repacked) {
segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
}
LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
LOG.info(" - repacked: " + repacked);
LOG.info(" - path: " + path);
LOG.info(" - mapping file: " + mappingFile);
if (!repacked) {
LOG.info(" - segment number: " + segment);
}
FileSystem fs = FileSystem.get(getConf());
int mapTasks = 10;
JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
conf.setJobName(CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));
conf.setNumMapTasks(mapTasks);
conf.setNumReduceTasks(0);
if (repacked) {
// Add the files in the directory one by one; otherwise Hadoop thinks the directory is a MapFile.
for (FileStatus status : fs.listStatus(new Path(path))) {
FileInputFormat.addInputPath(conf, status.getPath());
}
} else {
ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
}
DistributedCache.addCacheFile(new URI(mappingFile), conf);
if (repacked) {
conf.setInputFormat(SequenceFileInputFormat.class);
} else {
conf.setInputFormat(ClueWarcInputFormat.class);
}
conf.setOutputFormat(NullOutputFormat.class);
conf.setMapperClass(MyMapper.class);
RunningJob job = JobClient.runJob(conf);
Counters counters = job.getCounters();
int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
LOG.info("Read " + numDocs + " docs.");
if (cmdline.hasOption(COUNT_OPTION)) {
String f = cmdline.getOptionValue(COUNT_OPTION);
FSDataOutputStream out = fs.create(new Path(f));
out.write(Integer.toString(numDocs).getBytes());
out.close();
}
return 0;
}
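Because run(String[]) relies on getConf() and prints ToolRunner's generic usage, the class is presumably driven through Hadoop's standard Tool/ToolRunner pattern. A minimal driver sketch follows, assuming CountClueWarcRecords implements org.apache.hadoop.util.Tool; the driver class name is hypothetical.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver; assumes CountClueWarcRecords implements Tool.
public class CountClueWarcRecordsDriver {
  public static void main(String[] args) throws Exception {
    // ToolRunner strips generic options (-D, -files, ...) and passes the rest to run().
    int exitCode = ToolRunner.run(new Configuration(), new CountClueWarcRecords(), args);
    System.exit(exitCode);
  }
}
Note that the count file written when COUNT_OPTION is given contains plain ASCII digits, so it can be read back with any text reader and Integer.parseInt.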
Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool: class Aquaint2ForwardIndexBuilder, method runTool().
public int runTool(Configuration config, String collectionPath, String outputPath, String indexFile, String mappingFile) throws Exception {
//sLogger.error ("getConf(): " + getConf() + ", DemoCountAquaint2Documents.class: " + DemoCountAquaint2Documents.class);
JobConf conf = new JobConf(config, DemoCountAquaint2Documents.class);
FileSystem fs = FileSystem.get(config);
sLogger.info("Tool name: BuildAquaint2ForwardIndex");
sLogger.info(" - collection path: " + collectionPath);
sLogger.info(" - output path: " + outputPath);
sLogger.info(" - index file: " + indexFile);
sLogger.info(" - mapping file: " + mappingFile);
conf.setJobName("BuildAquaint2ForwardIndex");
conf.set("mapred.child.java.opts", "-Xmx1024m");
conf.setNumReduceTasks(1);
if (conf.get("mapred.job.tracker").equals("local")) {
conf.set("DocnoMappingFile", mappingFile);
} else {
DistributedCache.addCacheFile(new URI(mappingFile), conf);
}
FileInputFormat.setInputPaths(conf, new Path(collectionPath));
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
FileOutputFormat.setCompressOutput(conf, false);
conf.setInputFormat(Aquaint2DocumentInputFormatOld.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(MyMapper.class);
conf.setReducerClass(IdentityReducer.class);
// delete the output directory if it exists already
FileSystem.get(conf).delete(new Path(outputPath), true);
RunningJob job = JobClient.runJob(conf);
Counters counters = job.getCounters();
int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();
String inputFile = outputPath + "/" + "part-00000";
sLogger.info("Writing " + numDocs + " doc offseta to " + indexFile);
LineReader reader = new LineReader(fs.open(new Path(inputFile)));
FSDataOutputStream writer = fs.create(new Path(indexFile), true);
writer.writeUTF("edu.umd.cloud9.collection.aquaint2.Aquaint2ForwardIndex");
writer.writeUTF(collectionPath);
writer.writeInt(numDocs);
int cnt = 0;
Text line = new Text();
while (reader.readLine(line) > 0) {
String[] arr = line.toString().split("\\t");
long offset = Long.parseLong(arr[1]);
int len = Integer.parseInt(arr[2]);
// sLogger.info(arr[0] + " " + offset + " " + len);
writer.writeLong(offset);
writer.writeInt(len);
cnt++;
if (cnt % 100000 == 0) {
sLogger.info(cnt + " docs");
}
}
reader.close();
writer.close();
sLogger.info(cnt + " docs total. Done!");
if (numDocs != cnt) {
throw new RuntimeException("Unexpected number of documents in building forward index!");
}
return 0;
}
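The builder hands the docno mapping to tasks in two ways: through the "DocnoMappingFile" property when the job tracker is local, and through the DistributedCache otherwise. The skeleton below is an illustrative sketch of how a task-side configure() could resolve the file under that convention; the actual MyMapper may do this differently, and the class name is hypothetical.
import java.io.IOException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

// Illustrative mapper skeleton: resolves the docno-mapping file either from the
// "DocnoMappingFile" property (local runner) or from the distributed cache (cluster).
public class MappingAwareMapperSketch extends MapReduceBase {
  private Path mappingPath;

  @Override
  public void configure(JobConf job) {
    try {
      String local = job.get("DocnoMappingFile");
      if (local != null) {
        mappingPath = new Path(local);                      // set directly in local mode
      } else {
        Path[] cached = DistributedCache.getLocalCacheFiles(job);
        mappingPath = cached[0];                            // shipped via addCacheFile(...)
      }
    } catch (IOException e) {
      throw new RuntimeException("Unable to locate docno mapping file", e);
    }
  }
}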
Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool: class WikipediaForwardIndexBuilder, method run().
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);
String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
if (!inputPath.isAbsolute()) {
System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
return -1;
}
String language = null;
if (cmdline.hasOption(LANGUAGE_OPTION)) {
language = cmdline.getOptionValue(LANGUAGE_OPTION);
if (language.length() != 2) {
System.err.println("Error: \"" + language + "\" unknown language!");
return -1;
}
}
JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
FileSystem fs = FileSystem.get(conf);
LOG.info("Tool name: " + this.getClass().getName());
LOG.info(" - input path: " + inputPath);
LOG.info(" - index file: " + indexFile);
LOG.info(" - language: " + language);
LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));
conf.setNumReduceTasks(1);
FileInputFormat.setInputPaths(conf, inputPath);
FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
FileOutputFormat.setCompressOutput(conf, false);
if (language != null) {
conf.set("wiki.language", language);
}
conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapRunnerClass(MyMapRunner.class);
conf.setReducerClass(IdentityReducer.class);
// Delete the output directory if it exists already.
fs.delete(new Path(tmpPath), true);
RunningJob job = JobClient.runJob(conf);
Counters counters = job.getCounters();
int blocks = (int) counters.getCounter(Blocks.Total);
LOG.info("number of blocks: " + blocks);
LOG.info("Writing index file...");
LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
FSDataOutputStream out = fs.create(new Path(indexFile), true);
out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
out.writeUTF(inputPath.toString());
out.writeInt(blocks);
int cnt = 0;
Text line = new Text();
while (reader.readLine(line) > 0) {
String[] arr = line.toString().split("\\s+");
int docno = Integer.parseInt(arr[0]);
int offset = Integer.parseInt(arr[1]);
short fileno = Short.parseShort(arr[2]);
out.writeInt(docno);
out.writeInt(offset);
out.writeShort(fileno);
cnt++;
if (cnt % 100000 == 0) {
LOG.info(cnt + " blocks written");
}
}
reader.close();
out.close();
if (cnt != blocks) {
throw new RuntimeException("Error: mismatch in block count!");
}
// Clean up.
fs.delete(new Path(tmpPath), true);
return 0;
}
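Both forward-index builders read the Blocks.Total enum counter from RunningJob.getCounters() once the job completes; counters.getCounter(Blocks.Total) and counters.findCounter(Blocks.Total).getCounter() return the same value. On the task side such a counter is normally incremented through the Reporter. A hypothetical mapper illustrating that pattern (the real MyMapRunner may differ):
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical mapper: bumps an enum counter that the driver later reads from
// RunningJob.getCounters(). The enum mirrors the Blocks.Total counter used above.
public class BlockCountingMapperSketch extends MapReduceBase
    implements Mapper<IntWritable, Text, IntWritable, Text> {

  public static enum Blocks { Total }

  @Override
  public void map(IntWritable key, Text value,
      OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException {
    // ... process the record and emit output here ...
    reporter.incrCounter(Blocks.Total, 1);
  }
}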
Use of org.apache.hadoop.mapred.RunningJob in project hadoop by apache: class TestMultithreadedMapRunner, method run().
private void run(boolean ioEx, boolean rtEx) throws Exception {
Path inDir = new Path("testing/mt/input");
Path outDir = new Path("testing/mt/output");
// Hack for local FS that does not have the concept of a 'mounting point'
if (isLocalFS()) {
String localPathRoot = System.getProperty("test.build.data", "/tmp").replace(' ', '+');
inDir = new Path(localPathRoot, inDir);
outDir = new Path(localPathRoot, outDir);
}
JobConf conf = createJobConf();
FileSystem fs = FileSystem.get(conf);
fs.delete(outDir, true);
if (!fs.mkdirs(inDir)) {
throw new IOException("Mkdirs failed to create " + inDir.toString());
}
{
DataOutputStream file = fs.create(new Path(inDir, "part-0"));
file.writeBytes("a\nb\n\nc\nd\ne");
file.close();
}
conf.setJobName("mt");
conf.setInputFormat(TextInputFormat.class);
conf.setOutputKeyClass(LongWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapOutputKeyClass(LongWritable.class);
conf.setMapOutputValueClass(Text.class);
conf.setOutputFormat(TextOutputFormat.class);
conf.setOutputKeyClass(LongWritable.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(IDMap.class);
conf.setReducerClass(IDReduce.class);
FileInputFormat.setInputPaths(conf, inDir);
FileOutputFormat.setOutputPath(conf, outDir);
conf.setMapRunnerClass(MultithreadedMapRunner.class);
conf.setInt(MultithreadedMapper.NUM_THREADS, 2);
if (ioEx) {
conf.setBoolean("multithreaded.ioException", true);
}
if (rtEx) {
conf.setBoolean("multithreaded.runtimeException", true);
}
JobClient jc = new JobClient(conf);
RunningJob job = jc.submitJob(conf);
while (!job.isComplete()) {
Thread.sleep(100);
}
if (job.isSuccessful()) {
assertFalse(ioEx || rtEx);
} else {
assertTrue(ioEx || rtEx);
}
}
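Unlike the Cloud9 tools above, which block inside JobClient.runJob(), this test submits with JobClient.submitJob() and polls the returned RunningJob handle. A short sketch of the same non-blocking pattern with progress reporting follows; the class and method names are hypothetical, but the RunningJob calls are standard API.
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

// Sketch: submit asynchronously, poll for completion, and report map/reduce progress.
public class PollingSubmitSketch {
  public static void runWithPolling(JobConf conf) throws Exception {
    JobClient client = new JobClient(conf);
    RunningJob job = client.submitJob(conf);
    while (!job.isComplete()) {
      // mapProgress() and reduceProgress() return fractions in [0, 1].
      System.out.printf("map %.0f%%, reduce %.0f%%%n",
          job.mapProgress() * 100, job.reduceProgress() * 100);
      Thread.sleep(1000);
    }
    if (!job.isSuccessful()) {
      throw new RuntimeException("Job failed: " + job.getID());
    }
  }
}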