Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
The class ClueWarcForwardIndexBuilder, method run.
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("(required) collection path (must be block-compressed SequenceFiles)")
      .create(COLLECTION_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("(required) output index path").create(INDEX_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
  FileSystem fs = FileSystem.get(conf);

  String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
  String indexFile = cmdline.getOptionValue(INDEX_OPTION);

  LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
  LOG.info(" - collection path: " + collectionPath);
  LOG.info(" - index file: " + indexFile);
  LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

  Random random = new Random();
  Path outputPath = new Path("tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName()
      + "-" + random.nextInt(10000));

  conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
  conf.setNumMapTasks(100);
  conf.setNumReduceTasks(1);

  // Add the input files one by one; otherwise, the SequenceFile input format
  // thinks it's a MapFile.
  for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
    FileInputFormat.addInputPath(conf, status.getPath());
  }

  FileOutputFormat.setOutputPath(conf, outputPath);
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapRunnerClass(MyMapRunner.class);
  conf.setReducerClass(IdentityReducer.class);

  // Delete the output directory if it exists already.
  fs.delete(outputPath, true);

  RunningJob job = JobClient.runJob(conf);
  Counters counters = job.getCounters();
  int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
  LOG.info("number of blocks: " + blocks);

  LOG.info("Writing index file...");
  LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
  FSDataOutputStream out = fs.create(new Path(indexFile), true);

  out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
  out.writeUTF(collectionPath);
  out.writeInt(blocks);

  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\s+");
    int docno = Integer.parseInt(arr[0]);
    int offset = Integer.parseInt(arr[1]);
    short fileno = Short.parseShort(arr[2]);

    out.writeInt(docno);
    out.writeInt(offset);
    out.writeShort(fileno);

    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " blocks written");
    }
  }
  reader.close();
  out.close();

  if (cnt != blocks) {
    throw new RuntimeException("Error: mismatch in block count!");
  }

  fs.delete(outputPath, true);
  return 0;
}
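The index file written at the end of run() has a simple binary layout: the index class name and the collection path as UTF strings, the block count as an int, then one (docno, offset, fileno) triple per block. Below is a minimal sketch of reading it back, mirroring those writes; IndexFileDumper is a hypothetical name for illustration, not part of Cloud9.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IndexFileDumper {
  public static void main(String[] args) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FSDataInputStream in = fs.open(new Path(args[0]));

    System.out.println("index class: " + in.readUTF());
    System.out.println("collection:  " + in.readUTF());

    int blocks = in.readInt();
    System.out.println("blocks:      " + blocks);

    // Print a sample of entries; each mirrors the writes above:
    // docno (int), byte offset (int), file number (short).
    for (int i = 0; i < Math.min(blocks, 10); i++) {
      System.out.println(in.readInt() + "\t" + in.readInt() + "\t" + in.readShort());
    }
    in.close();
  }
}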
Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
The class Aquaint2DocnoMapping, method writeDocnoData.
public static void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException {
  LOG.info("Writing docno data to " + output);
  LineReader reader = new LineReader(fs.open(input));
  List<String> list = Lists.newArrayList();

  LOG.info("Reading " + input);
  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\t");
    list.add(arr[0]);
    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " docs");
    }
  }
  reader.close();
  LOG.info(cnt + " docs total. Done!");

  cnt = 0;
  LOG.info("Writing " + output);
  FSDataOutputStream out = fs.create(output, true);
  out.writeInt(list.size());
  for (int i = 0; i < list.size(); i++) {
    out.writeUTF(list.get(i));
    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " docs");
    }
  }
  out.close();
  LOG.info(cnt + " docs total. Done!");
}
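The resulting file is a count followed by one writeUTF-encoded docid per document, so docno i corresponds to the i-th docid in the file. A sketch of the inverse, assuming Cloud9's convention that docnos start at 1 (readDocnoData is an illustrative name; the project's actual loader may differ):

// Illustrative counterpart: load the docno data written above, so that
// docids[i] is the docid for docno i (slot 0 unused; docnos start at 1).
public static String[] readDocnoData(Path input, FileSystem fs) throws IOException {
  FSDataInputStream in = fs.open(input);
  int n = in.readInt(); // the number of docids, written first
  String[] docids = new String[n + 1];
  for (int i = 1; i <= n; i++) {
    docids[i] = in.readUTF();
  }
  in.close();
  return docids;
}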
Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
The class Aquaint2ForwardIndexBuilder, method runTool.
public int runTool(Configuration config, String collectionPath, String outputPath,
    String indexFile, String mappingFile) throws Exception {
  // sLogger.error("getConf(): " + getConf() + ", DemoCountAquaint2Documents.class: " + DemoCountAquaint2Documents.class);
  JobConf conf = new JobConf(config, DemoCountAquaint2Documents.class);
  FileSystem fs = FileSystem.get(config);

  sLogger.info("Tool name: BuildAquaint2ForwardIndex");
  sLogger.info(" - collection path: " + collectionPath);
  sLogger.info(" - output path: " + outputPath);
  sLogger.info(" - index file: " + indexFile);
  sLogger.info(" - mapping file: " + mappingFile);

  conf.setJobName("BuildAquaint2ForwardIndex");
  conf.set("mapred.child.java.opts", "-Xmx1024m");
  conf.setNumReduceTasks(1);

  if (conf.get("mapred.job.tracker").equals("local")) {
    conf.set("DocnoMappingFile", mappingFile);
  } else {
    DistributedCache.addCacheFile(new URI(mappingFile), conf);
  }

  FileInputFormat.setInputPaths(conf, new Path(collectionPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(Aquaint2DocumentInputFormatOld.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(MyMapper.class);
  conf.setReducerClass(IdentityReducer.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  RunningJob job = JobClient.runJob(conf);
  Counters counters = job.getCounters();
  int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();

  String inputFile = outputPath + "/" + "part-00000";
  sLogger.info("Writing " + numDocs + " doc offsets to " + indexFile);

  LineReader reader = new LineReader(fs.open(new Path(inputFile)));
  FSDataOutputStream writer = fs.create(new Path(indexFile), true);
  writer.writeUTF("edu.umd.cloud9.collection.aquaint2.Aquaint2ForwardIndex");
  writer.writeUTF(collectionPath);
  writer.writeInt(numDocs);

  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\t");
    long offset = Long.parseLong(arr[1]);
    int len = Integer.parseInt(arr[2]);
    // sLogger.info(arr[0] + " " + offset + " " + len);
    writer.writeLong(offset);
    writer.writeInt(len);
    cnt++;
    if (cnt % 100000 == 0) {
      sLogger.info(cnt + " docs");
    }
  }
  reader.close();
  writer.close();
  sLogger.info(cnt + " docs total. Done!");

  if (numDocs != cnt) {
    throw new RuntimeException("Unexpected number of documents in building forward index!");
  }

  return 0;
}
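Each pair written here is a byte offset and length into the collection, which is what makes the forward index random-access: given a docno, seek to its offset and read its length in bytes. A sketch under that assumption, plus Cloud9's one-indexed docnos; loadIndex and fetchRaw are illustrative names, not Cloud9 APIs.

// Illustrative random-access reader over the (offset, length) pairs.
long[] offsets;
int[] lengths;

void loadIndex(Path indexPath, FileSystem fs) throws IOException {
  FSDataInputStream in = fs.open(indexPath);
  in.readUTF();               // index class name, not needed here
  in.readUTF();               // collection path, not needed here
  int numDocs = in.readInt();
  offsets = new long[numDocs];
  lengths = new int[numDocs];
  for (int i = 0; i < numDocs; i++) {
    offsets[i] = in.readLong();
    lengths[i] = in.readInt();
  }
  in.close();
}

byte[] fetchRaw(int docno, Path collection, FileSystem fs) throws IOException {
  FSDataInputStream in = fs.open(collection);
  byte[] buf = new byte[lengths[docno - 1]]; // assuming docnos start at 1
  in.readFully(offsets[docno - 1], buf);     // positioned read at the doc's offset
  in.close();
  return buf;
}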
Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
The class TrecWebDocnoMappingBuilder, method writeMappingData.
private static void writeMappingData(Path input, Path output, FileSystem fs) throws IOException {
  LOG.info("Writing docids to " + output);
  LineReader reader = new LineReader(fs.open(input));

  // First pass: count the lines so the count can be written before the docids.
  LOG.info("Reading " + input);
  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    cnt++;
  }
  reader.close();
  LOG.info("Done!");

  // Second pass: write the count, then one docid per line.
  LOG.info("Writing " + output);
  FSDataOutputStream out = fs.create(output, true);
  reader = new LineReader(fs.open(input));
  out.writeInt(cnt);
  cnt = 0;
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\t");
    out.writeUTF(arr[0]);
    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " documents");
    }
  }
  reader.close();
  out.close();
  LOG.info("Done! " + cnt + " documents total.");
}
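Note the two passes over the input: the first only counts lines so the document count can be written ahead of the docids, avoiding the in-memory list that the earlier variants buffer. Reading the file back is symmetric; a minimal sketch (readMappingData is an illustrative name):

// Illustrative loader: an int count, then one UTF-encoded docid per document.
public static String[] readMappingData(Path input, FileSystem fs) throws IOException {
  FSDataInputStream in = fs.open(input);
  String[] docids = new String[in.readInt()];
  for (int i = 0; i < docids.length; i++) {
    docids[i] = in.readUTF();
  }
  in.close();
  return docids;
}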
Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
The class MedlineDocnoMapping, method writeMappingData.
/**
 * Creates a mappings file from the contents of a flat text file containing docid to docno
 * mappings. This method is used by {@link MedlineDocnoMappingBuilder} internally.
 *
 * @param input flat text file containing docid to docno mappings
 * @param output output mappings file
 * @param fs reference to the file system
 * @throws IOException
 */
public static void writeMappingData(Path input, Path output, FileSystem fs) throws IOException {
  Preconditions.checkNotNull(input);
  Preconditions.checkNotNull(output);
  Preconditions.checkNotNull(fs);

  LOG.info("Writing docids to " + output);
  LineReader reader = new LineReader(fs.open(input));
  List<Integer> list = Lists.newArrayList();

  LOG.info("Reading " + input);
  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\t");
    list.add(Integer.parseInt(arr[0]));
    cnt++;
    if (cnt % 500000 == 0) {
      LOG.info(cnt);
    }
  }
  reader.close();
  LOG.info("Done! Total of " + cnt + " docids read.");

  cnt = 0;
  LOG.info("Writing " + output);
  FSDataOutputStream out = fs.create(output, true);
  out.writeInt(list.size());
  for (int i = 0; i < list.size(); i++) {
    out.writeInt(list.get(i));
    cnt++;
    if (cnt % 500000 == 0) {
      LOG.info(cnt);
    }
  }
  out.close();
  LOG.info("Done! Total of " + cnt + " docids written.");
}
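Because Medline docids are themselves integers, the mappings file stores them as raw ints rather than UTF strings. If the docids are written in sorted order, the file supports both lookup directions cheaply: docno to docid is an array access, and docid to docno is a binary search. A sketch under those assumptions, again using one-indexed docnos; these method names are illustrative, not necessarily Cloud9's.

import java.util.Arrays;

// Illustrative lookups over the docid array loaded from the mappings file.
int[] docids; // readInt() count, then that many ints; assumed sorted

int getDocno(int docid) {
  int pos = Arrays.binarySearch(docids, docid);
  return pos < 0 ? -1 : pos + 1; // docnos start at 1; -1 if not found
}

int getDocid(int docno) {
  return docids[docno - 1];
}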