Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
From the class TrecDocnoMapping, method writeMappingData:
/**
 * Creates a mappings file from the contents of a flat text file containing docid to docno
 * mappings. This method is used by {@link TrecDocnoMappingBuilder} internally.
 *
 * @param input flat text file containing docid to docno mappings
 * @param output output mappings data file
 * @param fs {@code FileSystem} to write to
 * @throws IOException
 */
public static void writeMappingData(Path input, Path output, FileSystem fs) throws IOException {
  Preconditions.checkNotNull(input);
  Preconditions.checkNotNull(output);
  Preconditions.checkNotNull(fs);

  LOG.info("Writing docno data to " + output);
  LineReader reader = new LineReader(fs.open(input));
  List<String> list = Lists.newArrayList();

  LOG.info("Reading " + input);
  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\t");
    list.add(arr[0]);
    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " docs");
    }
  }
  reader.close();
  LOG.info(cnt + " docs total. Done!");

  cnt = 0;
  LOG.info("Writing " + output);
  FSDataOutputStream out = fs.create(output, true);
  out.writeInt(list.size());
  for (int i = 0; i < list.size(); i++) {
    out.writeUTF(list.get(i));
    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " docs");
    }
  }
  out.close();
  LOG.info(cnt + " docs total. Done!");
}
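The data file written above is just an int count followed by one writeUTF-encoded docid per docno, so the read side is its mirror image. Below is a minimal sketch of how such a file could be loaded back; the method name loadMappingData is hypothetical, not Cloud9's actual API (the real reader lives inside TrecDocnoMapping):

// Hypothetical mirror of writeMappingData above: read the count, then one
// writeUTF string per docno. Illustrative only, not the class's real method.
public static String[] loadMappingData(Path path, FileSystem fs) throws IOException {
  FSDataInputStream in = fs.open(path);
  int sz = in.readInt();          // number of docids written by writeMappingData
  String[] docids = new String[sz];
  for (int i = 0; i < sz; i++) {
    docids[i] = in.readUTF();     // docno i+1 maps to docids[i]
  }
  in.close();
  return docids;
}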
Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
From the class WikipediaDocnoMapping, method writeDocnoMappingData:
/**
 * Creates a mappings file from the contents of a flat text file containing docid to docno
 * mappings. This method is used by {@link WikipediaDocnoMappingBuilder} internally.
 *
 * @param fs {@code FileSystem} to read from and write to
 * @param inputFile flat text file containing docid to docno mappings
 * @param n number of mappings to write
 * @param outputFile output mappings file
 * @throws IOException
 */
public static void writeDocnoMappingData(FileSystem fs, String inputFile, int n,
    String outputFile) throws IOException {
  LOG.info("Writing " + n + " docids to " + outputFile);
  LineReader reader = new LineReader(fs.open(new Path(inputFile)));

  int cnt = 0;
  Text line = new Text();
  FSDataOutputStream out = fs.create(new Path(outputFile), true);
  out.writeInt(n);
  for (int i = 0; i < n; i++) {
    reader.readLine(line);
    String[] arr = line.toString().split("\\t");
    out.writeInt(Integer.parseInt(arr[0]));
    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " articles");
    }
  }
  reader.close();
  out.close();
  LOG.info("Done!");
}
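Here the mapping data is a count n followed by n ints, one Wikipedia page id per docno. A hedged sketch of the corresponding reader, assuming the whole array is loaded into memory for lookups as Cloud9's docno mappings generally do; loadDocids is an illustrative name, not the class's actual method:

// Hypothetical reader for the file written above. Docnos are 1-based,
// so index 0 of the array is deliberately left unused.
public static int[] loadDocids(FileSystem fs, String dataFile) throws IOException {
  FSDataInputStream in = fs.open(new Path(dataFile));
  int n = in.readInt();
  int[] docids = new int[n + 1];
  for (int i = 1; i <= n; i++) {
    docids[i] = in.readInt();   // docids[i] is the page id for docno i
  }
  in.close();
  return docids;
}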
Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.
From the class WikipediaForwardIndexBuilder, method run:
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("input").create(INPUT_OPTION));
  options.addOption(OptionBuilder.withArgName("path").hasArg()
      .withDescription("index file").create(INDEX_FILE_OPTION));
  options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
      .withDescription("two-letter language code").create(LANGUAGE_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
  String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);
  String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName()
      + "-" + RANDOM.nextInt(10000);

  if (!inputPath.isAbsolute()) {
    System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
    return -1;
  }

  String language = null;
  if (cmdline.hasOption(LANGUAGE_OPTION)) {
    language = cmdline.getOptionValue(LANGUAGE_OPTION);
    if (language.length() != 2) {
      System.err.println("Error: \"" + language + "\" unknown language!");
      return -1;
    }
  }

  JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
  FileSystem fs = FileSystem.get(conf);

  LOG.info("Tool name: " + this.getClass().getName());
  LOG.info(" - input path: " + inputPath);
  LOG.info(" - index file: " + indexFile);
  LOG.info(" - language: " + language);
  LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

  conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]",
      INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));
  conf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(conf, inputPath);
  FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
  FileOutputFormat.setCompressOutput(conf, false);

  if (language != null) {
    conf.set("wiki.language", language);
  }

  conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapRunnerClass(MyMapRunner.class);
  conf.setReducerClass(IdentityReducer.class);

  // Delete the output directory if it exists already.
  fs.delete(new Path(tmpPath), true);

  RunningJob job = JobClient.runJob(conf);
  Counters counters = job.getCounters();
  int blocks = (int) counters.getCounter(Blocks.Total);
  LOG.info("number of blocks: " + blocks);

  LOG.info("Writing index file...");
  LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
  FSDataOutputStream out = fs.create(new Path(indexFile), true);
  out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
  out.writeUTF(inputPath.toString());
  out.writeInt(blocks);

  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\s+");
    int docno = Integer.parseInt(arr[0]);
    int offset = Integer.parseInt(arr[1]);
    short fileno = Short.parseShort(arr[2]);
    out.writeInt(docno);
    out.writeInt(offset);
    out.writeShort(fileno);
    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " blocks written");
    }
  }
  reader.close();
  out.close();

  if (cnt != blocks) {
    throw new RuntimeException("Error: mismatch in block count!");
  }

  // Clean up.
  fs.delete(new Path(tmpPath), true);
  return 0;
}
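The index file written at the end of run has a fixed layout: two UTF strings (the index class name and the collection path), an int block count, and one (int docno, int offset, short fileno) triple per block. A sketch of a reader walking that layout, mirroring the write calls above; the surrounding setup is assumed and WikipediaForwardIndex's real loader may differ in details:

// Hypothetical reader matching the writeUTF/writeInt/writeShort sequence above.
FSDataInputStream in = fs.open(new Path(indexFile));
String indexClass = in.readUTF();      // canonical name of the index class
String collectionPath = in.readUTF(); // absolute path of the SequenceFiles
int nblocks = in.readInt();
int[] docnos = new int[nblocks];
int[] offsets = new int[nblocks];
short[] filenos = new short[nblocks];
for (int i = 0; i < nblocks; i++) {
  docnos[i] = in.readInt();     // first docno in the block
  offsets[i] = in.readInt();    // byte offset of the block
  filenos[i] = in.readShort();  // which part-* file holds the block
}
in.close();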
Use of org.apache.hadoop.util.LineReader in project hadoop by apache.
From the class NLineInputFormat, method getSplitsForFile:
public static List<FileSplit> getSplitsForFile(FileStatus status,
    Configuration conf, int numLinesPerSplit) throws IOException {
  List<FileSplit> splits = new ArrayList<FileSplit>();
  Path fileName = status.getPath();
  if (status.isDirectory()) {
    throw new IOException("Not a file: " + fileName);
  }
  FileSystem fs = fileName.getFileSystem(conf);
  LineReader lr = null;
  try {
    FSDataInputStream in = fs.open(fileName);
    lr = new LineReader(in, conf);
    Text line = new Text();
    int numLines = 0;
    long begin = 0;
    long length = 0;
    int num = -1;
    while ((num = lr.readLine(line)) > 0) {
      numLines++;
      length += num;
      if (numLines == numLinesPerSplit) {
        splits.add(createFileSplit(fileName, begin, length));
        begin += length;
        length = 0;
        numLines = 0;
      }
    }
    if (numLines != 0) {
      splits.add(createFileSplit(fileName, begin, length));
    }
  } finally {
    if (lr != null) {
      lr.close();
    }
  }
  return splits;
}
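In practice you would not call getSplitsForFile directly; the framework invokes it when NLineInputFormat is configured on a job. A minimal configuration sketch, where the job name and input path are placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

Job job = Job.getInstance(conf, "nline-example");   // conf: an existing Configuration
job.setInputFormatClass(NLineInputFormat.class);
NLineInputFormat.addInputPath(job, new Path("/data/input.txt"));  // placeholder path
NLineInputFormat.setNumLinesPerSplit(job, 100);     // each mapper receives 100 lines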
Use of org.apache.hadoop.util.LineReader in project hadoop by apache.
From the class TestTextInputFormat, method testNewLines:
/**
 * Test readLine for various kinds of line termination sequences.
 * Varies buffer size to stress test. Also check that returned
 * value matches the string length.
 *
 * @throws Exception
 */
@Test(timeout = 5000)
public void testNewLines() throws Exception {
  final String STR = "a\nbb\n\nccc\rdddd\r\r\r\n\r\neeeee";
  final int STRLENBYTES = STR.getBytes().length;
  Text out = new Text();
  for (int bufsz = 1; bufsz < STRLENBYTES + 1; ++bufsz) {
    LineReader in = makeStream(STR, bufsz);
    int c = 0;
    // "a"\n
    c += in.readLine(out);
    assertEquals("line1 length, bufsz:" + bufsz, 1, out.getLength());
    // "bb"\n
    c += in.readLine(out);
    assertEquals("line2 length, bufsz:" + bufsz, 2, out.getLength());
    // ""\n
    c += in.readLine(out);
    assertEquals("line3 length, bufsz:" + bufsz, 0, out.getLength());
    // "ccc"\r
    c += in.readLine(out);
    assertEquals("line4 length, bufsz:" + bufsz, 3, out.getLength());
    // "dddd"\r
    c += in.readLine(out);
    assertEquals("line5 length, bufsz:" + bufsz, 4, out.getLength());
    // ""\r
    c += in.readLine(out);
    assertEquals("line6 length, bufsz:" + bufsz, 0, out.getLength());
    // ""\r\n
    c += in.readLine(out);
    assertEquals("line7 length, bufsz:" + bufsz, 0, out.getLength());
    // ""\r\n
    c += in.readLine(out);
    assertEquals("line8 length, bufsz:" + bufsz, 0, out.getLength());
    // "eeeee" followed by EOF
    c += in.readLine(out);
    assertEquals("line9 length, bufsz:" + bufsz, 5, out.getLength());
    assertEquals("end of file, bufsz: " + bufsz, 0, in.readLine(out));
    assertEquals("total bytes, bufsz: " + bufsz, c, STRLENBYTES);
  }
}
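The helper makeStream is not shown in this snippet. Presumably it wraps the test string in an in-memory stream so LineReader can be exercised with an arbitrary internal buffer size; LineReader(InputStream, int) is a real constructor, but the helper body below is an assumption:

// Presumed test helper (not part of the snippet above): builds a LineReader
// over an in-memory byte stream with the requested buffer size.
private static LineReader makeStream(String str, int bufsz) throws IOException {
  return new LineReader(
      new java.io.ByteArrayInputStream(str.getBytes(java.nio.charset.StandardCharsets.UTF_8)),
      bufsz);
}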