
Example 26 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The writeMappingData method of the TrecDocnoMapping class.

/**
   * Creates a mappings file from the contents of a flat text file containing docid to docno
   * mappings. This method is used by {@link TrecDocnoMappingBuilder} internally.
   *
   * @param input flat text file containing docid to docno mappings
   * @param output output mappings data file
   * @param fs {@code FileSystem} to write to
   * @throws IOException
   */
public static void writeMappingData(Path input, Path output, FileSystem fs) throws IOException {
    Preconditions.checkNotNull(input);
    Preconditions.checkNotNull(output);
    Preconditions.checkNotNull(fs);
    LOG.info("Writing docno data to " + output);
    LineReader reader = new LineReader(fs.open(input));
    List<String> list = Lists.newArrayList();
    LOG.info("Reading " + input);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        list.add(arr[0]);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    LOG.info(cnt + " docs total. Done!");
    cnt = 0;
    LOG.info("Writing " + output);
    FSDataOutputStream out = fs.create(output, true);
    out.writeInt(list.size());
    for (int i = 0; i < list.size(); i++) {
        out.writeUTF(list.get(i));
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    out.close();
    LOG.info(cnt + " docs total. Done!");
}
Also used : LineReader(org.apache.hadoop.util.LineReader) Text(org.apache.hadoop.io.Text) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)
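
A minimal usage sketch for this method, assuming the standard Hadoop Configuration, FileSystem, and Path imports; the two paths are hypothetical placeholders for a tab-separated docid/docno file and the binary mapping file to be written:

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
// Hypothetical locations on HDFS.
Path input = new Path("/collections/trec/docno-mapping.txt");
Path output = new Path("/collections/trec/docno-mapping.dat");
// Reads the first (tab-separated) column of each input line and writes the
// docids out as a count-prefixed sequence of writeUTF strings.
TrecDocnoMapping.writeMappingData(input, output, fs);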

Example 27 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The writeDocnoMappingData method of the WikipediaDocnoMapping class.

/**
   * Creates a mappings file from the contents of a flat text file containing docid to docno
   * mappings. This method is used by {@link WikipediaDocnoMappingBuilder} internally.
   *
   * @param fs {@code FileSystem} to read the input from and write the output to
   * @param inputFile flat text file containing docid to docno mappings
   * @param n number of docid to docno mappings in the input file
   * @param outputFile output mappings file
   * @throws IOException
   */
public static void writeDocnoMappingData(FileSystem fs, String inputFile, int n, String outputFile) throws IOException {
    LOG.info("Writing " + n + " docids to " + outputFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    int cnt = 0;
    Text line = new Text();
    FSDataOutputStream out = fs.create(new Path(outputFile), true);
    out.writeInt(n);
    for (int i = 0; i < n; i++) {
        reader.readLine(line);
        String[] arr = line.toString().split("\\t");
        out.writeInt(Integer.parseInt(arr[0]));
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " articles");
        }
    }
    out.close();
    LOG.info("Done!");
}
Also used : Path(org.apache.hadoop.fs.Path) LineReader(org.apache.hadoop.util.LineReader) Text(org.apache.hadoop.io.Text) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)
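
A usage sketch along the same lines, assuming the flat mapping file already exists and that its line count n is known; the paths and the count below are hypothetical:

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
String inputFile = "/wikipedia/enwiki-docno-mapping.txt";   // hypothetical tab-separated docid/docno file
String outputFile = "/wikipedia/enwiki-docno-mapping.dat";  // hypothetical binary output file
int n = 5000000;  // hypothetical number of mappings in the input file
WikipediaDocnoMapping.writeDocnoMappingData(fs, inputFile, n, outputFile);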

Example 28 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The run method of the WikipediaForwardIndexBuilder class.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);
    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }
    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }
    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);
    if (language != null) {
        conf.set("wiki.language", language);
    }
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    // Clean up.
    fs.delete(new Path(tmpPath), true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
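
The method above overrides run from Hadoop's Tool interface (note getConf() and ToolRunner.printGenericCommandUsage), so the builder is normally launched through ToolRunner. A sketch, assuming the option constants INPUT_OPTION, INDEX_FILE_OPTION, and LANGUAGE_OPTION resolve to the flag names "input", "index_file", and "language", and with hypothetical paths:

String[] toolArgs = new String[] {
    "-input", "/wikipedia/enwiki.block.seq",        // block-compressed SequenceFile of pages
    "-index_file", "/wikipedia/enwiki-findex.dat",  // forward-index file to create
    "-language", "en" };
int exitCode = ToolRunner.run(new Configuration(), new WikipediaForwardIndexBuilder(), toolArgs);
System.exit(exitCode);

The temporary job output directory (tmp-WikipediaForwardIndexBuilder-*) is deleted by run() itself, so only the index file remains afterwards.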

Example 29 with LineReader

Use of org.apache.hadoop.util.LineReader in project hadoop by apache.

The getSplitsForFile method of the NLineInputFormat class.

public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit) throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDirectory()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                splits.add(createFileSplit(fileName, begin, length));
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(createFileSplit(fileName, begin, length));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) ArrayList(java.util.ArrayList) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException)
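
A sketch of calling this helper directly, outside of a running job; the file path and the 1,000-line split size are placeholders:

Configuration conf = new Configuration();
Path file = new Path("/data/queries.txt");  // hypothetical line-oriented input file
FileSystem fs = file.getFileSystem(conf);
FileStatus status = fs.getFileStatus(file);
// Build splits of roughly 1,000 input lines each.
List<FileSplit> splits = NLineInputFormat.getSplitsForFile(status, conf, 1000);
for (FileSplit split : splits) {
    System.out.println(split.getPath() + ": start=" + split.getStart() + ", length=" + split.getLength());
}

In a regular MapReduce job the per-split line count would instead be configured on the Job via NLineInputFormat.setNumLinesPerSplit().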

Example 30 with LineReader

Use of org.apache.hadoop.util.LineReader in project hadoop by apache.

The testNewLines method of the TestTextInputFormat class.

/**
   * Test readLine for various kinds of line termination sequences.
   * Varies buffer size to stress test.  Also checks that the returned
   * value matches the string length.
   *
   * @throws Exception
   */
@Test(timeout = 5000)
public void testNewLines() throws Exception {
    final String STR = "a\nbb\n\nccc\rdddd\r\r\r\n\r\neeeee";
    final int STRLENBYTES = STR.getBytes().length;
    Text out = new Text();
    for (int bufsz = 1; bufsz < STRLENBYTES + 1; ++bufsz) {
        LineReader in = makeStream(STR, bufsz);
        int c = 0;
        //"a"\n
        c += in.readLine(out);
        assertEquals("line1 length, bufsz:" + bufsz, 1, out.getLength());
        //"bb"\n
        c += in.readLine(out);
        assertEquals("line2 length, bufsz:" + bufsz, 2, out.getLength());
        //""\n
        c += in.readLine(out);
        assertEquals("line3 length, bufsz:" + bufsz, 0, out.getLength());
        //"ccc"\r
        c += in.readLine(out);
        assertEquals("line4 length, bufsz:" + bufsz, 3, out.getLength());
        //dddd\r
        c += in.readLine(out);
        assertEquals("line5 length, bufsz:" + bufsz, 4, out.getLength());
        //""\r
        c += in.readLine(out);
        assertEquals("line6 length, bufsz:" + bufsz, 0, out.getLength());
        //""\r\n
        c += in.readLine(out);
        assertEquals("line7 length, bufsz:" + bufsz, 0, out.getLength());
        //""\r\n
        c += in.readLine(out);
        assertEquals("line8 length, bufsz:" + bufsz, 0, out.getLength());
        //"eeeee"EOF
        c += in.readLine(out);
        assertEquals("line9 length, bufsz:" + bufsz, 5, out.getLength());
        assertEquals("end of file, bufsz: " + bufsz, 0, in.readLine(out));
        assertEquals("total bytes, bufsz: " + bufsz, c, STRLENBYTES);
    }
}
Also used : LineReader(org.apache.hadoop.util.LineReader) Text(org.apache.hadoop.io.Text) Test(org.junit.Test)
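
The makeStream helper that the test relies on is not shown in this excerpt. A plausible implementation (an assumption, not necessarily the test's actual code) backs the reader with an in-memory stream and uses the LineReader(InputStream, int) constructor so the buffer size can be varied:

private static LineReader makeStream(String str, int bufsz) throws IOException {
    // Wrap the string's bytes so every buffer size from 1 upward can be exercised.
    return new LineReader(
        new ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8)), bufsz);
}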

Aggregations

LineReader (org.apache.hadoop.util.LineReader) 36
Text (org.apache.hadoop.io.Text) 31
Path (org.apache.hadoop.fs.Path) 15
FileSystem (org.apache.hadoop.fs.FileSystem) 14
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream) 11
Test (org.junit.Test) 10
Configuration (org.apache.hadoop.conf.Configuration) 5
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream) 4
Counters (org.apache.hadoop.mapred.Counters) 4
JobConf (org.apache.hadoop.mapred.JobConf) 4
RunningJob (org.apache.hadoop.mapred.RunningJob) 4
IOException (java.io.IOException) 3
ArrayList (java.util.ArrayList) 3
CommandLine (org.apache.commons.cli.CommandLine) 3
CommandLineParser (org.apache.commons.cli.CommandLineParser) 3
GnuParser (org.apache.commons.cli.GnuParser) 3
HelpFormatter (org.apache.commons.cli.HelpFormatter) 3
Options (org.apache.commons.cli.Options) 3
ParseException (org.apache.commons.cli.ParseException) 3
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec) 3