
Example 21 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The class ClueWarcForwardIndexBuilder, method run:

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path (must be block-compressed SequenceFiles)").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    Random random = new Random();
    Path outputPath = new Path("tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000));
    conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);
    // Add the SequenceFiles one by one; if we add the directory itself, the input format
    // thinks it's a MapFile.
    for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
        FileInputFormat.addInputPath(conf, status.getPath());
    }
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    fs.delete(outputPath, true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    fs.delete(outputPath, true);
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Options (org.apache.commons.cli.Options), FileStatus (org.apache.hadoop.fs.FileStatus), GnuParser (org.apache.commons.cli.GnuParser), Text (org.apache.hadoop.io.Text), HelpFormatter (org.apache.commons.cli.HelpFormatter), CommandLine (org.apache.commons.cli.CommandLine), Random (java.util.Random), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), RunningJob (org.apache.hadoop.mapred.RunningJob), Counters (org.apache.hadoop.mapred.Counters), CommandLineParser (org.apache.commons.cli.CommandLineParser), ParseException (org.apache.commons.cli.ParseException), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), JobConf (org.apache.hadoop.mapred.JobConf)
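
The index file written above is a simple binary layout: a UTF class name, a UTF collection path, an int block count, then one (docno, offset, fileno) triple per block. As a minimal sketch of how such a file could be read back (the ClueWarcIndexDump class is hypothetical, not part of Cloud9):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical reader mirroring the writes in run() above.
public class ClueWarcIndexDump {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(new Path(args[0])); // path to the index file
        String indexClass = in.readUTF();  // forward-index class name
        String collection = in.readUTF();  // collection path
        int blocks = in.readInt();         // number of blocks
        System.out.println(indexClass + " over " + collection + ": " + blocks + " blocks");
        for (int i = 0; i < blocks; i++) {
            int docno = in.readInt();      // docno field of the triple
            int offset = in.readInt();     // byte offset of the block
            short fileno = in.readShort(); // which SequenceFile the block lives in
            // ... use (docno, offset, fileno) ...
        }
        in.close();
    }
}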

Example 22 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The class Aquaint2DocnoMapping, method writeDocnoData:

public static void writeDocnoData(Path input, Path output, FileSystem fs) throws IOException {
    LOG.info("Writing docno data to " + output);
    LineReader reader = new LineReader(fs.open(input));
    List<String> list = Lists.newArrayList();
    LOG.info("Reading " + input);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        list.add(arr[0]);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    LOG.info(cnt + " docs total. Done!");
    cnt = 0;
    LOG.info("Writing " + output);
    FSDataOutputStream out = fs.create(output, true);
    out.writeInt(list.size());
    for (int i = 0; i < list.size(); i++) {
        out.writeUTF(list.get(i));
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    out.close();
    LOG.info(cnt + " docs total. Done!");
}
Also used: LineReader (org.apache.hadoop.util.LineReader), Text (org.apache.hadoop.io.Text), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)
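
Reading this data file back mirrors the writes: an int count followed by that many UTF-encoded docids in docno order. A minimal sketch (the DocnoDataReader class and method name are assumptions, not Cloud9 API):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.Lists;

// Hypothetical counterpart to writeDocnoData() above.
public class DocnoDataReader {
    public static List<String> readDocnoData(Path input, FileSystem fs) throws IOException {
        FSDataInputStream in = fs.open(input);
        int sz = in.readInt();            // number of docids written
        List<String> docids = Lists.newArrayList();
        for (int i = 0; i < sz; i++) {
            docids.add(in.readUTF());     // docids in docno order
        }
        in.close();
        return docids;
    }
}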

Example 23 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The class Aquaint2ForwardIndexBuilder, method runTool:

public int runTool(Configuration config, String collectionPath, String outputPath, String indexFile, String mappingFile) throws Exception {
    //sLogger.error ("getConf(): " + getConf() + ", DemoCountAquaint2Documents.class: " + DemoCountAquaint2Documents.class);
    JobConf conf = new JobConf(config, DemoCountAquaint2Documents.class);
    FileSystem fs = FileSystem.get(config);
    sLogger.info("Tool name: BuildAquaint2ForwardIndex");
    sLogger.info(" - collection path: " + collectionPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - index file: " + indexFile);
    sLogger.info(" - mapping file: " + mappingFile);
    conf.setJobName("BuildAquaint2ForwardIndex");
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    conf.setNumReduceTasks(1);
    // constant first: avoids an NPE if mapred.job.tracker is unset
    if ("local".equals(conf.get("mapred.job.tracker"))) {
        conf.set("DocnoMappingFile", mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }
    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(Aquaint2DocumentInputFormatOld.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();
    String inputFile = outputPath + "/" + "part-00000";
    sLogger.info("Writing " + numDocs + " doc offsets to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    writer.writeUTF("edu.umd.cloud9.collection.aquaint2.Aquaint2ForwardIndex");
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);
        // sLogger.info(arr[0] + " " + offset + " " + len);
        writer.writeLong(offset);
        writer.writeInt(len);
        cnt++;
        if (cnt % 100000 == 0) {
            sLogger.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    sLogger.info(cnt + " docs total. Done!");
    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Text (org.apache.hadoop.io.Text), URI (java.net.URI), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), RunningJob (org.apache.hadoop.mapred.RunningJob), Counters (org.apache.hadoop.mapred.Counters), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), JobConf (org.apache.hadoop.mapred.JobConf)
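
The forward-index file written here follows the same header-plus-records pattern: class name, collection path, document count, then a (long offset, int length) pair per document. A sketch of a matching reader (the Aquaint2IndexDump class is hypothetical, for illustration only):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical reader mirroring the writes in runTool() above.
public class Aquaint2IndexDump {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(new Path(args[0])); // path to the index file
        String indexClass = in.readUTF();  // forward-index class name
        String collection = in.readUTF();  // collection path
        int numDocs = in.readInt();        // number of documents
        for (int i = 0; i < numDocs; i++) {
            long offset = in.readLong();   // byte offset of the document
            int len = in.readInt();        // length of the document in bytes
            // ... seek to offset in the collection and read len bytes ...
        }
        in.close();
    }
}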

Example 24 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The class TrecWebDocnoMappingBuilder, method writeMappingData:

private static void writeMappingData(Path input, Path output, FileSystem fs) throws IOException {
    LOG.info("Writing docids to " + output);
    LineReader reader = new LineReader(fs.open(input));
    LOG.info("Reading " + input);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        cnt++;
    }
    reader.close();
    LOG.info("Done!");
    LOG.info("Writing " + output);
    FSDataOutputStream out = fs.create(output, true);
    reader = new LineReader(fs.open(input));
    out.writeInt(cnt);
    cnt = 0;
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        out.writeUTF(arr[0]);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " documents");
        }
    }
    reader.close();
    out.close();
    LOG.info("Done! " + cnt + " documents total.");
}
Also used: LineReader (org.apache.hadoop.util.LineReader), Text (org.apache.hadoop.io.Text), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)
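
Note the design difference from Example 22 above (and Example 25 below): this method makes two passes over the input so the record count can be written as a header before the records, without buffering every docid in memory. If memory is not a concern, a single-pass variant in the style of the other examples might look like this (a sketch under that assumption, not the Cloud9 implementation):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

import com.google.common.collect.Lists;

// Hypothetical single-pass alternative to writeMappingData() above.
public class SinglePassMappingWriter {
    public static void writeMappingData(Path input, Path output, FileSystem fs) throws IOException {
        LineReader reader = new LineReader(fs.open(input));
        List<String> docids = Lists.newArrayList();
        Text line = new Text();
        while (reader.readLine(line) > 0) {
            docids.add(line.toString().split("\\t")[0]); // first tab-separated field is the docid
        }
        reader.close();
        FSDataOutputStream out = fs.create(output, true);
        out.writeInt(docids.size());  // record count header
        for (String docid : docids) {
            out.writeUTF(docid);
        }
        out.close();
    }
}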

Example 25 with LineReader

Use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

The class MedlineDocnoMapping, method writeMappingData:

/**
   * Creates a mappings file from the contents of a flat text file containing docid to docno
   * mappings. This method is used by {@link MedlineDocnoMappingBuilder} internally.
   *
   * @param input flat text file containing docid to docno mappings
   * @param output output mappings file
   * @param fs reference to the file system
   * @throws IOException
   */
public static void writeMappingData(Path input, Path output, FileSystem fs) throws IOException {
    Preconditions.checkNotNull(input);
    Preconditions.checkNotNull(output);
    Preconditions.checkNotNull(fs);
    LOG.info("Writing docids to " + output);
    LineReader reader = new LineReader(fs.open(input));
    List<Integer> list = Lists.newArrayList();
    LOG.info("Reading " + input);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        list.add(Integer.parseInt(arr[0]));
        cnt++;
        if (cnt % 500000 == 0) {
            LOG.info(cnt);
        }
    }
    reader.close();
    LOG.info("Done! Total of " + cnt + " docids read.");
    cnt = 0;
    LOG.info("Writing " + output);
    FSDataOutputStream out = fs.create(output, true);
    out.writeInt(list.size());
    for (int i = 0; i < list.size(); i++) {
        out.writeInt(list.get(i));
        cnt++;
        if (cnt % 500000 == 0) {
            LOG.info(cnt);
        }
    }
    out.close();
    LOG.info("Done! Total of " + cnt + " docids written.");
}
Also used: LineReader (org.apache.hadoop.util.LineReader), Text (org.apache.hadoop.io.Text), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)
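
Since the docids here are ints, the whole mapping can be read back into an int array. A minimal sketch of the reverse of writeMappingData() (the MedlineMappingReader class is an assumption, not part of MedlineDocnoMapping):

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical counterpart to writeMappingData() above.
public class MedlineMappingReader {
    public static int[] readMappingData(Path input, FileSystem fs) throws IOException {
        FSDataInputStream in = fs.open(input);
        int sz = in.readInt();           // number of docids written
        int[] docids = new int[sz];
        for (int i = 0; i < sz; i++) {
            docids[i] = in.readInt();    // docids in docno order
        }
        in.close();
        return docids;
    }
}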

Aggregations

LineReader (org.apache.hadoop.util.LineReader): 36 uses
Text (org.apache.hadoop.io.Text): 31 uses
Path (org.apache.hadoop.fs.Path): 15 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 14 uses
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 11 uses
Test (org.junit.Test): 10 uses
Configuration (org.apache.hadoop.conf.Configuration): 5 uses
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 uses
Counters (org.apache.hadoop.mapred.Counters): 4 uses
JobConf (org.apache.hadoop.mapred.JobConf): 4 uses
RunningJob (org.apache.hadoop.mapred.RunningJob): 4 uses
IOException (java.io.IOException): 3 uses
ArrayList (java.util.ArrayList): 3 uses
CommandLine (org.apache.commons.cli.CommandLine): 3 uses
CommandLineParser (org.apache.commons.cli.CommandLineParser): 3 uses
GnuParser (org.apache.commons.cli.GnuParser): 3 uses
HelpFormatter (org.apache.commons.cli.HelpFormatter): 3 uses
Options (org.apache.commons.cli.Options): 3 uses
ParseException (org.apache.commons.cli.ParseException): 3 uses
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 3 uses