
Example 16 with LineReader

use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

the class TrecForwardIndexBuilder method run.

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) DocnoMapping data").create(MAPPING_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION) || !cmdline.hasOption(MAPPING_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    String tmpDir = "tmp-" + TrecForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000);
    Job job = new Job(getConf(), TrecForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    job.setJarByClass(TrecForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(getConf());
    LOG.info("Tool name: " + TrecForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - DocnoMapping file: " + mappingFile);
    LOG.info(" - temp output directory: " + tmpDir);
    job.setNumReduceTasks(1);
    if (job.getConfiguration().get("mapred.job.tracker").equals("local")) {
        job.getConfiguration().set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), job.getConfiguration());
    }
    FileInputFormat.setInputPaths(job, new Path(collectionPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);
    job.setInputFormatClass(TrecDocumentInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    // delete the output directory if it exists already
    FileSystem.get(getConf()).delete(new Path(tmpDir), true);
    job.waitForCompletion(true);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getValue();
    String inputFile = tmpDir + "/" + "part-r-00000";
    LOG.info("Writing " + numDocs + " doc offseta to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);
        writer.writeLong(offset);
        writer.writeInt(len);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");
    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }
    fs.delete(new Path(tmpDir), true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) URI(java.net.URI) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) Counters(org.apache.hadoop.mapreduce.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Job(org.apache.hadoop.mapreduce.Job)
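For reference, a minimal sketch of how the index file written by run() above could be read back, simply mirroring the write order (UTF class name, UTF collection path, int document count, then a long offset and an int length per document). The method and variable names here are illustrative, not part of Cloud9:

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative reader that mirrors the write order used in run() above.
public static void readForwardIndex(FileSystem fs, String indexFile) throws IOException {
    FSDataInputStream in = fs.open(new Path(indexFile));
    // Header: index class name, collection path, number of documents.
    String indexClass = in.readUTF();
    String collectionPath = in.readUTF();
    int numDocs = in.readInt();
    long[] offsets = new long[numDocs];
    int[] lengths = new int[numDocs];
    // Per-document records: a long byte offset followed by an int length.
    for (int i = 0; i < numDocs; i++) {
        offsets[i] = in.readLong();
        lengths[i] = in.readInt();
    }
    in.close();
}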

Example 17 with LineReader

use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

the class TextDocnoMapping method writeDocnoData.

/**
	 * Creates a mappings file from the contents of a flat text file containing
	 * docid to docno mappings. This method is used by
	 * {@link TrecDocnoMappingBuilder} internally.
	 * 
	 * @param inputFile
	 *            flat text file containing docid to docno mappings
	 * @param outputFile
	 *            output mappings file
	 * @param fs
	 *            FileSystem to write to
	 * @throws IOException
	 */
public static void writeDocnoData(String inputFile, String outputFile, FileSystem fs) throws IOException {
    sLogger.info("Writing docno data to " + outputFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    List<String> list = new ArrayList<String>();
    sLogger.info("Reading " + inputFile);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        list.add(arr[0]);
        cnt++;
        if (cnt % 100000 == 0) {
            sLogger.info(cnt + " docs");
        }
    }
    reader.close();
    sLogger.info(cnt + " docs total. Done!");
    cnt = 0;
    sLogger.info("Writing " + outputFile);
    FSDataOutputStream out = fs.create(new Path(outputFile), true);
    out.writeInt(list.size());
    for (int i = 0; i < list.size(); i++) {
        out.writeUTF(list.get(i));
        cnt++;
        if (cnt % 100000 == 0) {
            sLogger.info(cnt + " docs");
        }
    }
    out.close();
    sLogger.info(cnt + " docs total. Done!");
}
Also used : Path(org.apache.hadoop.fs.Path) LineReader(org.apache.hadoop.util.LineReader) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream)
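A hedged sketch of the read path that would mirror writeDocnoData() above (an int count followed by one UTF-encoded docid per docno). The method name readDocnoData is illustrative and not necessarily what TextDocnoMapping itself uses:

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative counterpart to writeDocnoData(): reads the docid list back in docno order.
public static String[] readDocnoData(String dataFile, FileSystem fs) throws IOException {
    FSDataInputStream in = fs.open(new Path(dataFile));
    // writeDocnoData() writes the list size first, then one UTF string per docid.
    int n = in.readInt();
    String[] docids = new String[n];
    for (int i = 0; i < n; i++) {
        docids[i] = in.readUTF();
    }
    in.close();
    return docids;
}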

Example 18 with LineReader

use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

the class ClueWarcDocnoMapping method loadMapping.

@Override
public void loadMapping(Path p, FileSystem fs) throws IOException {
    LineReader reader = new LineReader(fs.open(p));
    Text t = new Text();
    int cnt = 0;
    String prevSec = null;
    while (reader.readLine(t) > 0) {
        String[] arr = t.toString().split(",");
        if (prevSec == null || !arr[0].equals(prevSec)) {
            subdirMapping.put(arr[0], cnt);
        }
        offets[cnt] = Integer.parseInt(arr[3]);
        prevSec = arr[0];
        cnt++;
    }
    reader.close();
}
Also used : LineReader(org.apache.hadoop.util.LineReader) Text(org.apache.hadoop.io.Text)
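The same LineReader idiom recurs throughout these examples: open an FSDataInputStream, call readLine(Text) until it returns 0, then close the reader. A minimal, self-contained sketch of that pattern (the helper name readAllLines is illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

// Illustrative helper: read every line of an HDFS file into memory with LineReader.
public static List<String> readAllLines(FileSystem fs, Path path) throws IOException {
    LineReader reader = new LineReader(fs.open(path));
    List<String> lines = new ArrayList<String>();
    Text line = new Text();
    // readLine() returns the number of bytes consumed; 0 signals end of stream.
    while (reader.readLine(line) > 0) {
        lines.add(line.toString());
    }
    reader.close();
    return lines;
}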

Example 19 with LineReader

use of org.apache.hadoop.util.LineReader in project Cloud9 by lintool.

the class BuildAnchorTextForwardIndex method run.

/**
	 * Runs this tool.
	 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }
    JobConf conf = new JobConf(getConf());
    FileSystem fs = FileSystem.get(conf);
    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    LOG.info("Tool name: BuildAnchorTextForwardIndex");
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setJobName("BuildAnchorTextForwardIndex");
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(IndexableAnchorTextForwardIndex.class.getName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 1000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) IndexableAnchorTextForwardIndex(edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
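A hedged sketch of how the index file written above could be consumed, mirroring the write order (UTF class name, UTF collection path, int block count, then an int docno, an int offset, and a short fileno per block). The method name is illustrative:

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative reader mirroring the write order used in run() above.
public static void readAnchorTextIndex(FileSystem fs, String indexFile) throws IOException {
    FSDataInputStream in = fs.open(new Path(indexFile));
    String indexClass = in.readUTF();
    String collectionPath = in.readUTF();
    int blocks = in.readInt();
    for (int i = 0; i < blocks; i++) {
        int docno = in.readInt();       // docno recorded for the block
        int offset = in.readInt();      // byte offset within the SequenceFile
        short fileno = in.readShort();  // which part file the block lives in
        // ... store (docno, offset, fileno) in whatever lookup structure is needed
    }
    in.close();
}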

Example 20 with LineReader

use of org.apache.hadoop.util.LineReader in project jena by apache.

the class AbstractLineBasedNodeTupleReader method initialize.

@Override
public final void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;
    // Intermediate : RDFParser but need to make a Iterator<Quad/Triple>
    LabelToNode labelToNode = RdfIOUtils.createLabelToNode(context, split.getPath());
    maker = new ParserProfileStd(RiotLib.factoryRDF(labelToNode), ErrorHandlerFactory.errorHandlerStd, IRIResolver.create(), PrefixMapFactory.createForInput(), null, true, false);
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and the bad line skipped but no errors will be thrownConsider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    // Figure out what portion of the file to read
    this.maxLineLength = config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    compressionCodecs = new CompressionCodecFactory(config);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", new Object[] { start, split.getLength(), totalLength }));
    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        // Add 1 and verify we got complete split
        if (totalLength > split.getLength() + 1)
            throw new IOException("This record reader can only be used with compressed input where the split covers the whole file");
        in = new LineReader(codec.createInputStream(fileIn), config);
        estLength = end;
        end = Long.MAX_VALUE;
    } else {
        // Uncompressed input
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, config);
    }
    // NLineInputFormat will provide the split information to use
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) LabelToNode(org.apache.jena.riot.lang.LabelToNode) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
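The codec-detection logic above is the standard Hadoop pattern for wrapping possibly-compressed input in a LineReader. A condensed, hedged sketch of just that part (without the split/seek handling), using an illustrative helper name:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.util.LineReader;

// Illustrative helper: open a file and wrap it in a LineReader, decompressing if a codec matches.
public static LineReader openLineReader(Path file, Configuration conf) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    if (codec != null) {
        // Compressed input: LineReader reads the decompressed stream.
        return new LineReader(codec.createInputStream(fileIn), conf);
    }
    // Uncompressed input: read the raw stream directly.
    return new LineReader(fileIn, conf);
}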

Aggregations

LineReader (org.apache.hadoop.util.LineReader): 36 uses
Text (org.apache.hadoop.io.Text): 31 uses
Path (org.apache.hadoop.fs.Path): 15 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 14 uses
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 11 uses
Test (org.junit.Test): 10 uses
Configuration (org.apache.hadoop.conf.Configuration): 5 uses
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4 uses
Counters (org.apache.hadoop.mapred.Counters): 4 uses
JobConf (org.apache.hadoop.mapred.JobConf): 4 uses
RunningJob (org.apache.hadoop.mapred.RunningJob): 4 uses
IOException (java.io.IOException): 3 uses
ArrayList (java.util.ArrayList): 3 uses
CommandLine (org.apache.commons.cli.CommandLine): 3 uses
CommandLineParser (org.apache.commons.cli.CommandLineParser): 3 uses
GnuParser (org.apache.commons.cli.GnuParser): 3 uses
HelpFormatter (org.apache.commons.cli.HelpFormatter): 3 uses
Options (org.apache.commons.cli.Options): 3 uses
ParseException (org.apache.commons.cli.ParseException): 3 uses
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 3 uses