Example 41 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.

The class ClueWarcForwardIndexBuilder, method run.

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path (must be block-compressed SequenceFiles)").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    Random random = new Random();
    Path outputPath = new Path("tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000));
    conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);
    // Add the sequence files one by one; otherwise SequenceFileInputFormat thinks the input is a MapFile.
    for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
        FileInputFormat.addInputPath(conf, status.getPath());
    }
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    fs.delete(outputPath, true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    fs.delete(outputPath, true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) FileStatus(org.apache.hadoop.fs.FileStatus) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) Random(java.util.Random) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
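
The index file written above is a flat binary stream: a class name and collection path via writeUTF, a block count, then one (docno, offset, fileno) triple per block. A minimal sketch of a reader that mirrors those writes (the class and method names here are hypothetical, not part of Cloud9):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ForwardIndexReaderSketch {
    // Hypothetical helper: reads back the header and entries written by run() above.
    public static void dumpIndex(Configuration conf, String indexFile) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream in = fs.open(new Path(indexFile));
        String indexClass = in.readUTF();  // index implementation class name
        String collection = in.readUTF();  // collection path recorded at build time
        int blocks = in.readInt();         // number of (docno, offset, fileno) entries
        System.out.println(indexClass + " over " + collection + ": " + blocks + " blocks");
        for (int i = 0; i < blocks; i++) {
            int docno = in.readInt();      // first docno in the block
            int offset = in.readInt();     // byte offset of the block
            short fileno = in.readShort(); // which part file holds the block
            // ... store or binary-search the entry as needed ...
        }
        in.close();
    }
}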

Example 42 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.

The class CountClueWarcRecords, method run.

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
    options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("DocnoMapping data path").create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file to write the number of records").create(COUNT_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    boolean repacked;
    if (cmdline.hasOption(REPACKED_OPTION)) {
        repacked = true;
    } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
        repacked = false;
    } else {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Expecting either -original or -repacked");
        return -1;
    }
    if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION) || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String path = cmdline.getOptionValue(PATH_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    int segment = 1;
    if (!repacked) {
        segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
    }
    LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
    LOG.info(" - repacked: " + repacked);
    LOG.info(" - path: " + path);
    LOG.info(" - mapping file: " + mappingFile);
    if (!repacked) {
        LOG.info(" - segment number: " + segment);
    }
    FileSystem fs = FileSystem.get(getConf());
    int mapTasks = 10;
    JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
    conf.setJobName(CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    if (repacked) {
        // Add the sequence files one by one; otherwise SequenceFileInputFormat thinks the input is a MapFile.
        for (FileStatus status : fs.listStatus(new Path(path))) {
            FileInputFormat.addInputPath(conf, status.getPath());
        }
    } else {
        ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
    }
    DistributedCache.addCacheFile(new URI(mappingFile), conf);
    if (repacked) {
        conf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        conf.setInputFormat(ClueWarcInputFormat.class);
    }
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");
    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FSDataOutputStream out = fs.create(new Path(f));
        out.write(Integer.toString(numDocs).getBytes());
        out.close();
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) FileStatus(org.apache.hadoop.fs.FileStatus) GnuParser(org.apache.commons.cli.GnuParser) URI(java.net.URI) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) Option(org.apache.commons.cli.Option) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
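
The Records.PAGES counter read by the driver above is populated task-side: counters in the old mapred API are plain Java enums that mappers bump through Reporter.incrCounter. A minimal sketch of that producer side (this enum and mapper are illustrative, not Cloud9's actual MyMapper):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class CountingMapperSketch extends MapReduceBase
        implements Mapper<LongWritable, Text, NullWritable, NullWritable> {
    // The enum's class and constant names become the counter group and name.
    public static enum Records { PAGES }

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
            throws IOException {
        // Increments are summed across all tasks; the driver reads the total
        // with job.getCounters().findCounter(Records.PAGES).getCounter().
        reporter.incrCounter(Records.PAGES, 1);
    }
}

With NullOutputFormat set on the job, as in the example above, the mapper emits nothing; the counter total is the job's only product.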

Example 43 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.

The class Aquaint2ForwardIndexBuilder, method runTool.

public int runTool(Configuration config, String collectionPath, String outputPath, String indexFile, String mappingFile) throws Exception {
    //sLogger.error ("getConf(): " + getConf() + ", DemoCountAquaint2Documents.class: " + DemoCountAquaint2Documents.class);
    JobConf conf = new JobConf(config, DemoCountAquaint2Documents.class);
    FileSystem fs = FileSystem.get(config);
    sLogger.info("Tool name: BuildAquaint2ForwardIndex");
    sLogger.info(" - collection path: " + collectionPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - index file: " + indexFile);
    sLogger.info(" - mapping file: " + mappingFile);
    conf.setJobName("BuildAquaint2ForwardIndex");
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    conf.setNumReduceTasks(1);
    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("DocnoMappingFile", mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }
    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(Aquaint2DocumentInputFormatOld.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();
    String inputFile = outputPath + "/" + "part-00000";
    sLogger.info("Writing " + numDocs + " doc offseta to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    writer.writeUTF("edu.umd.cloud9.collection.aquaint2.Aquaint2ForwardIndex");
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);
        // sLogger.info(arr[0] + " " + offset + " " + len);
        writer.writeLong(offset);
        writer.writeInt(len);
        cnt++;
        if (cnt % 100000 == 0) {
            sLogger.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    sLogger.info(cnt + " docs total. Done!");
    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) URI(java.net.URI) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
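
The mapping file shipped with DistributedCache.addCacheFile above (or passed as "DocnoMappingFile" in local mode) is read back on the task side in configure(). A minimal sketch of that consumer, with the actual parsing left as a hypothetical loadMapping helper:

import java.io.IOException;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class CacheAwareMapperSketch extends MapReduceBase {
    @Override
    public void configure(JobConf job) {
        try {
            // Local paths of every file registered via DistributedCache.addCacheFile().
            Path[] cached = DistributedCache.getLocalCacheFiles(job);
            if (cached != null && cached.length > 0) {
                // loadMapping(cached[0]);                             // hypothetical parser
            } else {
                // Local-mode fallback, mirroring the branch in runTool() above:
                // loadMapping(new Path(job.get("DocnoMappingFile"))); // hypothetical parser
            }
        } catch (IOException e) {
            throw new RuntimeException("Error loading docno mapping", e);
        }
    }
}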

Example 44 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project Cloud9 by lintool.

The class WikipediaForwardIndexBuilder, method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);
    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }
    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }
    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);
    if (language != null) {
        conf.set("wiki.language", language);
    }
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    // Clean up.
    fs.delete(new Path(tmpPath), true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
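
NoSplitSequenceFileInputFormat, used here and in Example 41, exists so that each input file becomes exactly one split and per-file offsets stay stable; this is the standard isSplitable override. A minimal sketch of such a class, assuming Cloud9's version follows the same pattern:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

// One split per file: a single map task reads a whole SequenceFile, so the
// (offset, fileno) pairs it emits are consistent for the forward index.
public class NoSplitSequenceFileInputFormatSketch<K, V>
        extends SequenceFileInputFormat<K, V> {
    @Override
    protected boolean isSplitable(FileSystem fs, Path filename) {
        return false;
    }
}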

Example 45 with RunningJob

Use of org.apache.hadoop.mapred.RunningJob in project hadoop by apache.

The class TestMultithreadedMapRunner, method run.

private void run(boolean ioEx, boolean rtEx) throws Exception {
    Path inDir = new Path("testing/mt/input");
    Path outDir = new Path("testing/mt/output");
    // Hack for local FS that does not have the concept of a 'mounting point'
    if (isLocalFS()) {
        String localPathRoot = System.getProperty("test.build.data", "/tmp").replace(' ', '+');
        inDir = new Path(localPathRoot, inDir);
        outDir = new Path(localPathRoot, outDir);
    }
    JobConf conf = createJobConf();
    FileSystem fs = FileSystem.get(conf);
    fs.delete(outDir, true);
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Mkdirs failed to create " + inDir.toString());
    }
    {
        DataOutputStream file = fs.create(new Path(inDir, "part-0"));
        file.writeBytes("a\nb\n\nc\nd\ne");
        file.close();
    }
    conf.setJobName("mt");
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(LongWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IDMap.class);
    conf.setReducerClass(IDReduce.class);
    FileInputFormat.setInputPaths(conf, inDir);
    FileOutputFormat.setOutputPath(conf, outDir);
    conf.setMapRunnerClass(MultithreadedMapRunner.class);
    conf.setInt(MultithreadedMapper.NUM_THREADS, 2);
    if (ioEx) {
        conf.setBoolean("multithreaded.ioException", true);
    }
    if (rtEx) {
        conf.setBoolean("multithreaded.runtimeException", true);
    }
    JobClient jc = new JobClient(conf);
    RunningJob job = jc.submitJob(conf);
    while (!job.isComplete()) {
        Thread.sleep(100);
    }
    if (job.isSuccessful()) {
        assertFalse(ioEx || rtEx);
    } else {
        assertTrue(ioEx || rtEx);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DataOutputStream(java.io.DataOutputStream) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) IOException(java.io.IOException) JobConf(org.apache.hadoop.mapred.JobConf) JobClient(org.apache.hadoop.mapred.JobClient)
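
Unlike the earlier examples, which use the blocking JobClient.runJob(conf), this test submits with submitJob and polls the RunningJob handle itself. A minimal sketch of that submit-and-poll pattern with progress logging (the sleep interval and log format are arbitrary choices):

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class SubmitAndPollSketch {
    public static boolean runAndWait(JobConf conf) throws Exception {
        JobClient jc = new JobClient(conf);
        RunningJob job = jc.submitJob(conf); // returns immediately, job runs async
        while (!job.isComplete()) {
            System.out.printf("map %.0f%% reduce %.0f%%%n",
                    job.mapProgress() * 100, job.reduceProgress() * 100);
            Thread.sleep(1000);
        }
        return job.isSuccessful(); // false if the job failed or was killed
    }
}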

Aggregations

RunningJob (org.apache.hadoop.mapred.RunningJob): 61
JobConf (org.apache.hadoop.mapred.JobConf): 45
Path (org.apache.hadoop.fs.Path): 35
FileSystem (org.apache.hadoop.fs.FileSystem): 24
JobClient (org.apache.hadoop.mapred.JobClient): 20
IOException (java.io.IOException): 15
Counters (org.apache.hadoop.mapred.Counters): 14
Group (org.apache.hadoop.mapred.Counters.Group): 13
DMLConfig (org.apache.sysml.conf.DMLConfig): 13
Configuration (org.apache.hadoop.conf.Configuration): 7
MatrixChar_N_ReducerGroups (org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups): 7
DataOutputStream (java.io.DataOutputStream): 6
File (java.io.File): 5
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 5
FileStatus (org.apache.hadoop.fs.FileStatus): 5
Text (org.apache.hadoop.io.Text): 5
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 5
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 5
Test (org.junit.Test): 5
URI (java.net.URI): 4