
Example 11 with Counters

Use of org.apache.hadoop.mapred.Counters in project hive by apache.

The class HadoopJobExecHelper, method checkFatalErrors.

public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) {
    if (ctrs == null) {
        // we may still be able to retrieve the job status - so ignore
        return false;
    }
    // check for number of created files
    Counters.Counter cntr = ctrs.findCounter(HiveConf.getVar(job, ConfVars.HIVECOUNTERGROUP), Operator.HIVE_COUNTER_CREATED_FILES);
    long numFiles = cntr != null ? cntr.getValue() : 0;
    long upperLimit = HiveConf.getLongVar(job, HiveConf.ConfVars.MAXCREATEDFILES);
    if (numFiles > upperLimit) {
        errMsg.append("total number of created files now is " + numFiles + ", which exceeds ").append(upperLimit);
        return true;
    }
    return this.callBackObj.checkFatalErrors(ctrs, errMsg);
}
Also used : Counters(org.apache.hadoop.mapred.Counters) Counter(org.apache.hadoop.mapred.Counters.Counter)
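
A minimal calling sketch (the helper and rj variables below are placeholder names, not taken from the Hive source): counters are pulled from a RunningJob and handed to checkFatalErrors, and the job is stopped if a fatal condition is reported.

// Sketch only; assumes a HadoopJobExecHelper 'helper' and a RunningJob 'rj' are in scope,
// and that the enclosing method declares throws IOException.
StringBuilder errMsg = new StringBuilder();
Counters ctrs = rj.getCounters();   // may be null while the job status is still retrievable
if (helper.checkFatalErrors(ctrs, errMsg)) {
    rj.killJob();                   // fatal condition, e.g. too many created files
    System.err.println("Fatal error: " + errMsg);
}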

Example 12 with Counters

Use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

The class ClueWarcForwardIndexBuilder, method run.

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path (must be block-compressed SequenceFiles)").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path").create(INDEX_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_OPTION);
    LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    Random random = new Random();
    Path outputPath = new Path("tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000));
    conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath);
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);
    // Add the input files one by one; pointing at the directory itself would make the
    // framework think it is a MapFile.
    for (FileStatus status : fs.listStatus(new Path(collectionPath))) {
        FileInputFormat.addInputPath(conf, status.getPath());
    }
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    fs.delete(outputPath, true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    fs.delete(outputPath, true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) FileStatus(org.apache.hadoop.fs.FileStatus) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) Random(java.util.Random) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
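
The index file written above has a small binary layout: a UTF string with the forward-index class name, a UTF string with the collection path, an int block count, and then one (int docno, int offset, short fileno) triple per block. A read-back sketch, assuming the same fs and indexFile variables (the loop body is illustrative, not part of Cloud9):

// Sketch: reads the file produced by ClueWarcForwardIndexBuilder.
FSDataInputStream in = fs.open(new Path(indexFile));
String indexClass = in.readUTF();   // ClueWarcForwardIndex class name
String collection = in.readUTF();   // collection path recorded at build time
int numBlocks = in.readInt();
for (int i = 0; i < numBlocks; i++) {
    int docno = in.readInt();
    int offset = in.readInt();
    short fileno = in.readShort();
    // store (docno, offset, fileno) in whatever lookup structure is needed
}
in.close();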

Example 13 with Counters

Use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

The class CountClueWarcRecords, method run.

/**
   * Runs this tool.
   */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(ORIGINAL_OPTION, "use original ClueWeb09 distribution"));
    options.addOption(new Option(REPACKED_OPTION, "use repacked SequenceFiles"));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("path: base path for 'original', actual path for 'repacked'").create(PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("DocnoMapping data path").create(MAPPING_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("segment number (required if 'original')").create(SEGMENT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file to write the number of records").create(COUNT_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    boolean repacked;
    if (cmdline.hasOption(REPACKED_OPTION)) {
        repacked = true;
    } else if (cmdline.hasOption(ORIGINAL_OPTION)) {
        repacked = false;
    } else {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Expecting either -original or -repacked");
        return -1;
    }
    if (!cmdline.hasOption(PATH_OPTION) || !cmdline.hasOption(MAPPING_OPTION) || (!repacked && !cmdline.hasOption(SEGMENT_OPTION))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    String path = cmdline.getOptionValue(PATH_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_OPTION);
    int segment = 1;
    if (!repacked) {
        segment = Integer.parseInt(cmdline.getOptionValue(SEGMENT_OPTION));
    }
    LOG.info("Tool name: " + CountClueWarcRecords.class.getSimpleName());
    LOG.info(" - repacked: " + repacked);
    LOG.info(" - path: " + path);
    LOG.info(" - mapping file: " + mappingFile);
    if (!repacked) {
        LOG.info(" - segment number: " + segment);
    }
    FileSystem fs = FileSystem.get(getConf());
    int mapTasks = 10;
    JobConf conf = new JobConf(getConf(), CountClueWarcRecords.class);
    conf.setJobName(CountClueWarcRecords.class.getSimpleName() + (repacked ? ":" + path : ":segment" + segment));
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    if (repacked) {
        // Add the input files one by one; pointing at the directory itself would make the
        // framework think it is a MapFile.
        for (FileStatus status : fs.listStatus(new Path(path))) {
            FileInputFormat.addInputPath(conf, status.getPath());
        }
    } else {
        ClueCollectionPathConstants.addEnglishCollectionPart(conf, path, segment);
    }
    DistributedCache.addCacheFile(new URI(mappingFile), conf);
    if (repacked) {
        conf.setInputFormat(SequenceFileInputFormat.class);
    } else {
        conf.setInputFormat(ClueWarcInputFormat.class);
    }
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");
    if (cmdline.hasOption(COUNT_OPTION)) {
        String f = cmdline.getOptionValue(COUNT_OPTION);
        FSDataOutputStream out = fs.create(new Path(f));
        out.write(Integer.toString(numDocs).getBytes());
        out.close();
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) FileStatus(org.apache.hadoop.fs.FileStatus) GnuParser(org.apache.commons.cli.GnuParser) URI(java.net.URI) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) Option(org.apache.commons.cli.Option) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
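
Records.PAGES, read from the job counters above, is an ordinary enum-backed counter; in the old mapred API the map side increments it through the Reporter. The mapper body is not shown in this listing, so the following is only an assumed sketch of what MyMapper might do (imports from org.apache.hadoop.io and org.apache.hadoop.mapred omitted, as in the examples above):

// Assumed sketch; the real MyMapper in Cloud9 may differ. Key/value types depend on the
// chosen input format, so plain Writable is used here.
public static enum Records { PAGES }

public static class MyMapper extends MapReduceBase
        implements Mapper<Writable, Writable, NullWritable, NullWritable> {
    @Override
    public void map(Writable key, Writable value,
            OutputCollector<NullWritable, NullWritable> output, Reporter reporter) {
        reporter.incrCounter(Records.PAGES, 1L);   // one increment per record read
    }
}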

Example 14 with Counters

Use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

The class Aquaint2ForwardIndexBuilder, method runTool.

public int runTool(Configuration config, String collectionPath, String outputPath, String indexFile, String mappingFile) throws Exception {
    //sLogger.error ("getConf(): " + getConf() + ", DemoCountAquaint2Documents.class: " + DemoCountAquaint2Documents.class);
    JobConf conf = new JobConf(config, DemoCountAquaint2Documents.class);
    FileSystem fs = FileSystem.get(config);
    sLogger.info("Tool name: BuildAquaint2ForwardIndex");
    sLogger.info(" - collection path: " + collectionPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - index file: " + indexFile);
    sLogger.info(" - mapping file: " + mappingFile);
    conf.setJobName("BuildAquaint2ForwardIndex");
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    conf.setNumReduceTasks(1);
    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("DocnoMappingFile", mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }
    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(Aquaint2DocumentInputFormatOld.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();
    String inputFile = outputPath + "/" + "part-00000";
    sLogger.info("Writing " + numDocs + " doc offseta to " + indexFile);
    LineReader reader = new LineReader(fs.open(new Path(inputFile)));
    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    writer.writeUTF("edu.umd.cloud9.collection.aquaint2.Aquaint2ForwardIndex");
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);
        // sLogger.info(arr[0] + " " + offset + " " + len);
        writer.writeLong(offset);
        writer.writeInt(len);
        cnt++;
        if (cnt % 100000 == 0) {
            sLogger.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    sLogger.info(cnt + " docs total. Done!");
    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) URI(java.net.URI) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
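
The index file format here mirrors the write calls above: a UTF string with the forward-index class name, a UTF string with the collection path, an int document count, then a (long offset, int length) pair per document. A read-back sketch, reusing the fs and indexFile variables (names are illustrative):

// Sketch: reads the file produced by runTool above.
FSDataInputStream in = fs.open(new Path(indexFile));
String indexClass = in.readUTF();   // "edu.umd.cloud9.collection.aquaint2.Aquaint2ForwardIndex"
String collection = in.readUTF();
int numDocs = in.readInt();
long[] offsets = new long[numDocs];
int[] lengths = new int[numDocs];
for (int i = 0; i < numDocs; i++) {
    offsets[i] = in.readLong();
    lengths[i] = in.readInt();
}
in.close();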

Example 15 with Counters

Use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

The class WikipediaForwardIndexBuilder, method run.

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg().withDescription("two-letter language code").create(LANGUAGE_OPTION));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }
    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);
    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);
    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }
    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }
    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);
    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);
    if (language != null) {
        conf.set("wiki.language", language);
    }
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    // Clean up.
    fs.delete(new Path(tmpPath), true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) Text(org.apache.hadoop.io.Text) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)

Aggregations

Counters (org.apache.hadoop.mapred.Counters): 25 usages
JobConf (org.apache.hadoop.mapred.JobConf): 14 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 14 usages
Path (org.apache.hadoop.fs.Path): 13 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 11 usages
Counter (org.apache.hadoop.mapred.Counters.Counter): 6 usages
JobClient (org.apache.hadoop.mapred.JobClient): 6 usages
IOException (java.io.IOException): 5 usages
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 5 usages
Text (org.apache.hadoop.io.Text): 5 usages
ArrayList (java.util.ArrayList): 4 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 4 usages
LineReader (org.apache.hadoop.util.LineReader): 4 usages
BufferedReader (java.io.BufferedReader): 3 usages
DataOutputStream (java.io.DataOutputStream): 3 usages
InputStream (java.io.InputStream): 3 usages
InputStreamReader (java.io.InputStreamReader): 3 usages
CommandLine (org.apache.commons.cli.CommandLine): 3 usages
CommandLineParser (org.apache.commons.cli.CommandLineParser): 3 usages
GnuParser (org.apache.commons.cli.GnuParser): 3 usages