Search in sources:

Example 51 with Counter

Use of org.apache.hadoop.mapreduce.Counter in project druid by druid-io.

The class HadoopDruidIndexerMapper, method handleParseException:

private void handleParseException(ParseException pe, Context context) {
    // Every parse failure increments the overall invalid-row counter.
    context.getCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER).increment(1);
    Counter unparseableCounter = context.getCounter(HadoopDruidIndexerConfig.IndexJobCounters.ROWS_UNPARSEABLE_COUNTER);
    Counter processedWithErrorsCounter = context.getCounter(HadoopDruidIndexerConfig.IndexJobCounters.ROWS_PROCESSED_WITH_ERRORS_COUNTER);
    // Rows that parsed partially are tracked separately from rows that
    // could not be parsed at all.
    if (pe.isFromPartiallyValidRow()) {
        processedWithErrorsCounter.increment(1);
    } else {
        unparseableCounter.increment(1);
    }
    if (config.isLogParseExceptions()) {
        log.error(pe, "Encountered parse exception: ");
    }
    // Counter values are readable within the task, so the running totals
    // can enforce the configured parse-exception budget.
    long rowsUnparseable = unparseableCounter.getValue();
    long rowsProcessedWithError = processedWithErrorsCounter.getValue();
    if (rowsUnparseable + rowsProcessedWithError > config.getMaxParseExceptions()) {
        log.error("Max parse exceptions exceeded, terminating task...");
        throw new RuntimeException("Max parse exceptions exceeded, terminating task...", pe);
    }
}
Also used: Counter (org.apache.hadoop.mapreduce.Counter)
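
For reference, the pattern Example 51 relies on is Hadoop's enum-backed counter API: context.getCounter resolves an enum constant to a named counter (grouped under the enum's class name), and increments are aggregated across all tasks of the job. A minimal self-contained sketch of the same idiom; CountingMapper, ParseCounters, and parse are hypothetical names, not Druid code:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;

public class CountingMapper extends Mapper<LongWritable, Text, NullWritable, NullWritable> {

    // Each enum constant becomes a named counter; the group is the enum's
    // fully qualified class name.
    public enum ParseCounters {
        INVALID_ROW, ROWS_UNPARSEABLE, ROWS_PROCESSED_WITH_ERRORS
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        try {
            parse(value.toString());
        } catch (IllegalArgumentException e) {
            // getCounter(enum) returns the task-local Counter; increments
            // are rolled up into the job-wide total.
            Counter invalid = context.getCounter(ParseCounters.INVALID_ROW);
            invalid.increment(1);
        }
    }

    private void parse(String line) {
        // Placeholder for row parsing; throws on malformed input.
        if (line.isEmpty()) {
            throw new IllegalArgumentException("empty row");
        }
    }
}

Note that getValue called inside a task reflects only that task's own increments, which is how the Druid mapper above can enforce its parse-exception budget while the task is still running.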

Example 52 with Counter

Use of org.apache.hadoop.mapreduce.Counter in project nutch by apache.

The class DeduplicationJob, method run:

@Override
public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
        return 1;
    }
    String group = "none";
    Path crawlDb = new Path(args[0]);
    String compareOrder = "score,fetchTime,urlLength";
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-group"))
            group = args[++i];
        if (args[i].equals("-compareOrder")) {
            compareOrder = args[++i];
            if (compareOrder.indexOf("score") == -1 || compareOrder.indexOf("fetchTime") == -1 || compareOrder.indexOf("urlLength") == -1) {
                System.err.println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
                return 1;
            }
        }
    }
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeduplicationJob: starting at " + sdf.format(start));
    Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    Job job = NutchJob.getInstance(getConf());
    Configuration conf = job.getConfiguration();
    job.setJobName("Deduplication on " + crawlDb);
    conf.set(DEDUPLICATION_GROUP_MODE, group);
    conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
    job.setJarByClass(DeduplicationJob.class);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);
    FileSystem fs = tempDir.getFileSystem(getConf());
    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = NutchJob.getJobFailureLogMessage("DeduplicationJob", job);
            LOG.error(message);
            fs.delete(tempDir, true);
            throw new RuntimeException(message);
        }
        // Report how many documents the reducer flagged as duplicates.
        CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
        if (g != null) {
            Counter counter = g.findCounter("Documents marked as duplicate");
            long dups = counter.getValue();
            LOG.info("Deduplication: " + dups + " documents marked as duplicates");
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        return -1;
    }
    // merge with existing crawl db
    LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
    Job mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);
    mergeJob.setJarByClass(DeduplicationJob.class);
    fs = crawlDb.getFileSystem(getConf());
    // Output path of the merge job, cleaned up if the merge fails.
    Path outPath = FileOutputFormat.getOutputPath(mergeJob);
    Path lock = CrawlDb.lock(getConf(), crawlDb, false);
    try {
        boolean success = mergeJob.waitForCompletion(true);
        if (!success) {
            String message = NutchJob.getJobFailureLogMessage("DeduplicationMergeJob", mergeJob);
            LOG.error(message);
            fs.delete(tempDir, true);
            NutchJob.cleanupAfterFailure(outPath, lock, fs);
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        NutchJob.cleanupAfterFailure(outPath, lock, fs);
        return -1;
    }
    CrawlDb.install(mergeJob, crawlDb);
    // clean up
    fs.delete(tempDir, true);
    long end = System.currentTimeMillis();
    LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), NutchConfiguration (org.apache.nutch.util.NutchConfiguration), CounterGroup (org.apache.hadoop.mapreduce.CounterGroup), IOException (java.io.IOException), Counter (org.apache.hadoop.mapreduce.Counter), Random (java.util.Random), FileSystem (org.apache.hadoop.fs.FileSystem), NutchJob (org.apache.nutch.util.NutchJob), Job (org.apache.hadoop.mapreduce.Job), SimpleDateFormat (java.text.SimpleDateFormat)
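
Example 52 also shows the driver-side half of the Counter API: once waitForCompletion returns, job.getCounters() exposes both the framework's built-in counters and any custom groups written by the tasks. A minimal sketch of that pattern, assuming an already configured Job; CounterReport and report are hypothetical names:

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

public class CounterReport {

    // Runs the given job and prints its counters; input/output paths and
    // mapper/reducer classes are assumed to be set by the caller.
    static void report(Job job) throws Exception {
        if (!job.waitForCompletion(true)) {
            throw new RuntimeException("Job failed: " + job.getJobName());
        }
        // Built-in framework counters live in the TaskCounter group.
        long mapInputRecords = job.getCounters()
                .findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
        System.out.println("map input records: " + mapInputRecords);
        // Custom counters are looked up by group and name, exactly as the
        // Nutch job does with its "DeduplicationJobStatus" group.
        CounterGroup group = job.getCounters().getGroup("DeduplicationJobStatus");
        for (Counter c : group) {
            System.out.println(c.getDisplayName() + " = " + c.getValue());
        }
    }
}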

Aggregations

Counter (org.apache.hadoop.mapreduce.Counter): 52 usages
Configuration (org.apache.hadoop.conf.Configuration): 16 usages
CounterGroup (org.apache.hadoop.mapreduce.CounterGroup): 13 usages
Job (org.apache.hadoop.mapreduce.Job): 13 usages
Counters (org.apache.hadoop.mapreduce.Counters): 11 usages
IOException (java.io.IOException): 10 usages
Path (org.apache.hadoop.fs.Path): 8 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 5 usages
Map (java.util.Map): 4 usages
Test (org.junit.Test): 4 usages
SimpleDateFormat (java.text.SimpleDateFormat): 3 usages
ArrayList (java.util.ArrayList): 3 usages
TaskCounter (org.apache.hadoop.mapreduce.TaskCounter): 3 usages
FileNotFoundException (java.io.FileNotFoundException): 2 usages
ExecutionException (java.util.concurrent.ExecutionException): 2 usages
RejectedExecutionException (java.util.concurrent.RejectedExecutionException): 2 usages
TimeoutException (java.util.concurrent.TimeoutException): 2 usages
Schema (org.apache.avro.Schema): 2 usages
CustomOutputCommitter (org.apache.hadoop.CustomOutputCommitter): 2 usages
BytesWritable (org.apache.hadoop.io.BytesWritable): 2 usages