
Example 16 with CounterGroup

Use of org.apache.hadoop.mapreduce.CounterGroup in project nutch by apache.

Source: class DeduplicationJob, method run.

@Override
public int run(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
        return 1;
    }
    String group = "none";
    Path crawlDb = new Path(args[0]);
    String compareOrder = "score,fetchTime,urlLength";
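    // parse optional command-line arguments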
    for (int i = 1; i < args.length; i++) {
        if (args[i].equals("-group"))
            group = args[++i];
        if (args[i].equals("-compareOrder")) {
            compareOrder = args[++i];
            if (compareOrder.indexOf("score") == -1 || compareOrder.indexOf("fetchTime") == -1 || compareOrder.indexOf("urlLength") == -1) {
                System.err.println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
                return 1;
            }
        }
    }
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DeduplicationJob: starting at " + sdf.format(start));
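    // the deduplication job writes its output into a temporary directory below the crawldb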
    Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    Job job = NutchJob.getInstance(getConf());
    Configuration conf = job.getConfiguration();
    job.setJobName("Deduplication on " + crawlDb);
    conf.set(DEDUPLICATION_GROUP_MODE, group);
    conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
    job.setJarByClass(DeduplicationJob.class);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(CrawlDatum.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(DedupReducer.class);
    FileSystem fs = tempDir.getFileSystem(getConf());
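    // run the job that marks duplicates and report how many documents were affected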
    try {
        boolean success = job.waitForCompletion(true);
        if (!success) {
            String message = NutchJob.getJobFailureLogMessage("Crawl", job);
            LOG.error(message);
            fs.delete(tempDir, true);
            throw new RuntimeException(message);
        }
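        // the DeduplicationJobStatus counter group holds the number of documents marked as duplicate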
        CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
        if (g != null) {
            Counter counter = g.findCounter("Documents marked as duplicate");
            long dups = counter.getValue();
            LOG.info("Deduplication: " + (int) dups + " documents marked as duplicates");
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        return -1;
    }
    // merge with existing crawl db
    LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
    Job mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(StatusUpdateReducer.class);
    mergeJob.setJarByClass(DeduplicationJob.class);
    fs = crawlDb.getFileSystem(getConf());
    Path outPath = FileOutputFormat.getOutputPath(job);
    Path lock = CrawlDb.lock(getConf(), crawlDb, false);
    try {
        boolean success = mergeJob.waitForCompletion(true);
        if (!success) {
            String message = NutchJob.getJobFailureLogMessage("Crawl", mergeJob);
            LOG.error(message);
            fs.delete(tempDir, true);
            NutchJob.cleanupAfterFailure(outPath, lock, fs);
            throw new RuntimeException(message);
        }
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
        fs.delete(tempDir, true);
        NutchJob.cleanupAfterFailure(outPath, lock, fs);
        return -1;
    }
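    // install the updated crawl db produced by the merge job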
    CrawlDb.install(mergeJob, crawlDb);
    // clean up
    fs.delete(tempDir, true);
    long end = System.currentTimeMillis();
    LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), NutchConfiguration (org.apache.nutch.util.NutchConfiguration), CounterGroup (org.apache.hadoop.mapreduce.CounterGroup), IOException (java.io.IOException), Counter (org.apache.hadoop.mapreduce.Counter), Random (java.util.Random), FileSystem (org.apache.hadoop.fs.FileSystem), NutchJob (org.apache.nutch.util.NutchJob), Job (org.apache.hadoop.mapreduce.Job), SimpleDateFormat (java.text.SimpleDateFormat)
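
The counter group read in run() via getGroup("DeduplicationJobStatus") is filled in on the reducer side. Below is a minimal, illustrative sketch of how such a counter is typically incremented; it is not the actual DedupReducer from Nutch, but it assumes the same map output types (BytesWritable key, CrawlDatum value) that the job above configures.

// Illustrative sketch only (not org.apache.nutch.crawl.DeduplicationJob.DedupReducer):
// shows how the "Documents marked as duplicate" counter read in run() is typically incremented.
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.nutch.crawl.CrawlDatum;

public class DedupCounterSketch extends Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {

    @Override
    protected void reduce(BytesWritable key, Iterable<CrawlDatum> values, Context context)
            throws IOException, InterruptedException {
        boolean first = true;
        for (CrawlDatum datum : values) {
            if (first) {
                // keep the first entry for this signature as the canonical document
                first = false;
            } else {
                // every further entry with the same key is a duplicate; increment the
                // counter that run() later reads via getGroup("DeduplicationJobStatus")
                context.getCounter("DeduplicationJobStatus", "Documents marked as duplicate").increment(1);
            }
        }
        // writing the kept/duplicate entries back out is omitted in this sketch
    }
}

The entries marked as duplicates are later folded into the crawl db by the merge job shown above; this sketch only covers the counter.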

Aggregations

CounterGroup (org.apache.hadoop.mapreduce.CounterGroup) 16
Counter (org.apache.hadoop.mapreduce.Counter) 13
Configuration (org.apache.hadoop.conf.Configuration) 3
FileSystem (org.apache.hadoop.fs.FileSystem) 3
Path (org.apache.hadoop.fs.Path) 3
Counters (org.apache.hadoop.mapreduce.Counters) 3
Job (org.apache.hadoop.mapreduce.Job) 3
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper) 2
BufferedReader (java.io.BufferedReader) 2
InputStreamReader (java.io.InputStreamReader) 2
Map (java.util.Map) 2
FileStatus (org.apache.hadoop.fs.FileStatus) 2
JSONArray (org.codehaus.jettison.json.JSONArray) 2
JSONObject (org.codehaus.jettison.json.JSONObject) 2
JsonNode (com.fasterxml.jackson.databind.JsonNode) 1
ArrayNode (com.fasterxml.jackson.databind.node.ArrayNode) 1
ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode) 1
VisibleForTesting (com.google.common.annotations.VisibleForTesting) 1
ImmutableMap (com.google.common.collect.ImmutableMap) 1
IOException (java.io.IOException) 1
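
As the aggregation counts suggest, CounterGroup is almost always used together with Counter. Below is a minimal, generic sketch of walking every group and counter of a completed job; the dumpCounters helper and its use of System.out are illustrative assumptions, while the Hadoop classes and methods come from the standard mapreduce API.

// Generic sketch: dump every counter of a completed MapReduce job, grouped by CounterGroup.
// Assumes "job" is a finished org.apache.hadoop.mapreduce.Job instance.
import java.io.IOException;

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;

public class CounterDumpSketch {

    public static void dumpCounters(Job job) throws IOException {
        // Counters is iterable over its CounterGroups, and each group over its Counters
        Counters counters = job.getCounters();
        for (CounterGroup group : counters) {
            System.out.println(group.getDisplayName() + ":");
            for (Counter counter : group) {
                System.out.println("  " + counter.getDisplayName() + " = " + counter.getValue());
            }
        }
    }
}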