Search in sources:

Example 1 with NutchWritable

Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache: class TestIndexerMapReduce, method runIndexer().

/**
 * Run {@link IndexerMapReduce#reduce(...)} to get an "indexed"
 * {@link NutchDocument} by passing objects from the segment and CrawlDb to
 * the indexer.
 *
 * @param dbDatum
 *          crawl datum from CrawlDb
 * @param fetchDatum
 *          crawl datum (fetch status) from segment
 * @param parseText
 *          plain text from parsed document
 * @param parseData
 *          parse data
 * @param content
 *          protocol content (optional, only needed when binary content is indexed)
 * @return "indexed" document
 */
public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum, ParseText parseText, ParseData parseData, Content content) {
    List<NutchWritable> values = new ArrayList<NutchWritable>();
    values.add(new NutchWritable(dbDatum));
    values.add(new NutchWritable(fetchDatum));
    values.add(new NutchWritable(parseText));
    values.add(new NutchWritable(parseData));
    values.add(new NutchWritable(content));
    reduceDriver = ReduceDriver.newReduceDriver(reducer);
    reduceDriver.getConfiguration().addResource(configuration);
    reduceDriver.withInput(testUrlText, values);
    List<Pair<Text, NutchIndexAction>> reduceResult;
    NutchDocument doc = null;
    try {
        reduceResult = reduceDriver.run();
        for (Pair<Text, NutchIndexAction> p : reduceResult) {
            if (p.getSecond().action != NutchIndexAction.DELETE) {
                doc = p.getSecond().doc;
            }
        }
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
    }
    return doc;
}
Also used : ArrayList(java.util.ArrayList) NutchWritable(org.apache.nutch.crawl.NutchWritable) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) IOException(java.io.IOException) Pair(org.apache.hadoop.mrunit.types.Pair)
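
For context, a minimal sketch of the wrap/unwrap pattern this test relies on: NutchWritable lets a reducer receive CrawlDatum, ParseText, ParseData and Content instances under a single value type, and the wrapped object is recovered with get(). The class name below is hypothetical and for illustration only; the NutchWritable constructor and get() call are the ones used in the examples on this page.

import org.apache.hadoop.io.Writable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.parse.ParseText;

// Hypothetical illustration class, not part of Nutch
public class NutchWritableUnwrapSketch {

    public static void main(String[] args) {
        // Wrap two different record types under the common value type
        NutchWritable wrappedDatum = new NutchWritable(new CrawlDatum());
        NutchWritable wrappedText = new NutchWritable(new ParseText("plain text"));

        for (NutchWritable value : new NutchWritable[] { wrappedDatum, wrappedText }) {
            // Recover the wrapped instance and dispatch on its concrete type
            Writable w = value.get();
            if (w instanceof CrawlDatum) {
                System.out.println("CrawlDatum: " + w);
            } else if (w instanceof ParseText) {
                System.out.println("ParseText: " + w);
            }
        }
    }
}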

Example 2 with NutchWritable

Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache: class FetcherThread, method output().

private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();
        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
            datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
            }
        }
        /*
         * Note: Fetcher will only follow meta-redirects coming from the
         * original URL.
         */
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
            if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
                try {
                    parseResult = this.parseUtil.parse(content);
                } catch (Exception e) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
                }
            }
            if (parseResult == null) {
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
                datum.setSignature(signature);
            }
        }
        /*
         * Store the status code in the content metadata so we can read this value
         * during parsing (as a separate job) and decide whether to parse or not.
         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }
    try {
        context.write(key, new NutchWritable(datum));
        if (content != null && storingContent)
            context.write(key, new NutchWritable(content));
        if (parseResult != null) {
            for (Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                ParseData parseData = parse.getData();
                if (!parseStatus.isSuccess()) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
                    parse = parseStatus.getEmptyParse(conf);
                }
                // Calculate page signature. For non-parsing fetchers this will
                // be done in ParseSegment
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
                // Ensure segment name and score are in parseData metadata
                parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                // Pass fetch time to content meta
                parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                if (url.equals(key))
                    datum.setSignature(signature);
                try {
                    scfilters.passScoreAfterParsing(url, content, parse);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
                    }
                }
                String origin = null;
                // collect outlinks for subsequent db update
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    URL originURL = new URL(url.toString());
                    // based on domain?
                    if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                        origin = URLUtil.getDomainName(originURL).toLowerCase();
                    } else {
                        // use host
                        origin = originURL.getHost().toLowerCase();
                    }
                }
                // used by fetchNode
                if (fetchNode != null) {
                    fetchNode.setOutlinks(links);
                    fetchNode.setTitle(parseData.getTitle());
                    FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
                }
                int validCount = 0;
                // Process all outlinks, normalize, filter and deduplicate
                List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
                HashSet<String> outlinks = new HashSet<>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                    String toUrl = links[i].getToUrl();
                    toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
                    if (toUrl == null) {
                        continue;
                    }
                    validCount++;
                    links[i].setUrl(toUrl);
                    outlinkList.add(links[i]);
                    outlinks.add(toUrl);
                }
                // Publish fetch report event
                if (activatePublisher) {
                    FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
                    reportEvent.addOutlinksToEventData(outlinkList);
                    reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
                    reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
                    publisher.publish(reportEvent, conf);
                }
                // Only process depth N outlinks
                if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                    FetchItem ft = FetchItem.create(url, null, queueMode);
                    FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
                    queue.alreadyFetched.add(url.toString().hashCode());
                    context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
                    // Counter to limit num outlinks to follow per page
                    int outlinkCounter = 0;
                    // Calculate a variable number of outlinks by depth using the
                    // divisor: outlinks = Math.floor(divisor / (depth + 1) * num.links)
                    int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
                    String followUrl;
                    // Walk over the outlinks and add as new FetchItem to the queues
                    Iterator<String> iter = outlinks.iterator();
                    while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                        followUrl = iter.next();
                        // Check whether we'll follow external outlinks
                        if (outlinksIgnoreExternal) {
                            if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                continue;
                            }
                        }
                        // Already followed?
                        int urlHashCode = followUrl.hashCode();
                        if (queue.alreadyFetched.contains(urlHashCode)) {
                            continue;
                        }
                        queue.alreadyFetched.add(urlHashCode);
                        // Create new FetchItem with depth incremented
                        FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                        context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
                        ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                        outlinkCounter++;
                    }
                }
                // Overwrite the outlinks in ParseData with the normalized and
                // filtered set
                parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
                context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            }
        }
    } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    }
    // return the parse status if it exists
    if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
            context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
            return p.getData().getStatus();
        }
    }
    return null;
}
Also used : Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) ParseText(org.apache.nutch.parse.ParseText) ParseStatus(org.apache.nutch.parse.ParseStatus) HashSet(java.util.HashSet) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) NutchWritable(org.apache.nutch.crawl.NutchWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) IOException(java.io.IOException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)
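
The depth-based outlink budget above, maxOutlinksByDepth = (int) Math.floor(divisor / (depth + 1) * num.links), shrinks as the outlink depth grows. The standalone sketch below illustrates the expression; the configuration values and class name are hypothetical, and the sketch assumes int-typed values, mirroring the (int) cast in the original expression, so the divisor term is evaluated with integer division.

// Hypothetical values, chosen only to illustrate the expression used in output()
public class OutlinkBudgetSketch {

    public static void main(String[] args) {
        int outlinksDepthDivisor = 2;    // assumed config value, not a Nutch default
        int maxOutlinkDepthNumLinks = 4; // assumed config value, not a Nutch default
        for (int outlinkDepth = 0; outlinkDepth <= 2; outlinkDepth++) {
            // Same expression as in FetcherThread.output(); note the integer division
            int maxOutlinksByDepth = (int) Math.floor(
                outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
            System.out.println("depth " + outlinkDepth + " -> budget " + maxOutlinksByDepth);
        }
        // Prints: depth 0 -> budget 8, depth 1 -> budget 4, depth 2 -> budget 0
    }
}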

Example 3 with NutchWritable

Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache: class UpdateHostDbMapper, method map().

/**
 * Mapper ingesting records from the HostDB, CrawlDB and plaintext host
 * scores file. Statistics and scores are passed on.
 *
 * @param key record key: a URL (CrawlDb) or a host name (HostDB / scores file)
 * @param value a {@link CrawlDatum}, a {@link HostDatum}, or a plain-text score
 * @param context mapper context used to emit (host, NutchWritable) pairs
 */
public void map(Text key, Writable value, Context context) throws IOException, InterruptedException {
    // Get the key!
    String keyStr = key.toString();
    // Check if we process records from the CrawlDB
    if (key instanceof Text && value instanceof CrawlDatum) {
        // Get the normalized and filtered host of this URL
        buffer = filterNormalize(URLUtil.getHost(keyStr));
        // Filtered out?
        if (buffer == null) {
            context.getCounter("UpdateHostDb", "filtered_records").increment(1);
            LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
            return;
        }
        // Set the host of this URL
        host.set(buffer);
        crawlDatum = (CrawlDatum) value;
        hostDatum = new HostDatum();
        // Do not resolve homepages when the root URL is unfetched
        if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
            // Get the protocol
            String protocol = URLUtil.getProtocol(keyStr);
            // Get the proposed homepage URL
            String homepage = protocol + "://" + buffer + "/";
            // Check if the current key equals the proposed homepage URL
            if (keyStr.equals(homepage)) {
                // Check if this is a redirect to the real home page
                if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
                    // Obtain the repr url for this redirect via protocolstatus from the metadata
                    ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
                    // Get the protocol status' arguments
                    args = z.getArgs();
                    // ..and the possible redirect URL
                    reprUrl = args[0];
                    // Did we get a redirect target?
                    if (reprUrl != null) {
                        LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]);
                        context.write(host, new NutchWritable(hostDatum));
                        hostDatum.setHomepageUrl(reprUrl);
                    } else {
                        LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0] + " but has been filtered out");
                    }
                } else {
                    hostDatum.setHomepageUrl(homepage);
                    context.write(host, new NutchWritable(hostDatum));
                    LOG.info("UpdateHostDb: homepage: " + homepage);
                }
            }
        }
        // Always emit crawl datum
        context.write(host, new NutchWritable(crawlDatum));
    }
    // Check if we got a record from the hostdb
    if (key instanceof Text && value instanceof HostDatum) {
        buffer = filterNormalize(keyStr);
        // Filtered out?
        if (buffer == null) {
            context.getCounter("UpdateHostDb", "filtered_records").increment(1);
            LOG.info("UpdateHostDb: " + key.toString() + " hostdatum has been filtered");
            return;
        }
        // Get a HostDatum
        hostDatum = (HostDatum) value;
        key.set(buffer);
        // Reset db_* statistics if we also read the CrawlDb;
        // we're aggregating them from the CrawlDb anyway
        if (readingCrawlDb) {
            hostDatum.resetStatistics();
        }
        context.write(key, new NutchWritable(hostDatum));
    }
    // Check if we got a record with host scores
    if (key instanceof Text && value instanceof Text) {
        buffer = filterNormalize(keyStr);
        // Filtered out?
        if (buffer == null) {
            context.getCounter("UpdateHostDb", "filtered_records").increment(1);
            LOG.info("UpdateHostDb: " + key.toString() + " score has been filtered");
            return;
        }
        key.set(buffer);
        context.write(key, new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
    }
}
Also used : FloatWritable(org.apache.hadoop.io.FloatWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) NutchWritable(org.apache.nutch.crawl.NutchWritable) Text(org.apache.hadoop.io.Text) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
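
A minimal sketch of how the proposed homepage URL is derived from a CrawlDb key, using the same URLUtil calls as the mapper above. The example URL and class name are hypothetical.

import org.apache.nutch.util.URLUtil;

// Hypothetical illustration class, not part of Nutch
public class HomepageUrlSketch {

    public static void main(String[] args) throws Exception {
        String keyStr = "https://example.org/some/page.html"; // hypothetical CrawlDb key
        String host = URLUtil.getHost(keyStr);          // "example.org"
        String protocol = URLUtil.getProtocol(keyStr);  // "https"
        String homepage = protocol + "://" + host + "/";
        // The mapper only treats the key as a homepage when keyStr.equals(homepage)
        System.out.println(homepage); // https://example.org/
    }
}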

Example 4 with NutchWritable

Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache: class FetcherOutputFormat, method getRecordWriter().

@Override
public RecordWriter<Text, NutchWritable> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String name = getUniqueFile(context, "part", "");
    Path dir = FileOutputFormat.getOutputPath(context);
    FileSystem fs = dir.getFileSystem(context.getConfiguration());
    Path out = FileOutputFormat.getOutputPath(context);
    final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
    final Path content = new Path(new Path(out, Content.DIR_NAME), name);
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
    Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
    org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
    final MapFile.Writer fetchOut = new MapFile.Writer(conf, fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
    return new RecordWriter<Text, NutchWritable>() {

        private MapFile.Writer contentOut;

        private RecordWriter<Text, Parse> parseOut;

        {
            if (Fetcher.isStoringContent(conf)) {
                Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
                org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
                org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
                org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
                contentOut = new MapFile.Writer(conf, content, cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
            }
            if (Fetcher.isParsing(conf)) {
                parseOut = new ParseOutputFormat().getRecordWriter(context);
            }
        }

        public void write(Text key, NutchWritable value) throws IOException, InterruptedException {
            Writable w = value.get();
            if (w instanceof CrawlDatum)
                fetchOut.append(key, w);
            else if (w instanceof Content && contentOut != null)
                contentOut.append(key, w);
            else if (w instanceof Parse && parseOut != null)
                parseOut.write(key, (Parse) w);
        }

        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            fetchOut.close();
            if (contentOut != null) {
                contentOut.close();
            }
            if (parseOut != null) {
                parseOut.close(context);
            }
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Parse(org.apache.nutch.parse.Parse) MapFile(org.apache.hadoop.io.MapFile) NutchWritable(org.apache.nutch.crawl.NutchWritable) Writable(org.apache.hadoop.io.Writable) NutchWritable(org.apache.nutch.crawl.NutchWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) Content(org.apache.nutch.protocol.Content) FileSystem(org.apache.hadoop.fs.FileSystem) ParseOutputFormat(org.apache.nutch.parse.ParseOutputFormat) Option(org.apache.hadoop.io.MapFile.Writer.Option) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter)
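
The RecordWriter above demultiplexes NutchWritable values into per-type segment subdirectories. A small sketch of the resulting path layout, reusing the same directory constants; the segment path and part name below are hypothetical, and the directory names in the comment are the usual values of those constants rather than something verified here.

import org.apache.hadoop.fs.Path;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;

// Hypothetical illustration class, not part of Nutch
public class SegmentLayoutSketch {

    public static void main(String[] args) {
        Path out = new Path("crawl/segments/20240101000000"); // hypothetical segment directory
        String name = "part-r-00000";                         // hypothetical part file name
        Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
        Path content = new Path(new Path(out, Content.DIR_NAME), name);
        // Typically .../crawl_fetch/part-r-00000 and .../content/part-r-00000
        System.out.println(fetch);
        System.out.println(content);
    }
}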

Example 5 with NutchWritable

Use of org.apache.nutch.crawl.NutchWritable in project nutch by apache: class UpdateHostDbReducer, method reduce().

/**
 * Aggregates CrawlDatum statistics, HostDatum fields and host scores per host
 * and writes the updated {@link HostDatum} records of the HostDB.
 */
public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException {
    Map<String, Map<String, Integer>> stringCounts = new HashMap<>();
    Map<String, Float> maximums = new HashMap<>();
    // used to calc averages
    Map<String, Float> sums = new HashMap<>();
    // used to calc averages
    Map<String, Integer> counts = new HashMap<>();
    Map<String, Float> minimums = new HashMap<>();
    Map<String, TDigest> tdigests = new HashMap<String, TDigest>();
    HostDatum hostDatum = new HostDatum();
    float score = 0;
    if (stringFields != null) {
        for (int i = 0; i < stringFields.length; i++) {
            stringCounts.put(stringFields[i], new HashMap<>());
        }
    }
    // Loop over all values; hostDatum stays empty if this is a new host for the HostDB
    for (Writable value : values) {
        // Unwrap the NutchWritable to get the actual record before the type checks below
        value = ((NutchWritable) value).get();
        // Count crawl datum status's and collect metadata from fields
        if (value instanceof CrawlDatum) {
            CrawlDatum buffer = (CrawlDatum) value;
            // Set the correct status field
            switch(buffer.getStatus()) {
                case CrawlDatum.STATUS_DB_UNFETCHED:
                    hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
                    break;
                case CrawlDatum.STATUS_DB_FETCHED:
                    hostDatum.setFetched(hostDatum.getFetched() + 1);
                    break;
                case CrawlDatum.STATUS_DB_GONE:
                    hostDatum.setGone(hostDatum.getGone() + 1);
                    break;
                case CrawlDatum.STATUS_DB_REDIR_TEMP:
                    hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
                    break;
                case CrawlDatum.STATUS_DB_REDIR_PERM:
                    hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
                    break;
                case CrawlDatum.STATUS_DB_NOTMODIFIED:
                    hostDatum.setNotModified(hostDatum.getNotModified() + 1);
                    break;
            }
            // Record connection failures
            if (buffer.getRetriesSinceFetch() != 0) {
                hostDatum.incConnectionFailures();
            }
            // Only gather metadata statistics for proper fetched pages
            if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
                // Deal with the string fields
                if (stringFields != null) {
                    for (int i = 0; i < stringFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
                            // Get it!
                            String metadataValue = null;
                            try {
                                metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
                            } catch (Exception e) {
                                LOG.error("Metadata field " + stringFields[i] + " is probably not a numeric value");
                            }
                            // Does the value exist?
                            if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                                // Yes, increment it
                                stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
                            } else {
                                // Create it!
                                stringCounts.get(stringFields[i]).put(metadataValue, 1);
                            }
                        }
                    }
                }
                // Deal with the numeric fields
                if (numericFields != null) {
                    for (int i = 0; i < numericFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
                            try {
                                // Get it!
                                Float metadataValue = Float.parseFloat(buffer.getMetaData().get(numericFieldWritables[i]).toString());
                                // Does a t-digest for this field already exist?
                                if (tdigests.containsKey(numericFields[i])) {
                                    tdigests.get(numericFields[i]).add(metadataValue);
                                } else {
                                    // Create it!
                                    TDigest tdigest = TDigest.createDigest(100);
                                    tdigest.add((double) metadataValue);
                                    tdigests.put(numericFields[i], tdigest);
                                }
                                // Does the minimum value exist?
                                if (minimums.containsKey(numericFields[i])) {
                                    // Write if this is lower than existing value
                                    if (metadataValue < minimums.get(numericFields[i])) {
                                        minimums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    minimums.put(numericFields[i], metadataValue);
                                }
                                // Does the maximum value exist?
                                if (maximums.containsKey(numericFields[i])) {
                                    // Write if this is higher than the existing value
                                    if (metadataValue > maximums.get(numericFields[i])) {
                                        maximums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    maximums.put(numericFields[i], metadataValue);
                                }
                                // Sum it up!
                                if (sums.containsKey(numericFields[i])) {
                                    // Increment
                                    sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
                                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
                                } else {
                                    // Create it!
                                    sums.put(numericFields[i], metadataValue);
                                    counts.put(numericFields[i], 1);
                                }
                            } catch (Exception e) {
                                LOG.error(e.getMessage() + " when processing values for " + key.toString());
                            }
                        }
                    }
                }
            }
        }
        // Check for a HostDatum record from the existing HostDB
        if (value instanceof HostDatum) {
            HostDatum buffer = (HostDatum) value;
            // Check homepage URL
            if (buffer.hasHomepageUrl()) {
                hostDatum.setHomepageUrl(buffer.getHomepageUrl());
            }
            // Check lastCheck timestamp
            if (!buffer.isEmpty()) {
                hostDatum.setLastCheck(buffer.getLastCheck());
            }
            // Check and set DNS failures
            if (buffer.getDnsFailures() > 0) {
                hostDatum.setDnsFailures(buffer.getDnsFailures());
            }
            // Check and set connection failures
            if (buffer.getConnectionFailures() > 0) {
                hostDatum.setConnectionFailures(buffer.getConnectionFailures());
            }
            // Check metadata
            if (!buffer.getMetaData().isEmpty()) {
                hostDatum.setMetaData(buffer.getMetaData());
            }
            // Check and set score (score from Web Graph has precedence)
            if (buffer.getScore() > 0) {
                hostDatum.setScore(buffer.getScore());
            }
        }
        // Check for the score
        if (value instanceof FloatWritable) {
            FloatWritable buffer = (FloatWritable) value;
            score = buffer.get();
        }
    }
    // Check if score was set from Web Graph
    if (score > 0) {
        hostDatum.setScore(score);
    }
    // Set metadata
    for (Map.Entry<String, Map<String, Integer>> entry : stringCounts.entrySet()) {
        for (Map.Entry<String, Integer> subEntry : entry.getValue().entrySet()) {
            hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
        }
    }
    for (Map.Entry<String, Float> entry : maximums.entrySet()) {
        hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    for (Map.Entry<String, Float> entry : sums.entrySet()) {
        hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
    }
    for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
        // Emit all percentiles
        for (int i = 0; i < percentiles.length; i++) {
            hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float) entry.getValue().quantile(0.5)));
        }
    }
    for (Map.Entry<String, Float> entry : minimums.entrySet()) {
        hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    context.getCounter("UpdateHostDb", "total_hosts").increment(1);
    // See if this record is to be checked
    if (shouldCheck(hostDatum)) {
        // Make an entry
        resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);
        // Add the entry to the queue (blocking)
        try {
            queue.put(resolverThread);
        } catch (InterruptedException e) {
            LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
        }
        // Do not write here; the datum will be written by the resolver thread
        return;
    } else {
        context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
        LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
    }
    // Write the host datum if it wasn't written by the resolver thread
    context.write(key, hostDatum);
}
Also used : HashMap(java.util.HashMap) Writable(org.apache.hadoop.io.Writable) NutchWritable(org.apache.nutch.crawl.NutchWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) IntWritable(org.apache.hadoop.io.IntWritable) IntWritable(org.apache.hadoop.io.IntWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) FloatWritable(org.apache.hadoop.io.FloatWritable) TDigest(com.tdunning.math.stats.TDigest) HashMap(java.util.HashMap) Map(java.util.Map)
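
A standalone sketch of the t-digest calls the reducer uses for per-field percentiles (TDigest.createDigest, add, quantile). The metric values and class name below are hypothetical.

import com.tdunning.math.stats.TDigest;

// Hypothetical illustration class, not part of Nutch
public class TDigestSketch {

    public static void main(String[] args) {
        TDigest digest = TDigest.createDigest(100); // same compression as the reducer
        for (double value : new double[] { 120, 250, 180, 90, 300 }) {
            digest.add(value); // hypothetical per-URL metric values
        }
        // quantile() takes a fraction in [0, 1]: 0.5 is the median, 0.95 is pct95
        System.out.println("median = " + digest.quantile(0.5));
        System.out.println("pct95  = " + digest.quantile(0.95));
    }
}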

Aggregations

Text (org.apache.hadoop.io.Text) 5
NutchWritable (org.apache.nutch.crawl.NutchWritable) 5
CrawlDatum (org.apache.nutch.crawl.CrawlDatum) 4
IOException (java.io.IOException) 3
ArrayList (java.util.ArrayList) 2
FloatWritable (org.apache.hadoop.io.FloatWritable) 2
Writable (org.apache.hadoop.io.Writable) 2
Parse (org.apache.nutch.parse.Parse) 2
ParseText (org.apache.nutch.parse.ParseText) 2
TDigest (com.tdunning.math.stats.TDigest) 1
MalformedURLException (java.net.MalformedURLException) 1
URL (java.net.URL) 1
HashMap (java.util.HashMap) 1
HashSet (java.util.HashSet) 1
Map (java.util.Map) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
FileSystem (org.apache.hadoop.fs.FileSystem) 1
Path (org.apache.hadoop.fs.Path) 1
IntWritable (org.apache.hadoop.io.IntWritable) 1
MapFile (org.apache.hadoop.io.MapFile) 1