Search in sources :

Example 26 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestParseData method testParseData.

/**
 * Verifies that a fully-populated ParseData instance survives a Writable
 * serialization round trip (performed by WritableTestUtils.testWritable).
 */
@Test
public void testParseData() throws Exception {
    Metadata contentMeta = new Metadata();
    contentMeta.add("Language", "en/us");
    contentMeta.add("Charset", "UTF-8");
    Outlink[] links = {
        new Outlink("http://foo.com/", "Foo"),
        new Outlink("http://bar.com/", "Bar")
    };
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, contentMeta);
    // testWritable serializes and deserializes the object, asserting equality.
    WritableTestUtils.testWritable(parseData, null);
}
Also used : Metadata(org.apache.nutch.metadata.Metadata) Test(org.junit.Test)

Example 27 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class SegmentHandler method handle.

/**
 * Serves a previously fetched page back over HTTP: looks up the CrawlDatum
 * and Content for the requested URI in the segment, translates the stored
 * protocol status into an HTTP status code, replays the stored metadata as
 * response headers, and writes the raw content bytes to the response body.
 *
 * Diagnostic information is exposed through custom headers (via addMyHeader)
 * so callers can distinguish "found", "not found" and error outcomes.
 *
 * Fixes relative to the previous version: removed e.printStackTrace() (it
 * duplicated the LOG.warn output on stderr), dropped a redundant toString()
 * on a String, and renamed the local TrCode to camelCase.
 */
@Override
public void handle(Request req, HttpServletResponse res, String target, int dispatch) throws IOException, ServletException {
    try {
        String uri = req.getUri().toString();
        LOG.info("URI: " + uri);
        addMyHeader(res, "URI", uri);
        // uri is already a String; no extra toString() needed.
        Text url = new Text(uri);
        CrawlDatum cd = seg.getCrawlDatum(url);
        if (cd != null) {
            addMyHeader(res, "Res", "found");
            LOG.info("-got " + cd.toString());
            ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
            if (ps != null) {
                // Map the Nutch protocol status to an HTTP status code,
                // defaulting to 200 OK when no mapping exists.
                Integer trCode = protoCodes.get(ps.getCode());
                if (trCode != null) {
                    res.setStatus(trCode.intValue());
                } else {
                    res.setStatus(HttpServletResponse.SC_OK);
                }
                addMyHeader(res, "ProtocolStatus", ps.toString());
            } else {
                res.setStatus(HttpServletResponse.SC_OK);
            }
            Content c = seg.getContent(url);
            if (c == null) {
                // The datum exists but its content is missing; complete the
                // request without a body.
                req.setHandled(true);
                res.addHeader("X-Handled-By", getClass().getSimpleName());
                return;
            }
            byte[] data = c.getContent();
            LOG.debug("-data len=" + data.length);
            Metadata meta = c.getMetadata();
            String[] names = meta.names();
            LOG.debug("- " + names.length + " meta");
            for (int i = 0; i < names.length; i++) {
                // Heuristic: names starting with an upper-case letter are
                // assumed to be standard HTTP headers and replayed verbatim;
                // everything else is emitted through addMyHeader.
                boolean my = true;
                char ch = names[i].charAt(0);
                if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
                    // pretty good chance it's a standard header
                    my = false;
                }
                String[] values = meta.getValues(names[i]);
                for (int k = 0; k < values.length; k++) {
                    if (my) {
                        addMyHeader(res, names[i], values[k]);
                    } else {
                        res.addHeader(names[i], values[k]);
                    }
                }
            }
            req.setHandled(true);
            res.addHeader("X-Handled-By", getClass().getSimpleName());
            res.setContentType(meta.get(Metadata.CONTENT_TYPE));
            res.setContentLength(data.length);
            OutputStream os = res.getOutputStream();
            os.write(data, 0, data.length);
            res.flushBuffer();
        } else {
            addMyHeader(res, "Res", "not found");
            LOG.info(" -not found " + url);
        }
    } catch (Exception e) {
        // Report the failure in the log and as a diagnostic header; avoid
        // printStackTrace, which would duplicate the log output on stderr.
        LOG.warn(StringUtils.stringifyException(e));
        addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
    }
}
Also used : OutputStream(java.io.OutputStream) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ServletException(javax.servlet.ServletException) IOException(java.io.IOException) Content(org.apache.nutch.protocol.Content) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Example 28 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class FetcherThread method output.

/**
 * Records the outcome of a fetch: updates the crawl datum, optionally parses
 * the content in-line, writes datum/content/parse results to the job context,
 * and — when outlink-depth following is enabled — enqueues discovered
 * outlinks as new fetch items.
 *
 * @param key URL of the fetched page
 * @param datum crawl datum for the page; status, fetch time and (possibly)
 *          signature are updated in place
 * @param content fetched content; may be null (e.g. on fetch failure), in
 *          which case no metadata updates, parsing or content output occur
 * @param pstatus protocol status of the fetch; stored in the datum metadata
 *          when non-null
 * @param status CrawlDatum status code to record on the datum
 * @param outlinkDepth depth of this page in the outlink-following chain;
 *          used to limit recursive outlink fetching
 * @return the ParseStatus of the parse for the original URL, or null when no
 *         parse result is available
 * @throws InterruptedException if interrupted while writing to the context
 */
private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();
        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
            datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            // best-effort: a scoring-filter failure is logged but does not
            // abort output of the fetch result
            if (LOG.isWarnEnabled()) {
                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
            }
        }
        /*
         * Note: Fetcher will only follow meta-redirects coming from the
         * original URL.
         */
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
            if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
                try {
                    parseResult = this.parseUtil.parse(content);
                } catch (Exception e) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
                }
            }
            if (parseResult == null) {
                // no parse available (skipped or failed): still compute a
                // signature from an empty parse so dedup can proceed
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
                datum.setSignature(signature);
            }
        }
        /*
         * Store status code in content So we can read this value during parsing
         * (as a separate job) and decide to parse or not.
         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }
    try {
        context.write(key, new NutchWritable(datum));
        if (content != null && storingContent)
            context.write(key, new NutchWritable(content));
        if (parseResult != null) {
            // A ParseResult can contain multiple entries (presumably one per
            // sub-document/URL produced by the parser) — each is written out.
            for (Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                ParseData parseData = parse.getData();
                if (!parseStatus.isSuccess()) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
                    parse = parseStatus.getEmptyParse(conf);
                }
                // Calculate page signature. For non-parsing fetchers this will
                // be done in ParseSegment
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
                // Ensure segment name and score are in parseData metadata
                parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                // Pass fetch time to content meta
                parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                // only the entry matching the original URL updates the datum
                if (url.equals(key))
                    datum.setSignature(signature);
                try {
                    scfilters.passScoreAfterParsing(url, content, parse);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
                    }
                }
                String origin = null;
                // collect outlinks for subsequent db update
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    // origin (domain or host) is used by filterNormalize to
                    // decide whether an outlink is internal or external
                    URL originURL = new URL(url.toString());
                    // based on domain?
                    if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                        origin = URLUtil.getDomainName(originURL).toLowerCase();
                    } else // use host
                    {
                        origin = originURL.getHost().toLowerCase();
                    }
                }
                // used by fetchNode
                if (fetchNode != null) {
                    fetchNode.setOutlinks(links);
                    fetchNode.setTitle(parseData.getTitle());
                    FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
                }
                int validCount = 0;
                // Process all outlinks, normalize, filter and deduplicate
                List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
                HashSet<String> outlinks = new HashSet<>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                    String toUrl = links[i].getToUrl();
                    toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
                    if (toUrl == null) {
                        continue;
                    }
                    validCount++;
                    links[i].setUrl(toUrl);
                    outlinkList.add(links[i]);
                    outlinks.add(toUrl);
                }
                // Publish fetch report event
                if (activatePublisher) {
                    FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
                    reportEvent.addOutlinksToEventData(outlinkList);
                    reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
                    reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
                    publisher.publish(reportEvent, conf);
                }
                // Only process depth N outlinks
                if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                    FetchItem ft = FetchItem.create(url, null, queueMode);
                    FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
                    // track already-fetched URLs by hash code to avoid
                    // re-queueing (NOTE(review): hash collisions could drop
                    // distinct URLs — presumably an accepted trade-off)
                    queue.alreadyFetched.add(url.toString().hashCode());
                    context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
                    // Counter to limit num outlinks to follow per page
                    int outlinkCounter = 0;
                    // Calculate variable number of outlinks by depth using the
                    // divisor (outlinks = Math.floor(divisor / depth * num.links))
                    int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
                    String followUrl;
                    // Walk over the outlinks and add as new FetchItem to the queues
                    Iterator<String> iter = outlinks.iterator();
                    while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                        followUrl = iter.next();
                        // Check whether we'll follow external outlinks
                        if (outlinksIgnoreExternal) {
                            if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                continue;
                            }
                        }
                        // Already followed?
                        int urlHashCode = followUrl.hashCode();
                        if (queue.alreadyFetched.contains(urlHashCode)) {
                            continue;
                        }
                        queue.alreadyFetched.add(urlHashCode);
                        // Create new FetchItem with depth incremented
                        FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                        context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
                        ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                        outlinkCounter++;
                    }
                }
                // Overwrite the outlinks in ParseData with the normalized and
                // filtered set
                parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
                context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            }
        }
    } catch (IOException e) {
        // best-effort: an output failure is logged but does not propagate
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    }
    // return parse status if it exists
    if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
            context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
            return p.getData().getStatus();
        }
    }
    return null;
}
Also used : Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) ParseText(org.apache.nutch.parse.ParseText) ParseStatus(org.apache.nutch.parse.ParseStatus) HashSet(java.util.HashSet) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) NutchWritable(org.apache.nutch.crawl.NutchWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) IOException(java.io.IOException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 29 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class ParseSegment method isTruncated.

/**
 * Determines whether a page's fetched content is shorter than the length
 * advertised in its Content-Length header, i.e. was truncated in transit.
 *
 * @param content
 *          the fetched content to inspect
 * @return <code>true</code> if the page is truncated; <code>false</code> when
 *         it is not, or when this could not be determined (missing bytes,
 *         missing metadata, or an absent/unparseable Content-Length header)
 */
public static boolean isTruncated(Content content) {
    byte[] bytes = content.getContent();
    if (bytes == null) {
        return false;
    }
    Metadata meta = content.getMetadata();
    if (meta == null) {
        return false;
    }
    // Compare the advertised Content-Length with the bytes actually received.
    String declaredLength = meta.get(Response.CONTENT_LENGTH);
    if (declaredLength != null) {
        declaredLength = declaredLength.trim();
    }
    if (StringUtil.isEmpty(declaredLength)) {
        return false;
    }
    String url = content.getUrl();
    int expectedSize;
    try {
        expectedSize = Integer.parseInt(declaredLength);
    } catch (NumberFormatException e) {
        LOG.warn("Wrong contentlength format for " + url, e);
        return false;
    }
    int receivedSize = bytes.length;
    if (expectedSize > receivedSize) {
        // Fewer bytes arrived than the header promised: truncated.
        LOG.info(url + " skipped. Content of size " + expectedSize + " was truncated to " + receivedSize);
        return true;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug(url + " actualSize=" + receivedSize + " inHeaderSize=" + expectedSize);
    }
    return false;
}
Also used : Metadata(org.apache.nutch.metadata.Metadata)

Example 30 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestHTMLLanguageParser method getContent.

/**
 * Wraps the given text in a Content object flagged as text/html, with a
 * matching Content-Type metadata entry, for use in parser tests.
 */
private Content getContent(String text) {
    Metadata headers = new Metadata();
    headers.add("Content-Type", "text/html");
    byte[] body = text.getBytes();
    return new Content(URL, BASE, body, "text/html", headers, NutchConfiguration.create());
}
Also used : Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata)

Aggregations

Metadata (org.apache.nutch.metadata.Metadata)42 Configuration (org.apache.hadoop.conf.Configuration)20 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)20 ParseData (org.apache.nutch.parse.ParseData)19 Content (org.apache.nutch.protocol.Content)18 Test (org.junit.Test)17 Text (org.apache.hadoop.io.Text)16 Parse (org.apache.nutch.parse.Parse)16 ParseImpl (org.apache.nutch.parse.ParseImpl)15 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)14 Inlinks (org.apache.nutch.crawl.Inlinks)11 Outlink (org.apache.nutch.parse.Outlink)10 ParseStatus (org.apache.nutch.parse.ParseStatus)9 NutchDocument (org.apache.nutch.indexer.NutchDocument)7 ParseResult (org.apache.nutch.parse.ParseResult)7 FileInputStream (java.io.FileInputStream)5 IOException (java.io.IOException)5 File (java.io.File)4 ArrayList (java.util.ArrayList)4 ParseUtil (org.apache.nutch.parse.ParseUtil)4