Search in sources:

Example 1 with EncodingDetector

Use of org.apache.nutch.util.EncodingDetector in the Apache Nutch project.

Class FeedParser, method getParse.

/**
 * Parses a syndication feed with the ROME library and records one parse per
 * feed entry, followed by a parse for the feed document itself.
 *
 * @param content
 *          The {@link Content} holding the raw bytes of the feed handled by
 *          this {@link Parser}.
 *
 * @return A {@link ParseResult} holding the {@link Parse}s produced for the
 *         feed's entries and for the feed itself, or an empty parse result
 *         built from the failure status when ROME cannot build the feed.
 */
public ParseResult getParse(Content content) {
    ParseResult result = new ParseResult(content.getUrl());

    // Work out which character encoding to hand to the XML input source.
    EncodingDetector encodingDetector = new EncodingDetector(conf);
    encodingDetector.autoDetectClues(content, true);
    String charset = encodingDetector.guessEncoding(content, defaultEncoding);

    SyndFeed feed;
    try {
        InputSource source = new InputSource(new ByteArrayInputStream(content.getContent()));
        source.setEncoding(charset);
        feed = new SyndFeedInput().build(source);
    } catch (Exception e) {
        // return empty parse
        LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e));
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // Normalize, then filter, the feed's own link; any failure here simply
    // leaves the feed without a usable link.
    String link = feed.getLink();
    try {
        link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
        if (link != null) {
            link = filters.filter(link);
        }
    } catch (Exception e) {
        link = null;
    }

    // Register every entry of the feed in the result.
    List<?> items = feed.getEntries();
    for (Object item : items) {
        SyndEntry entry = (SyndEntry) item;
        addToMap(result, feed, link, entry, content);
    }

    // Finally add the parse for the feed document itself.
    String description = stripTags(feed.getDescriptionEx());
    String title = stripTags(feed.getTitleEx());
    String url = content.getUrl();
    ParseText feedText = new ParseText(description);
    ParseData feedData = new ParseData(new ParseStatus(ParseStatus.SUCCESS), title, new Outlink[0], content.getMetadata());
    result.put(url, feedText, feedData);
    return result;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) InputSource(org.xml.sax.InputSource) ParseResult(org.apache.nutch.parse.ParseResult) ParseText(org.apache.nutch.parse.ParseText) SyndFeed(com.rometools.rome.feed.synd.SyndFeed) ParseStatus(org.apache.nutch.parse.ParseStatus) EncodingDetector(org.apache.nutch.util.EncodingDetector) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) SyndFeedInput(com.rometools.rome.io.SyndFeedInput)

Example 2 with EncodingDetector

Use of org.apache.nutch.util.EncodingDetector in the Apache Nutch project.

Class HtmlParser, method getParse.

/**
 * Parses an HTML document into a {@link ParseResult}.
 *
 * The method resolves the base URL, guesses the character encoding, parses
 * the bytes into a DOM fragment and reads the HTML meta directives. Text and
 * title are extracted unless the page is marked noindex, and outlinks unless
 * it is marked nofollow. A meta refresh is recorded on the parse status, the
 * configured HTML parse filters are run, and the caching policy is written to
 * every resulting parse when the page is marked nocache.
 *
 * @param content
 *          the fetched content to parse
 * @return the filtered parse result, or an empty parse result built from the
 *         failure status when the base URL is malformed or parsing throws
 */
public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
        byte[] rawBytes = content.getContent();
        InputSource source = new InputSource(new ByteArrayInputStream(rawBytes));

        // Collect encoding clues, including one sniffed from the raw bytes,
        // and record the chosen encoding in the parse metadata.
        EncodingDetector encodingDetector = new EncodingDetector(conf);
        encodingDetector.autoDetectClues(content, true);
        encodingDetector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
        String charset = encodingDetector.guessEncoding(content, defaultCharEncoding);
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, charset);
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, charset);
        source.setEncoding(charset);

        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        root = parse(source);
    } catch (IOException | DOMException | SAXException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // Anything else is unexpected, so it is logged before giving up.
        LOG.error("Error: ", e);
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    // populate Nutch metadata with HTML meta directives
    metadata.addAll(metaTags.getGeneralTags());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // okay to index: extract text, then title, reusing one buffer
    if (!metaTags.getNoIndex()) {
        StringBuffer buffer = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(buffer, root);
        text = buffer.toString();

        buffer.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(buffer, root);
        title = buffer.toString().trim();
    }

    // okay to follow links: extract outlinks
    if (!metaTags.getNoFollow()) {
        ArrayList<Outlink> links = new ArrayList<>();

        // Resolve links against the document's base href when it yields a
        // valid URL; otherwise keep the content's base URL.
        URL linkBase = base;
        String baseHref = utils.getBase(root);
        if (baseHref != null) {
            try {
                linkBase = new URL(base, baseHref);
            } catch (MalformedURLException ignored) {
                // linkBase still holds the content's base URL
            }
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(linkBase, links, root);
        outlinks = links.toArray(new Outlink[links.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        String refreshTarget = metaTags.getRefreshHref().toString();
        String refreshDelay = Integer.toString(metaTags.getRefreshTime());
        status.setArgs(new String[] { refreshTarget, refreshDelay });
    }

    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
    ParseImpl parse = new ParseImpl(text, parseData);
    ParseResult unfiltered = ParseResult.createParseResult(content.getUrl(), parse);

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, unfiltered, metaTags, root);

    // not okay to cache: stamp the caching policy on every parse
    if (metaTags.getNoCache()) {
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
            entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
        }
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) InputSource(org.xml.sax.InputSource) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) SAXException(org.xml.sax.SAXException) ParseStatus(org.apache.nutch.parse.ParseStatus) DOMException(org.w3c.dom.DOMException) EncodingDetector(org.apache.nutch.util.EncodingDetector) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) IOException(java.io.IOException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map)

Aggregations

ByteArrayInputStream (java.io.ByteArrayInputStream)2 Outlink (org.apache.nutch.parse.Outlink)2 ParseData (org.apache.nutch.parse.ParseData)2 ParseResult (org.apache.nutch.parse.ParseResult)2 ParseStatus (org.apache.nutch.parse.ParseStatus)2 EncodingDetector (org.apache.nutch.util.EncodingDetector)2 InputSource (org.xml.sax.InputSource)2 SyndFeed (com.rometools.rome.feed.synd.SyndFeed)1 SyndFeedInput (com.rometools.rome.io.SyndFeedInput)1 IOException (java.io.IOException)1 MalformedURLException (java.net.MalformedURLException)1 URL (java.net.URL)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1 Metadata (org.apache.nutch.metadata.Metadata)1 HTMLMetaTags (org.apache.nutch.parse.HTMLMetaTags)1 Parse (org.apache.nutch.parse.Parse)1 ParseImpl (org.apache.nutch.parse.ParseImpl)1 ParseText (org.apache.nutch.parse.ParseText)1 DOMException (org.w3c.dom.DOMException)1