Search in sources:

Example 1 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

The class FeedParser, method main.

/**
 * Runs a command line version of this {@link Parser}.
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: FeedParser <feed>");
        System.exit(1);
    }
    String name = args[0];
    String url = "file:" + name;
    Configuration conf = NutchConfiguration.create();
    FeedParser parser = new FeedParser();
    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources closes the stream even if readFully throws;
    // the original leaked the file descriptor on a read error
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        in.readFully(bytes);
    }
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
    // Dump every parse (a feed yields one entry per linked item) to stdout.
    for (Entry<Text, Parse> entry : parseResult) {
        System.out.println("key: " + entry.getKey());
        Parse parse = entry.getValue();
        System.out.println("data: " + parse.getData());
        System.out.println("text: " + parse.getText() + "\n");
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) ParseResult(org.apache.nutch.parse.ParseResult) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) Content(org.apache.nutch.protocol.Content) SyndContent(com.rometools.rome.feed.synd.SyndContent) File(java.io.File)

Example 2 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

The class FeedParser, method getParse.

/**
 * Parses the given feed and extracts out and parses all linked items within
 * the feed, using the underlying ROME feed parsing library.
 *
 * @param content
 *          A {@link Content} object representing the feed that is being
 *          parsed by this {@link Parser}.
 *
 * @return A {@link ParseResult} containing all {@link Parse}d feeds that were
 *         present in the feed file that this {@link Parser} dealt with.
 */
@Override
public ParseResult getParse(Content content) {
    ParseResult result = new ParseResult(content.getUrl());

    // Guess the character encoding before handing the bytes to ROME.
    EncodingDetector encodingDetector = new EncodingDetector(conf);
    encodingDetector.autoDetectClues(content, true);
    String charset = encodingDetector.guessEncoding(content, defaultEncoding);

    SyndFeed syndFeed = null;
    try {
        InputSource source = new InputSource(new ByteArrayInputStream(content.getContent()));
        source.setEncoding(charset);
        syndFeed = new SyndFeedInput().build(source);
    } catch (Exception e) {
        // return empty parse
        LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e));
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // Normalize and filter the feed-level link; any failure leaves it null.
    String link = syndFeed.getLink();
    try {
        link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
        if (link != null) {
            link = filters.filter(link);
        }
    } catch (Exception e) {
        link = null;
    }

    // Each feed entry contributes its own parse to the result.
    for (Object item : syndFeed.getEntries()) {
        addToMap(result, syndFeed, link, (SyndEntry) item, content);
    }

    // Finally record a parse for the feed URL itself.
    String description = stripTags(syndFeed.getDescriptionEx());
    String title = stripTags(syndFeed.getTitleEx());
    result.put(content.getUrl(), new ParseText(description), new ParseData(new ParseStatus(ParseStatus.SUCCESS), title, new Outlink[0], content.getMetadata()));
    return result;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) InputSource(org.xml.sax.InputSource) ParseResult(org.apache.nutch.parse.ParseResult) ParseText(org.apache.nutch.parse.ParseText) SyndFeed(com.rometools.rome.feed.synd.SyndFeed) ParseStatus(org.apache.nutch.parse.ParseStatus) EncodingDetector(org.apache.nutch.util.EncodingDetector) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) SyndFeedInput(com.rometools.rome.io.SyndFeedInput)

Example 3 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

The class FetcherThread, method output.

/**
 * Writes the fetch output (crawl datum, optionally content, and parses) to
 * the job context. When a {@link ParseResult} is available, each parse is
 * processed: signature calculation, score passing, outlink
 * normalization/filtering/deduplication, optional report-event publishing,
 * and optional queueing of outlinks up to a configured depth.
 *
 * @param key URL of the fetched page
 * @param datum crawl state for the URL; status, fetch time, metadata and
 *          signature are updated in place
 * @param content fetched content, may be null (content-related steps are
 *          then skipped entirely)
 * @param pstatus protocol status to record in the datum metadata, may be null
 * @param status fetch status code to set on the datum
 * @param outlinkDepth current outlink-following depth for this page
 * @return the {@link ParseStatus} of the parse keyed by the fetched URL, or
 *         null if there is no such parse
 * @throws InterruptedException if writing to the job context is interrupted
 */
private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();
        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
            datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            // scoring is best-effort: log and continue without the score
            if (LOG.isWarnEnabled()) {
                LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(), Thread.currentThread().getId(), key, e);
            }
        }
        if (status == CrawlDatum.STATUS_FETCH_SUCCESS) {
            // parse inline only when configured to, and only if the content
            // is not truncated (or truncated content is accepted)
            if (parsing && !(skipTruncated && ParseSegment.isTruncated(content))) {
                try {
                    parseResult = this.parseUtil.parse(content);
                } catch (Exception e) {
                    LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key, StringUtils.stringifyException(e));
                }
            }
            // no parse available: still compute a signature from the raw
            // content when parsing or signatureWithoutParsing is enabled
            if (parseResult == null && (parsing || signatureWithoutParsing)) {
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
                datum.setSignature(signature);
            }
        }
        /*
         * Store the fetch status code in the content metadata so that a
         * separate parsing job can read it and decide whether to parse.
         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }
    try {
        context.write(key, new NutchWritable(datum));
        if (content != null && storingContent)
            context.write(key, new NutchWritable(content));
        if (parseResult != null) {
            // a ParseResult can carry several parses keyed by URL
            for (Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                ParseData parseData = parse.getData();
                if (!parseStatus.isSuccess()) {
                    // failed parse: fall back to an empty parse but keep going
                    LOG.warn("{} {} Error parsing: {}: {}", getName(), Thread.currentThread().getId(), key, parseStatus);
                    parse = parseStatus.getEmptyParse(conf);
                }
                // Calculate page signature. For non-parsing fetchers this will
                // be done in ParseSegment
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
                // Ensure segment name and score are in parseData metadata
                parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                // Pass fetch time to content meta
                parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                // only the parse for the fetched URL itself updates the datum
                if (url.equals(key))
                    datum.setSignature(signature);
                try {
                    scfilters.passScoreAfterParsing(url, content, parse);
                } catch (Exception e) {
                    // best-effort again: a scoring failure must not abort output
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(), Thread.currentThread().getId(), key, e);
                    }
                }
                String origin = null;
                // collect outlinks for subsequent db update
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    // origin (domain or host, lowercased) is the reference used
                    // to classify each outlink as internal or external
                    URL originURL = new URL(url.toString());
                    // based on domain?
                    if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                        origin = URLUtil.getDomainName(originURL).toLowerCase();
                    } else // use host
                    {
                        origin = originURL.getHost().toLowerCase();
                    }
                }
                // used by fetchNode
                if (fetchNode != null) {
                    fetchNode.setOutlinks(links);
                    fetchNode.setTitle(parseData.getTitle());
                    FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
                }
                int validCount = 0;
                // Process all outlinks, normalize, filter and deduplicate
                List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
                HashSet<String> outlinks = new HashSet<>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                    String toUrl = links[i].getToUrl();
                    // drop overly long URLs outright
                    if (toUrl.length() > maxOutlinkLength) {
                        continue;
                    }
                    toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
                    // null means the URL was rejected by a filter/normalizer
                    if (toUrl == null) {
                        continue;
                    }
                    validCount++;
                    links[i].setUrl(toUrl);
                    outlinkList.add(links[i]);
                    outlinks.add(toUrl);
                }
                // Publish fetch report event
                if (activatePublisher) {
                    FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
                    reportEvent.addOutlinksToEventData(outlinkList);
                    reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
                    reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
                    publisher.publish(reportEvent, conf);
                }
                // Only process depth N outlinks
                if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth && !fetchQueues.timelimitExceeded()) {
                    FetchItem ft = FetchItem.create(url, null, queueMode);
                    FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID);
                    // remember this page by URL hash so it is not re-queued
                    queue.alreadyFetched.add(url.toString().hashCode());
                    context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
                    // Counter to limit num outlinks to follow per page
                    int outlinkCounter = 0;
                    String followUrl;
                    // Walk over the outlinks and add as new FetchItem to the queues
                    Iterator<String> iter = outlinks.iterator();
                    while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                        followUrl = iter.next();
                        // Check whether we'll follow external outlinks
                        if (outlinksIgnoreExternal) {
                            if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                continue;
                            }
                        }
                        // Already followed?
                        int urlHashCode = followUrl.hashCode();
                        if (queue.alreadyFetched.contains(urlHashCode)) {
                            continue;
                        }
                        queue.alreadyFetched.add(urlHashCode);
                        // Create new FetchItem with depth incremented
                        FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                        context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
                        fetchQueues.addFetchItem(fit);
                        outlinkCounter++;
                    }
                }
                // Overwrite the outlinks in ParseData with the normalized and
                // filtered set
                parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
                context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            }
        }
    } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:", e);
        }
    }
    // A ParseResult may hold multiple parses; returning the status of the
    // parse keyed by the fetched URL allows Fetcher to follow meta-redirects
    if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
            context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
            return p.getData().getStatus();
        }
    }
    return null;
}
Also used : Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) ParseText(org.apache.nutch.parse.ParseText) ParseStatus(org.apache.nutch.parse.ParseStatus) HashSet(java.util.HashSet) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) NutchWritable(org.apache.nutch.crawl.NutchWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) IOException(java.io.IOException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 4 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

The class HtmlParser, method getParse.

/**
 * Parses HTML content: detects the character encoding, builds a DOM
 * fragment, extracts text, title and outlinks while honouring HTML meta
 * directives (noindex, nofollow, nocache, refresh), and finally runs the
 * configured HTML parse filters over the result.
 *
 * @param content raw fetched content to parse
 * @return a filtered {@link ParseResult}; an empty parse result on failure
 */
@Override
public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();
    // parse the content
    DocumentFragment root;
    try {
        byte[] contentInOctets = content.getContent();
        InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(content, true);
        detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
        String encoding = detector.guessEncoding(content, defaultCharEncoding);
        // remember both the detected encoding and the one used for conversion
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
        input.setEncoding(encoding);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        root = parse(input);
    } catch (IOException | DOMException | SAXException e) {
        // expected parser failures: return an empty parse carrying the error
        // (multi-catch collapses three formerly identical catch blocks)
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // anything unexpected: log it, then degrade to an empty parse as well
        LOG.error("Error: ", e);
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    // populate Nutch metadata with HTML meta directives
    metadata.addAll(metaTags.getGeneralTags());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        // extract title
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    }
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<>();
        // a <base href> tag, when present and valid, overrides the base URL
        URL baseTag = base;
        String baseTagHref = utils.getBase(root);
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                baseTag = base;
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(baseTag, l, root);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        // a meta refresh is reported as a redirect with its target and delay
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) InputSource(org.xml.sax.InputSource) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) SAXException(org.xml.sax.SAXException) ParseStatus(org.apache.nutch.parse.ParseStatus) DOMException(org.w3c.dom.DOMException) EncodingDetector(org.apache.nutch.util.EncodingDetector) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) IOException(java.io.IOException) DOMException(org.w3c.dom.DOMException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map)

Example 5 with ParseResult

use of org.apache.nutch.parse.ParseResult in project nutch by apache.

The class SmallStack, method main.

/**
 * @param args arguments are: 0. Name of input SWF file.
 * @throws IOException if there is a fatal error processing the input
 * file
 */
public static void main(String[] args) throws IOException {
    byte[] buf;
    // Read the whole file. A single read() is NOT guaranteed to fill the
    // buffer (the original silently parsed partially-read data), and
    // try-with-resources closes the stream even on error.
    try (FileInputStream in = new FileInputStream(args[0])) {
        // NOTE(review): available() is assumed to equal the file size here,
        // which holds for a FileInputStream over a regular local file
        buf = new byte[in.available()];
        int off = 0;
        while (off < buf.length) {
            int n = in.read(buf, off, buf.length - off);
            if (n < 0) {
                throw new IOException("Premature end of file: " + args[0]);
            }
            off += n;
        }
    }
    SWFParser parser = new SWFParser();
    ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0], buf, "application/x-shockwave-flash", new Metadata(), NutchConfiguration.create()));
    Parse p = parseResult.get("file:" + args[0]);
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
}
Also used : ParseResult(org.apache.nutch.parse.ParseResult) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) FileInputStream(java.io.FileInputStream)

Aggregations

ParseResult (org.apache.nutch.parse.ParseResult)11 Parse (org.apache.nutch.parse.Parse)10 Metadata (org.apache.nutch.metadata.Metadata)7 Content (org.apache.nutch.protocol.Content)7 ParseData (org.apache.nutch.parse.ParseData)6 Configuration (org.apache.hadoop.conf.Configuration)5 ParseImpl (org.apache.nutch.parse.ParseImpl)5 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)5 Map (java.util.Map)4 Text (org.apache.hadoop.io.Text)4 Outlink (org.apache.nutch.parse.Outlink)4 ParseStatus (org.apache.nutch.parse.ParseStatus)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 FileInputStream (java.io.FileInputStream)3 MalformedURLException (java.net.MalformedURLException)3 URL (java.net.URL)3 ArrayList (java.util.ArrayList)3 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)3 ParseText (org.apache.nutch.parse.ParseText)3 File (java.io.File)2