Search in sources :

Example 21 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestRegexParseFilter method testPositiveFilter.

public void testPositiveFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
    RegexParseFilter filter = new RegexParseFilter(file);
    filter.setConf(conf);
    String url = "http://nutch.apache.org/";
    String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
    Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());
    ParseResult result = ParseResult.createParseResult(url, parse);
    result = filter.filter(content, result, null, null);
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("true", meta.get("first"));
    assertEquals("true", meta.get("second"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 22 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestRegexParseFilter method testNegativeFilter.

public void testNegativeFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
    RegexParseFilter filter = new RegexParseFilter(file);
    filter.setConf(conf);
    String url = "http://nutch.apache.org/";
    String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
    Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData());
    ParseResult result = ParseResult.createParseResult(url, parse);
    result = filter.filter(content, result, null, null);
    Metadata meta = parse.getData().getParseMeta();
    assertEquals("false", meta.get("first"));
    assertEquals("false", meta.get("second"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 23 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestFetcher method testFetch.

@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
    // generate seedlist
    ArrayList<String> urls = new ArrayList<String>();
    addUrl(urls, "index.html");
    addUrl(urls, "pagea.html");
    addUrl(urls, "pageb.html");
    addUrl(urls, "dup_of_pagea.html");
    addUrl(urls, "nested_spider_trap.html");
    addUrl(urls, "exception.html");
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
    // inject
    Injector injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);
    // generate
    Generator g = new Generator(conf);
    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE, false, false);
    long time = System.currentTimeMillis();
    // fetch
    Fetcher fetcher = new Fetcher(conf);
    // Set fetcher.parse to true
    conf.setBoolean("fetcher.parse", true);
    fetcher.fetch(generatedSegment[0], 1);
    time = System.currentTimeMillis() - time;
    // verify politeness, time taken should be more than (num_of_pages +1)*delay
    int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay", 5));
    Assert.assertTrue(time > minimumTime);
    // verify content
    Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME), "part-r-00000/data");
    @SuppressWarnings("resource") SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
    ArrayList<String> handledurls = new ArrayList<String>();
    READ_CONTENT: do {
        Text key = new Text();
        Content value = new Content();
        if (!reader.next(key, value))
            break READ_CONTENT;
        String contentString = new String(value.getContent());
        if (contentString.indexOf("Nutch fetcher test page") != -1) {
            handledurls.add(key.toString());
        }
    } while (true);
    reader.close();
    Collections.sort(urls);
    Collections.sort(handledurls);
    // verify that enough pages were handled
    Assert.assertEquals(urls.size(), handledurls.size());
    // verify that correct pages were handled
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
    handledurls.clear();
    // verify parse data
    Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
    reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
    READ_PARSE_DATA: do {
        Text key = new Text();
        ParseData value = new ParseData();
        if (!reader.next(key, value))
            break READ_PARSE_DATA;
        // make sure they all contain "nutch.segment.name" and
        // "nutch.content.digest"
        // keys in parse metadata
        Metadata contentMeta = value.getContentMeta();
        if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
            handledurls.add(key.toString());
        }
    } while (true);
    Collections.sort(handledurls);
    Assert.assertEquals(urls.size(), handledurls.size());
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) Metadata(org.apache.nutch.metadata.Metadata) Text(org.apache.hadoop.io.Text) SequenceFile(org.apache.hadoop.io.SequenceFile) ParseData(org.apache.nutch.parse.ParseData) Injector(org.apache.nutch.crawl.Injector) Content(org.apache.nutch.protocol.Content) Generator(org.apache.nutch.crawl.Generator) Test(org.junit.Test)

Example 24 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestIndexingFilters method testNonExistingIndexingFilter.

/**
 * Test behaviour when defined filter does not exist.
 *
 * @throws IndexingException
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 25 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class FetcherThread method output.

private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();
        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
            datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
            }
        }
        /*
       * Note: Fetcher will only follow meta-redirects coming from the
       * original URL.
       */
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
            if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
                try {
                    parseResult = this.parseUtil.parse(content);
                } catch (Exception e) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
                }
            }
            if (parseResult == null) {
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
                datum.setSignature(signature);
            }
        }
        /*
       * Store status code in content So we can read this value during parsing
       * (as a separate job) and decide to parse or not.
       */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }
    try {
        context.write(key, new NutchWritable(datum));
        if (content != null && storingContent)
            context.write(key, new NutchWritable(content));
        if (parseResult != null) {
            for (Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                ParseData parseData = parse.getData();
                if (!parseStatus.isSuccess()) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
                    parse = parseStatus.getEmptyParse(conf);
                }
                // Calculate page signature. For non-parsing fetchers this will
                // be done in ParseSegment
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
                // Ensure segment name and score are in parseData metadata
                parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                // Pass fetch time to content meta
                parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                if (url.equals(key))
                    datum.setSignature(signature);
                try {
                    scfilters.passScoreAfterParsing(url, content, parse);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
                    }
                }
                String origin = null;
                // collect outlinks for subsequent db update
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    URL originURL = new URL(url.toString());
                    // based on domain?
                    if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                        origin = URLUtil.getDomainName(originURL).toLowerCase();
                    } else // use host
                    {
                        origin = originURL.getHost().toLowerCase();
                    }
                }
                // used by fetchNode
                if (fetchNode != null) {
                    fetchNode.setOutlinks(links);
                    fetchNode.setTitle(parseData.getTitle());
                    FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
                }
                int validCount = 0;
                // Process all outlinks, normalize, filter and deduplicate
                List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
                HashSet<String> outlinks = new HashSet<>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                    String toUrl = links[i].getToUrl();
                    toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
                    if (toUrl == null) {
                        continue;
                    }
                    validCount++;
                    links[i].setUrl(toUrl);
                    outlinkList.add(links[i]);
                    outlinks.add(toUrl);
                }
                // Publish fetch report event
                if (activatePublisher) {
                    FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
                    reportEvent.addOutlinksToEventData(outlinkList);
                    reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
                    reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
                    publisher.publish(reportEvent, conf);
                }
                // Only process depth N outlinks
                if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                    FetchItem ft = FetchItem.create(url, null, queueMode);
                    FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
                    queue.alreadyFetched.add(url.toString().hashCode());
                    context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
                    // Counter to limit num outlinks to follow per page
                    int outlinkCounter = 0;
                    // Calculate variable number of outlinks by depth using the
                    // divisor (outlinks = Math.floor(divisor / depth * num.links))
                    int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
                    String followUrl;
                    // Walk over the outlinks and add as new FetchItem to the queues
                    Iterator<String> iter = outlinks.iterator();
                    while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                        followUrl = iter.next();
                        // Check whether we'll follow external outlinks
                        if (outlinksIgnoreExternal) {
                            if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                continue;
                            }
                        }
                        // Already followed?
                        int urlHashCode = followUrl.hashCode();
                        if (queue.alreadyFetched.contains(urlHashCode)) {
                            continue;
                        }
                        queue.alreadyFetched.add(urlHashCode);
                        // Create new FetchItem with depth incremented
                        FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                        context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
                        ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                        outlinkCounter++;
                    }
                }
                // Overwrite the outlinks in ParseData with the normalized and
                // filtered set
                parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
                context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            }
        }
    } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    }
    // return parse status if it exits
    if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
            context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
            return p.getData().getStatus();
        }
    }
    return null;
}
Also used : Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) ParseText(org.apache.nutch.parse.ParseText) ParseStatus(org.apache.nutch.parse.ParseStatus) HashSet(java.util.HashSet) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) NutchWritable(org.apache.nutch.crawl.NutchWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) IOException(java.io.IOException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Aggregations

ParseData (org.apache.nutch.parse.ParseData)37 ParseImpl (org.apache.nutch.parse.ParseImpl)29 Text (org.apache.hadoop.io.Text)23 ParseStatus (org.apache.nutch.parse.ParseStatus)23 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)22 Outlink (org.apache.nutch.parse.Outlink)22 Inlinks (org.apache.nutch.crawl.Inlinks)19 Metadata (org.apache.nutch.metadata.Metadata)19 Test (org.junit.Test)19 NutchDocument (org.apache.nutch.indexer.NutchDocument)16 Configuration (org.apache.hadoop.conf.Configuration)14 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)14 Parse (org.apache.nutch.parse.Parse)9 URL (java.net.URL)7 ArrayList (java.util.ArrayList)6 ParseResult (org.apache.nutch.parse.ParseResult)6 ByteArrayInputStream (java.io.ByteArrayInputStream)5 IOException (java.io.IOException)5 Inlink (org.apache.nutch.crawl.Inlink)5 Content (org.apache.nutch.protocol.Content)5