Search in sources :

Example 11 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class HtmlParser method getParse.

/**
 * Parses the given HTML content into a {@link ParseResult}.
 *
 * <p>The content bytes are parsed into a DOM fragment using the encoding
 * guessed by an {@link EncodingDetector}. HTML meta directives are then read
 * from the DOM and control which parts are extracted: text and title are
 * taken only when indexing is not forbidden, outlinks only when following is
 * not forbidden. A meta refresh is recorded as a redirect minor code on the
 * parse status. Finally the configured HTML parse filters are run and, when
 * caching is forbidden, the caching policy is written into the parse metadata
 * of every entry of the filtered result.
 *
 * @param content the fetched content to parse
 * @return the filtered parse result, or an empty parse result carrying the
 *         failure status when the base URL is malformed or parsing throws
 */
public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    // Without a valid base URL nothing else can be resolved.
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    Metadata metadata = new Metadata();

    // Build the DOM, recording the encoding that was used for the conversion.
    DocumentFragment root;
    try {
        byte[] rawBytes = content.getContent();
        InputSource input = new InputSource(new ByteArrayInputStream(rawBytes));
        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(content, true);
        detector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
        String encoding = detector.guessEncoding(content, defaultCharEncoding);
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
        input.setEncoding(encoding);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        root = parse(input);
    } catch (IOException | DOMException | SAXException e) {
        // These failures are reported through the status only, without logging.
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) {
        // Anything else is additionally logged before being reported.
        LOG.error("Error: ", e);
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // Read the meta directives and copy the general ones into the parse metadata.
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    metadata.addAll(metaTags.getGeneralTags());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // Text and title are extracted only when indexing is allowed.
    String text = "";
    String title = "";
    if (!metaTags.getNoIndex()) {
        StringBuffer buffer = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(buffer, root);
        text = buffer.toString();

        // Reuse the same buffer for the title.
        buffer.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(buffer, root);
        title = buffer.toString().trim();
    }

    // Outlinks are extracted only when following links is allowed.
    Outlink[] outlinks = new Outlink[0];
    if (!metaTags.getNoFollow()) {
        ArrayList<Outlink> collected = new ArrayList<Outlink>();
        URL linkBase = base;
        String baseHref = utils.getBase(root);
        if (baseHref != null) {
            try {
                linkBase = new URL(base, baseHref);
            } catch (MalformedURLException e) {
                // Unusable base href: linkBase still holds the content base URL.
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(linkBase, collected, root);
        outlinks = collected.toArray(new Outlink[collected.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }

    // A meta refresh is reported as a redirect with its target and delay as args.
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        String refreshTarget = metaTags.getRefreshHref().toString();
        String refreshDelay = Integer.toString(metaTags.getRefreshTime());
        status.setArgs(new String[] { refreshTarget, refreshDelay });
    }

    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
    ParseImpl parse = new ParseImpl(text, parseData);
    ParseResult unfiltered = ParseResult.createParseResult(content.getUrl(), parse);

    // Run the HTML parse filters over the assembled result.
    ParseResult filteredParse = this.htmlParseFilters.filter(content, unfiltered, metaTags, root);

    if (metaTags.getNoCache()) {
        // Caching is forbidden: mark every parse in the filtered result.
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
            entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
        }
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) InputSource(org.xml.sax.InputSource) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) SAXException(org.xml.sax.SAXException) ParseStatus(org.apache.nutch.parse.ParseStatus) DOMException(org.w3c.dom.DOMException) EncodingDetector(org.apache.nutch.util.EncodingDetector) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) IOException(java.io.IOException) DOMException(org.w3c.dom.DOMException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map)

Example 12 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class TestDOMContentUtils method setup.

/**
 * Builds the fixtures shared by the tests: the configuration and
 * {@code DOMContentUtils} instance, one parsed DOM fragment and base URL per
 * test page, and the table of outlinks expected for each page.
 *
 * <p>Any failure while parsing a test page or building the expected outlinks
 * fails the setup immediately instead of leaving a fixture unset.
 */
@Before
public void setup() {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
        parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true);
    } catch (SAXException ignored) {
        // Best effort: setup continues if the parser rejects this feature.
        // NOTE(review): presumably deliberate, to tolerate parser versions
        // that lack the feature - confirm.
    }
    for (int i = 0; i < testPages.length; i++) {
        DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
        try {
            parser.parse(new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), node);
            testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
        } catch (Exception e) {
            Assert.fail("caught exception: " + e);
        }
        testDOMs[i] = node;
    }
    try {
        // One row of expected outlinks per test page, in test page order.
        answerOutlinks = new Outlink[][] {
            { new Outlink("http://www.nutch.org", "anchor") },
            { new Outlink("http://www.nutch.org/", "home"),
              new Outlink("http://www.nutch.org/docs/bot.html", "bots") },
            { new Outlink("http://www.nutch.org/", "separate this"),
              new Outlink("http://www.nutch.org/docs/ok", "from this") },
            { new Outlink("http://www.nutch.org/", "home"),
              new Outlink("http://www.nutch.org/docs/1", "1"),
              new Outlink("http://www.nutch.org/docs/2", "2") },
            { new Outlink("http://www.nutch.org/frames/top.html", ""),
              new Outlink("http://www.nutch.org/frames/left.html", ""),
              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
              new Outlink("http://www.nutch.org/frames/right.html", "") },
            { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
              new Outlink("http://www.nutch.org/index.html", ""),
              new Outlink("http://www.nutch.org/maps/#bottom", ""),
              new Outlink("http://www.nutch.org/bot.html", ""),
              new Outlink("http://www.nutch.org/docs/index.html", "") },
            { new Outlink("http://www.nutch.org/index.html", "whitespace test") },
            {},
            { new Outlink("http://www.nutch.org/dummy.jsp", "test2") },
            {},
            { new Outlink("http://www.nutch.org/;x", "anchor1"),
              new Outlink("http://www.nutch.org/g;x", "anchor2"),
              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
            { // this is tricky - see RFC3986 section 5.4.1 example 7
              new Outlink("http://www.nutch.org/g", "anchor1"),
              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
              new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") },
            { new Outlink("http://www.nutch.org/g", ""),
              new Outlink("http://www.nutch.org/g1", ""),
              new Outlink("http://www.nutch.org/g2", "bla bla"),
              new Outlink("http://www.nutch.org/test.gif", "bla bla") },
            { new Outlink("http://www.nutch.org/movie.mp4", "") }
        };
    } catch (MalformedURLException e) {
        // A malformed fixture URL would otherwise leave answerOutlinks unset
        // and only show up later as a failure in an unrelated test.
        Assert.fail("caught exception: " + e);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) ByteArrayInputStream(java.io.ByteArrayInputStream) URL(java.net.URL) MalformedURLException(java.net.MalformedURLException) Before(org.junit.Before)

Example 13 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class TestHtmlParser method testResolveBaseUrl.

/**
 * Parses the base-URL test document and checks that exactly one outlink is
 * produced and that it points to
 * {@code http://www.example.com/index.html}.
 */
@Test
public void testResolveBaseUrl() {
    // parse using http://example.com/ as "fetch" URL
    Parse parsed = parse(resolveBaseUrlTestContent.getBytes(StandardCharsets.UTF_8));
    LOG.info(parsed.getData().toString());

    Outlink[] links = parsed.getData().getOutlinks();
    Assert.assertEquals(1, links.length);

    Outlink onlyLink = links[0];
    Assert.assertEquals("http://www.example.com/index.html", onlyLink.getToUrl());
}
Also used : Outlink(org.apache.nutch.parse.Outlink) Parse(org.apache.nutch.parse.Parse) Test(org.junit.Test)

Example 14 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class JSParseFilter method getParse.

/**
 * Parses content as JavaScript: extracts outlinks from the script and uses
 * the beginning of its first line as the title.
 *
 * <p>Content whose type is present, non-blank and does not start with
 * {@code application/x-javascript} is rejected. The type is trimmed and
 * lower-cased with {@link java.util.Locale#ROOT} before the comparison, so
 * the check does not depend on the default locale or on surrounding
 * whitespace.
 *
 * @param c the content to parse
 * @return a parse result holding the script text, title and outlinks, or an
 *         empty parse result with status {@code FAILED_INVALID_FORMAT} when
 *         the content type is not JavaScript
 */
public ParseResult getParse(Content c) {
    String type = c.getContentType();
    if (type != null) {
        // Locale.ROOT avoids locale-specific case mapping (e.g. the Turkish
        // dotless i) breaking the ASCII prefix comparison.
        String normalizedType = type.trim().toLowerCase(java.util.Locale.ROOT);
        if (!normalizedType.isEmpty() && !normalizedType.startsWith("application/x-javascript")) {
            return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT, "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
        }
    }
    // NOTE(review): decodes the bytes with the platform default charset; the
    // script's real encoding is not known here - confirm whether a fixed
    // charset should be used.
    String script = new String(c.getContent());
    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
    if (outlinks == null) {
        outlinks = new Outlink[0];
    }
    // Title? use the first line of the script, capped at MAX_TITLE_LEN chars
    // (the whole script when it contains no newline).
    int newlineIdx = script.indexOf('\n');
    int firstLineEnd = (newlineIdx != -1) ? newlineIdx : script.length();
    String title = script.substring(0, Math.min(MAX_TITLE_LEN, firstLineEnd));
    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, c.getMetadata());
    return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 15 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class JSParseFilter method filter.

/**
 * Walks the document collecting additional outlinks and, when any are found,
 * replaces the parse stored for the content URL with one whose outlinks are
 * the newly found links followed by the previously existing ones.
 *
 * @param content the content being parsed
 * @param parseResult the parse result to inspect and possibly update
 * @param metaTags the HTML meta tags of the document
 * @param doc the DOM fragment of the document
 * @return the same {@code parseResult} instance, updated in place when new
 *         outlinks were found
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());
    String baseUrl = content.getBaseUrl();
    ArrayList<Outlink> found = new ArrayList<Outlink>();
    walk(doc, parse, metaTags, baseUrl, found);

    // Nothing new discovered: leave the parse result untouched.
    if (found.isEmpty()) {
        return parseResult;
    }

    // Append the existing outlinks after the newly found ones.
    Outlink[] existing = parse.getData().getOutlinks();
    String title = parse.getData().getTitle();
    found.addAll(Arrays.asList(existing));

    ParseStatus status = parse.getData().getStatus();
    String text = parse.getText();
    Outlink[] merged = found.toArray(new Outlink[found.size()]);
    ParseData mergedData = new ParseData(status, title, merged, parse.getData().getContentMeta(), parse.getData().getParseMeta());

    // replace original parse obj with new one
    parseResult.put(content.getUrl(), new ParseText(text), mergedData);
    return parseResult;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) Parse(org.apache.nutch.parse.Parse) ArrayList(java.util.ArrayList) ParseText(org.apache.nutch.parse.ParseText)

Aggregations

Outlink (org.apache.nutch.parse.Outlink)37 ParseData (org.apache.nutch.parse.ParseData)22 ParseImpl (org.apache.nutch.parse.ParseImpl)17 ParseStatus (org.apache.nutch.parse.ParseStatus)16 URL (java.net.URL)13 Text (org.apache.hadoop.io.Text)13 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)11 Test (org.junit.Test)11 Parse (org.apache.nutch.parse.Parse)10 MalformedURLException (java.net.MalformedURLException)9 Inlinks (org.apache.nutch.crawl.Inlinks)9 NutchDocument (org.apache.nutch.indexer.NutchDocument)9 Metadata (org.apache.nutch.metadata.Metadata)9 ArrayList (java.util.ArrayList)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 Configuration (org.apache.hadoop.conf.Configuration)6 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)6 IOException (java.io.IOException)5 ParseText (org.apache.nutch.parse.ParseText)4 Map (java.util.Map)3