Search in sources :

Example 1 with HTMLMetaTags

use of org.apache.nutch.parse.HTMLMetaTags in project nutch by apache.

the class HtmlParser method getParse.

/**
 * Parses the HTML held by {@code content} into a DOM fragment and derives the
 * page text, title, outlinks and metadata from it, taking the noindex,
 * nofollow, nocache and refresh meta directives into account.
 *
 * @param content the fetched content to parse
 * @return the parse result after the HTML parse filters have run, or an empty
 *         parse result when the base URL is malformed or building the DOM
 *         throws
 */
@Override
public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException badBase) {
        return new ParseStatus(badBase).getEmptyParseResult(content.getUrl(), getConf());
    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // build the DOM using the guessed character encoding
    DocumentFragment root;
    try {
        byte[] rawBytes = content.getContent();
        InputSource input = new InputSource(new ByteArrayInputStream(rawBytes));

        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(content, true);
        detector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
        String encoding = detector.guessEncoding(content, defaultCharEncoding);

        // record the encoding in the parse metadata and hand it to the parser
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
        input.setEncoding(encoding);

        LOG.trace("Parsing...");
        root = parse(input);
    } catch (IOException | DOMException | SAXException parseFailure) {
        return new ParseStatus(parseFailure).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception unexpected) {
        // anything else is additionally logged before giving up
        LOG.error("Error: ", unexpected);
        return new ParseStatus(unexpected).getEmptyParseResult(content.getUrl(), getConf());
    }

    // collect the meta directives and copy the general tags into the Nutch metadata
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    metadata.addAll(metaTags.getGeneralTags());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // text and title are only extracted when indexing is allowed
    boolean mayIndex = !metaTags.getNoIndex();
    if (mayIndex) {
        LOG.trace("Getting text...");
        StringBuffer textBuffer = new StringBuffer();
        utils.getText(textBuffer, root);
        text = textBuffer.toString();

        LOG.trace("Getting title...");
        StringBuffer titleBuffer = new StringBuffer();
        utils.getTitle(titleBuffer, root);
        title = titleBuffer.toString().trim();
    }

    // outlinks are only extracted when following links is allowed
    boolean mayFollow = !metaTags.getNoFollow();
    if (mayFollow) {
        ArrayList<Outlink> links = new ArrayList<Outlink>();

        // resolve links against <base href> when present and valid, else the document base
        URL linkBase = base;
        String baseHref = utils.getBase(root);
        if (baseHref != null) {
            try {
                linkBase = new URL(base, baseHref);
            } catch (MalformedURLException ignored) {
                // unusable base href: linkBase stays at the document base
            }
        }

        LOG.trace("Getting links...");
        utils.getOutlinks(linkBase, links, root);
        outlinks = links.toArray(new Outlink[0]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        // a meta refresh is reported as a redirect with target and delay as arguments
        String refreshTarget = metaTags.getRefreshHref().toString();
        String refreshDelay = Integer.toString(metaTags.getRefreshTime());
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { refreshTarget, refreshDelay });
    }

    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
    ParseImpl parse = new ParseImpl(text, parseData);
    ParseResult unfiltered = ParseResult.createParseResult(content.getUrl(), parse);

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, unfiltered, metaTags, root);

    if (metaTags.getNoCache()) {
        // caching is forbidden: mark every parse in the result with the caching policy
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
            entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
        }
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) InputSource(org.xml.sax.InputSource) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) SAXException(org.xml.sax.SAXException) ParseStatus(org.apache.nutch.parse.ParseStatus) DOMException(org.w3c.dom.DOMException) EncodingDetector(org.apache.nutch.util.EncodingDetector) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) IOException(java.io.IOException) DOMException(org.w3c.dom.DOMException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map)

Example 2 with HTMLMetaTags

use of org.apache.nutch.parse.HTMLMetaTags in project nutch by apache.

the class TestRobotsMetaProcessor method testRobotsMetaProcessor.

/**
 * Parses every HTML snippet in {@code tests} with {@link TikaParser} and checks
 * that the meta tags extracted from the resulting DOM match the expected
 * noindex / nofollow / nocache flags in {@code answers} and the expected base
 * href in {@code currURLsAndAnswers}. For snippets containing a meta-refresh
 * redirect it also checks the refresh target in the parse status (NUTCH-2589).
 */
@Test
public void testRobotsMetaProcessor() {
    Configuration conf = NutchConfiguration.create();
    TikaParser parser = new TikaParser();
    parser.setConf(conf);
    try {
        currURLsAndAnswers = new URL[][] { { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org/foo/"), new URL("http://www.nutch.org/") }, { new URL("http://www.nutch.org"), new URL("http://www.nutch.org/base/") }, { new URL("http://www.nutch.org"), null } };
    } catch (Exception e) {
        // fail with the same message as before, but keep the cause in the report
        throw new AssertionError("couldn't make test URLs!", e);
    }
    for (int i = 0; i < tests.length; i++) {
        // explicit charset so the bytes do not depend on the platform default
        byte[] bytes = tests[i].getBytes(java.nio.charset.StandardCharsets.UTF_8);
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment root = doc.createDocumentFragment();
        String url = "http://www.nutch.org";
        Content content = new Content(url, url, bytes, "text/html", new Metadata(), conf);
        Parse parse;
        try {
            parse = parser.getParse(content, doc, root).get(url);
        } catch (Exception e) {
            // a parser failure must fail the test instead of letting the
            // assertions below run against an empty DOM
            throw new AssertionError("parsing failed on test " + i, e);
        }
        HTMLMetaTags robotsMeta = new HTMLMetaTags();
        HTMLMetaProcessor.getMetaTags(robotsMeta, root, currURLsAndAnswers[i][0]);
        Assert.assertEquals("got noindex wrong on test " + i, answers[i][0], robotsMeta.getNoIndex());
        Assert.assertEquals("got nofollow wrong on test " + i, answers[i][1], robotsMeta.getNoFollow());
        Assert.assertEquals("got nocache wrong on test " + i, answers[i][2], robotsMeta.getNoCache());
        Assert.assertTrue("got base href wrong on test " + i + " (got " + robotsMeta.getBaseHref() + ")", ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) || ((robotsMeta.getBaseHref() != null) && robotsMeta.getBaseHref().equals(currURLsAndAnswers[i][1])));
        if (tests[i].contains("meta-refresh redirect")) {
            // test for NUTCH-2589
            URL metaRefreshUrl = robotsMeta.getRefreshHref();
            Assert.assertNotNull("failed to get meta-refresh redirect", metaRefreshUrl);
            Assert.assertEquals("failed to get meta-refresh redirect", "http://example.com/", metaRefreshUrl.toString());
            // report a missing parse as an assertion failure rather than an NPE
            Assert.assertNotNull("no parse returned for " + url + " on test " + i, parse);
            Assert.assertEquals("failed to add meta-refresh redirect to parse status", "http://example.com/", parse.getData().getStatus().getArgs()[0]);
        }
    }
}
Also used : HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) URL(java.net.URL) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) Content(org.apache.nutch.protocol.Content) DocumentFragment(org.w3c.dom.DocumentFragment) Test(org.junit.Test)

Example 3 with HTMLMetaTags

use of org.apache.nutch.parse.HTMLMetaTags in project nutch by apache.

the class TestRobotsMetaProcessor method testRobotsMetaProcessor.

/**
 * Parses every HTML snippet in {@code tests} into a DOM fragment and checks
 * that the extracted meta tags match the expected noindex / nofollow / nocache
 * flags in {@code answers} and the expected base href in
 * {@code currURLsAndAnswers}.
 */
@Test
public void testRobotsMetaProcessor() {
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
        currURLsAndAnswers = new URL[][] { { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org/foo/"), new URL("http://www.nutch.org/") }, { new URL("http://www.nutch.org"), new URL("http://www.nutch.org/base/") } };
    } catch (Exception e) {
        // fail with the same message as before, but keep the cause in the report
        throw new AssertionError("couldn't make test URLs!", e);
    }
    for (int i = 0; i < tests.length; i++) {
        // explicit charset so the bytes do not depend on the platform default
        byte[] bytes = tests[i].getBytes(java.nio.charset.StandardCharsets.UTF_8);
        DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
        try {
            parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
        } catch (Exception e) {
            // a parser failure must fail the test; otherwise the checks below
            // would run against an empty fragment and could pass vacuously
            throw new AssertionError("parsing failed on test " + i, e);
        }
        HTMLMetaTags robotsMeta = new HTMLMetaTags();
        HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
        // assertEquals reports expected and actual values on failure
        Assert.assertEquals("got index wrong on test " + i, answers[i][0], robotsMeta.getNoIndex());
        Assert.assertEquals("got follow wrong on test " + i, answers[i][1], robotsMeta.getNoFollow());
        Assert.assertEquals("got cache wrong on test " + i, answers[i][2], robotsMeta.getNoCache());
        Assert.assertTrue("got base href wrong on test " + i + " (got " + robotsMeta.getBaseHref() + ")", ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) || ((robotsMeta.getBaseHref() != null) && robotsMeta.getBaseHref().equals(currURLsAndAnswers[i][1])));
    }
}
Also used : HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) ByteArrayInputStream(java.io.ByteArrayInputStream) DOMFragmentParser(org.cyberneko.html.parsers.DOMFragmentParser) URL(java.net.URL) Test(org.junit.Test)

Example 4 with HTMLMetaTags

use of org.apache.nutch.parse.HTMLMetaTags in project nutch by apache.

the class TikaParser method getParse.

/**
 * Parses {@code content} with the Tika parser registered for its content type,
 * building a DOM under {@code root}, and derives text, title, outlinks and
 * metadata from that DOM and from the Tika metadata.
 *
 * @param content fetched content; its base URL, URL, content type, raw bytes
 *                and metadata are read
 * @param doc     document handed to the {@code DOMBuilder} that receives
 *                Tika's SAX events
 * @param root    fragment the DOM is built into; it is afterwards used for
 *                meta tag, text and title extraction and passed to the parse
 *                filters
 * @return the parse result after the HTML parse filters have run, or an empty
 *         parse result when the base URL is malformed, no Tika parser is
 *         found for the content type, or the Tika parser throws
 */
ParseResult getParse(Content content, HTMLDocumentImpl doc, DocumentFragment root) {
    String mimeType = content.getContentType();
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    // get the right parser using the mime type as a clue
    CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
    Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        LOG.error(message);
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
    }
    LOG.debug("Using Tika parser {} for mime-type {}.", parser.getClass().getName(), mimeType);
    byte[] raw = content.getContent();
    Metadata tikamd = new Metadata();
    ContentHandler domHandler;
    // Check whether to use Tika's BoilerplateContentHandler
    if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) {
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
        bpHandler.setIncludeMarkup(true);
        domHandler = (ContentHandler) bpHandler;
    } else {
        // plain DOM building; note that element-name case and default
        // namespace are only configured on this branch
        DOMBuilder domBuilder = new DOMBuilder(doc, root);
        domBuilder.setUpperCaseElementNames(upperCaseElementNames);
        domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        domHandler = (ContentHandler) domBuilder;
    }
    LinkContentHandler linkContentHandler = new LinkContentHandler();
    ParseContext context = new ParseContext();
    if (parseEmbedded) {
        context.set(Parser.class, new AutoDetectParser(tikaConfig));
    }
    // the same SAX events go to both the DOM handler and the link collector
    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
    if (HTMLMapper != null)
        context.set(HtmlMapper.class, HTMLMapper);
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
        parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
    } catch (Exception e) {
        LOG.error("Error parsing " + content.getUrl(), e);
        // only the exception message is carried into the returned status
        return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
    }
    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        // reuse the same buffer for the title
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        // extract title
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    }
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<Outlink>();
        URL baseTag = base;
        // the link base is taken from the "Content-Location" entry of the
        // Tika metadata when present, resolved against the document base
        String baseTagHref = tikamd.get("Content-Location");
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                // baseTag is left at the document base
                LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links (base URL = {}) ...", baseTag);
        }
        // pre-1233 outlink extraction
        // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        // Get outlinks from Tika
        List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
        utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }
    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        // the title entry is not copied into the Nutch metadata
        // NOTE(review): this relies on TikaCoreProperties.TITLE.toString()
        // yielding the metadata key name - confirm against the Tika version in use
        if (tikaMDName.equalsIgnoreCase(TikaCoreProperties.TITLE.toString()))
            continue;
        String[] values = tikamd.getValues(tikaMDName);
        for (String v : values) {
            nutchMetadata.add(tikaMDName, v);
            // when the name matches Nutch.ROBOTS_METATAG only case-insensitively,
            // nothing is stored under Nutch.ROBOTS_METATAG yet, so the value is
            // added under that key as well (at most once, because the lookup
            // is non-null afterwards)
            if (tikaMDName.equalsIgnoreCase(Nutch.ROBOTS_METATAG) && nutchMetadata.get(Nutch.ROBOTS_METATAG) == null) {
                // NUTCH-2720 force lowercase robots directive
                nutchMetadata.add(Nutch.ROBOTS_METATAG, v);
            }
        }
    }
    // no outlinks so far: fall back to extracting them from the plain text
    // NOTE(review): outlinks is also still empty when nofollow was set, so
    // this fallback runs in that case too - confirm this is intended
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        // a meta refresh is reported as a redirect with target URL and delay as arguments
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) BoilerpipeContentHandler(org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ParseStatus(org.apache.nutch.parse.ParseStatus) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) CompositeParser(org.apache.tika.parser.CompositeParser) MalformedURLException(java.net.MalformedURLException) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) HtmlMapper(org.apache.tika.parser.html.HtmlMapper) ParseContext(org.apache.tika.parser.ParseContext) ParseImpl(org.apache.nutch.parse.ParseImpl) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Map(java.util.Map) BoilerpipeContentHandler(org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler) Link(org.apache.tika.sax.Link)

Example 5 with HTMLMetaTags

use of org.apache.nutch.parse.HTMLMetaTags in project nutch by apache.

the class TestRobotsMetaProcessor method testRobotsMetaProcessor.

/**
 * Parses every HTML snippet in {@code tests} into a DOM fragment and checks
 * that the extracted meta tags match the expected noindex / nofollow / nocache
 * flags in {@code answers} and the expected base href in
 * {@code currURLsAndAnswers}.
 */
@Test
public void testRobotsMetaProcessor() {
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
        currURLsAndAnswers = new URL[][] { { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org"), null }, { new URL("http://www.nutch.org/foo/"), new URL("http://www.nutch.org/") }, { new URL("http://www.nutch.org"), new URL("http://www.nutch.org/base/") } };
    } catch (Exception e) {
        // fail with the same message as before, but keep the cause in the report
        throw new AssertionError("couldn't make test URLs!", e);
    }
    for (int i = 0; i < tests.length; i++) {
        // explicit charset so the bytes do not depend on the platform default
        byte[] bytes = tests[i].getBytes(java.nio.charset.StandardCharsets.UTF_8);
        DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
        try {
            parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
        } catch (Exception e) {
            // a parser failure must fail the test; otherwise the checks below
            // would run against an empty fragment and could pass vacuously
            throw new AssertionError("parsing failed on test " + i, e);
        }
        HTMLMetaTags robotsMeta = new HTMLMetaTags();
        HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
        // assertEquals reports expected and actual values on failure
        Assert.assertEquals("got index wrong on test " + i, answers[i][0], robotsMeta.getNoIndex());
        Assert.assertEquals("got follow wrong on test " + i, answers[i][1], robotsMeta.getNoFollow());
        Assert.assertEquals("got cache wrong on test " + i, answers[i][2], robotsMeta.getNoCache());
        Assert.assertTrue("got base href wrong on test " + i + " (got " + robotsMeta.getBaseHref() + ")", ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null)) || ((robotsMeta.getBaseHref() != null) && robotsMeta.getBaseHref().equals(currURLsAndAnswers[i][1])));
    }
}
Also used : HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) ByteArrayInputStream(java.io.ByteArrayInputStream) URL(java.net.URL) Test(org.junit.Test)

Aggregations

URL (java.net.URL)5 HTMLMetaTags (org.apache.nutch.parse.HTMLMetaTags)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 Parse (org.apache.nutch.parse.Parse)3 Test (org.junit.Test)3 MalformedURLException (java.net.MalformedURLException)2 ArrayList (java.util.ArrayList)2 Map (java.util.Map)2 Metadata (org.apache.nutch.metadata.Metadata)2 Outlink (org.apache.nutch.parse.Outlink)2 ParseData (org.apache.nutch.parse.ParseData)2 ParseImpl (org.apache.nutch.parse.ParseImpl)2 ParseResult (org.apache.nutch.parse.ParseResult)2 ParseStatus (org.apache.nutch.parse.ParseStatus)2 DocumentFragment (org.w3c.dom.DocumentFragment)2 IOException (java.io.IOException)1 Configuration (org.apache.hadoop.conf.Configuration)1 HTMLDocumentImpl (org.apache.html.dom.HTMLDocumentImpl)1 Content (org.apache.nutch.protocol.Content)1 EncodingDetector (org.apache.nutch.util.EncodingDetector)1