Search in sources :

Example 26 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class TestLinksIndexingFilter method testIndexOnlyHostPart.

/**
 * With the inlinks-host, outlinks-host and only-hosts options all set to
 * "true", the filter is expected to index just the host part of link URLs
 * and to collapse inlinks from the same host into a single value.
 */
@Test
public void testIndexOnlyHostPart() throws Exception {
    // Enable every host-related switch before handing the configuration over.
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    filter.setConf(conf);

    final String expectedHost = new URL("http://www.test.com").getHost();

    Outlink[] outlinks = generateOutlinks(true);

    // Three inlinks: two from www.test.com, one from www.example.com.
    String[][] inlinkSpecs = {
        { "http://www.test.com/one-awesome-page", "test" },
        { "http://www.test.com/other-awesome-page", "test" },
        { "http://www.example.com/my-first-awesome-example", "example" }
    };
    Inlinks inlinks = new Inlinks();
    for (String[] spec : inlinkSpecs) {
        inlinks.add(new Inlink(spec[0], spec[1]));
    }

    ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks, metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text url = new Text("http://www.example.com/");
    NutchDocument doc = filter.filter(new NutchDocument(), parse, url, new CrawlDatum(), inlinks);

    NutchField outlinksField = doc.getField("outlinks");
    Assert.assertEquals("Only the host portion of the outlink URL must be indexed", expectedHost, outlinksField.getValues().get(0));

    NutchField inlinksField = doc.getField("inlinks");
    Assert.assertEquals("The inlinks coming from the same host must count only once", 1, inlinksField.getValues().size());
    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", expectedHost, doc.getFieldValue("inlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchField(org.apache.nutch.indexer.NutchField) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 27 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class MimeTypeIndexingFilter method main.

/**
 * Command-line entry point for trying out a rules file.
 *
 * <p>
 * After parsing the options, a filter is configured with the rules file and
 * standard input is read line by line; each line is used as the content type
 * of a synthetic document. The line is echoed prefixed with "+ " when the
 * filter returns a document and with "- " when it returns <code>null</code>.
 * Reading stops at end of input or at the first empty line.
 *
 * @param args
 *          command-line arguments; the <code>rules</code> option (taking a
 *          file argument) is mandatory, <code>h</code>/<code>help</code>
 *          prints usage
 * @throws IOException
 *           if reading from standard input fails
 * @throws IndexingException
 *           declared for the call to the filter
 */
public static void main(String[] args) throws IOException, IndexingException {
    // Name shown in the usage text.
    final String toolName = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";

    Option helpOpt = new Option("h", "help", false, "show this help message");
    Option rulesOpt = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
    Options options = new Options();
    options.addOption(helpOpt).addOption(rulesOpt);

    HelpFormatter usage = new HelpFormatter();
    CommandLineParser parser = new GnuParser();
    String rulesFile;
    try {
        CommandLine cmd = parser.parse(options, args);
        boolean wantsHelp = cmd.hasOption("help");
        if (wantsHelp || !cmd.hasOption("rules")) {
            usage.printHelp(toolName, options, true);
            return;
        }
        rulesFile = cmd.getOptionValue("rules");
    } catch (UnrecognizedOptionException e) {
        // Unknown switch: show usage instead of a stack trace.
        usage.printHelp(toolName, options, true);
        return;
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
        return;
    }

    Configuration conf = NutchConfiguration.create();
    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
    filter.setConf(conf);

    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
    for (String mimeType = stdin.readLine(); mimeType != null && !mimeType.isEmpty(); mimeType = stdin.readLine()) {
        // Build a minimal parse whose only meaningful datum is the content type.
        Metadata metadata = new Metadata();
        metadata.set(Response.CONTENT_TYPE, mimeType);
        ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
        ParseImpl parse = new ParseImpl("text", parseData);

        NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

        String marker = (doc != null) ? "+ " : "- ";
        System.out.print(marker);
        System.out.println(mimeType);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) InputStreamReader(java.io.InputStreamReader) NutchDocument(org.apache.nutch.indexer.NutchDocument) GnuParser(org.apache.commons.cli.GnuParser) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) IOException(java.io.IOException) IndexingException(org.apache.nutch.indexer.IndexingException) HelpFormatter(org.apache.commons.cli.HelpFormatter) ParseStatus(org.apache.nutch.parse.ParseStatus) CommandLine(org.apache.commons.cli.CommandLine) ParseData(org.apache.nutch.parse.ParseData) BufferedReader(java.io.BufferedReader) ParseImpl(org.apache.nutch.parse.ParseImpl) Option(org.apache.commons.cli.Option) CommandLineParser(org.apache.commons.cli.CommandLineParser)

Example 28 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class DOMContentUtils method getOutlinks.

/**
 * Collects outlinks from the DOM subtree rooted at <code>node</code> and
 * appends them to <code>outlinks</code>.
 *
 * <p>
 * Only elements whose lower-cased name has an entry in
 * <code>linkParams</code>, and which <code>shouldThrowAwayLink</code> does
 * not reject, are considered. The anchor text comes from
 * <code>getText</code>; when that is blank, the text nodes and
 * <code>img</code> <code>alt</code> attributes below the element are used
 * instead. The link target is read from the attribute named by the matching
 * {@link LinkParams}, resolved against <code>base</code>. Elements carrying
 * <code>rel="nofollow"</code> or <code>method="post"</code> yield no outlink,
 * and targets that cannot be resolved to a URL are silently skipped.
 *
 * @param base
 *          URL against which link targets are resolved
 * @param outlinks
 *          list receiving the created {@link Outlink} records
 * @param node
 *          root of the DOM subtree to scan
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
        Node element = walker.nextNode();
        String tagName = element.getNodeName();
        short type = element.getNodeType();
        NodeList kids = element.getChildNodes();
        int kidCount = (kids == null) ? 0 : kids.getLength();

        if (type != Node.ELEMENT_NODE) {
            continue;
        }
        tagName = tagName.toLowerCase();
        LinkParams params = (LinkParams) linkParams.get(tagName);
        if (params == null) {
            // not a link-bearing element
            continue;
        }

        if (!shouldThrowAwayLink(element, kids, kidCount, params)) {
            StringBuffer anchor = new StringBuffer();
            getText(anchor, element, true);

            if (anchor.toString().trim().length() == 0) {
                // No usable text: fall back to nested text nodes and img alt values.
                NodeWalker inner = new NodeWalker(element);
                while (inner.hasNext()) {
                    Node sub = inner.nextNode();
                    short subType = sub.getNodeType();
                    String piece = null;
                    if (subType == Node.ELEMENT_NODE) {
                        // only img elements contribute; other elements are ignored
                        if (sub.getNodeName().toLowerCase().equals("img")) {
                            Node alt = sub.getAttributes().getNamedItem("alt");
                            if (alt != null) {
                                String altTxt = alt.getTextContent();
                                if (altTxt != null && altTxt.trim().length() > 0) {
                                    piece = altTxt;
                                }
                            }
                        }
                    } else if (subType == Node.TEXT_NODE) {
                        String txt = sub.getTextContent();
                        if (txt != null && txt.length() > 0) {
                            piece = txt;
                        }
                    }
                    if (piece != null) {
                        // separate successive pieces with a single space
                        if (anchor.length() > 0) {
                            anchor.append(' ');
                        }
                        anchor.append(piece);
                    }
                }
            }

            // Scan the attributes for the link target and for disqualifying markers.
            NamedNodeMap attrs = element.getAttributes();
            String target = null;
            boolean noFollow = false;
            boolean post = false;
            int attrCount = attrs.getLength();
            for (int idx = 0; idx < attrCount; idx++) {
                Node attr = attrs.item(idx);
                String attrName = attr.getNodeName();
                if (params.attrName.equalsIgnoreCase(attrName)) {
                    target = attr.getNodeValue();
                } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
                    noFollow = true;
                } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
                    post = true;
                }
            }

            boolean usable = target != null && !noFollow && !post;
            if (usable) {
                try {
                    URL resolved = URLUtil.resolveURL(base, target);
                    Outlink link = new Outlink(resolved.toString(), anchor.toString().trim());
                    outlinks.add(link);
                    // optionally record the source tag name as outlink metadata
                    if (keepNodenames) {
                        MapWritable linkMeta = new MapWritable();
                        linkMeta.put(new Text(srcTagMetaName), new Text(tagName));
                        link.setMetadata(linkMeta);
                    }
                } catch (MalformedURLException e) {
                    // unresolvable target: no outlink is produced
                }
            }
        }

        // this should not have any children, skip them
        // (as the last statement of the loop body this continue has no effect)
        if (params.childLen == 0) {
            continue;
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) NamedNodeMap(org.w3c.dom.NamedNodeMap) Node(org.w3c.dom.Node) NodeList(org.w3c.dom.NodeList) Text(org.apache.hadoop.io.Text) MapWritable(org.apache.hadoop.io.MapWritable) NodeWalker(org.apache.nutch.util.NodeWalker) URL(java.net.URL)

Example 29 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class JSParseFilter method walk.

/**
 * Recursively scans the DOM below <code>n</code> for JavaScript and adds any
 * links that <code>getJSLinks</code> extracts to <code>outlinks</code>.
 *
 * <p>
 * For a <code>script</code> element with children, the child node values are
 * joined with newlines and scanned as one text; the element's children are
 * then not visited. For any other element, attributes whose name starts with
 * "on" and <code>href</code> attributes containing "javascript:" are scanned.
 * <code>parse</code> and <code>metaTags</code> are only passed on to the
 * recursive calls.
 */
private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
    if (n instanceof Element) {
        if (n.getNodeName().equalsIgnoreCase("script")) {
            NodeList parts = n.getChildNodes();
            int partCount = parts.getLength();
            if (partCount > 0) {
                // Concatenate the script's child node values, one per line.
                StringBuilder script = new StringBuilder();
                for (int p = 0; p < partCount; p++) {
                    if (p > 0) {
                        script.append('\n');
                    }
                    script.append(parts.item(p).getNodeValue());
                }
                Outlink[] found = getJSLinks(script.toString(), "", base);
                if (found != null && found.length > 0) {
                    outlinks.addAll(Arrays.asList(found));
                }
                // nothing else of interest under a script element
                return;
            }
        } else {
            // Inspect event-handler attributes (onload, onclick, ...) and
            // javascript: hrefs.
            NamedNodeMap attrs = n.getAttributes();
            int attrCount = attrs.getLength();
            for (int a = 0; a < attrCount; a++) {
                Node attr = attrs.item(a);
                String attrName = attr.getNodeName();
                Outlink[] found = null;
                if (attrName.startsWith("on")) {
                    found = getJSLinks(attr.getNodeValue(), "", base);
                } else if (attrName.equalsIgnoreCase("href")) {
                    String href = attr.getNodeValue();
                    if (href != null && href.toLowerCase().contains("javascript:")) {
                        found = getJSLinks(href, "", base);
                    }
                }
                if (found != null && found.length > 0) {
                    outlinks.addAll(Arrays.asList(found));
                }
            }
        }
    }
    // Descend into the children (reached for non-elements, non-script
    // elements, and script elements without children).
    NodeList kids = n.getChildNodes();
    for (int k = 0; k < kids.getLength(); k++) {
        walk(kids.item(k), parse, metaTags, base, outlinks);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NamedNodeMap(org.w3c.dom.NamedNodeMap) Element(org.w3c.dom.Element) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node)

Example 30 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class JSParseFilter method getJSLinks.

// Alternative pattern, which limits valid url characters.
// private static final String URI_PATTERN =
// "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
/**
 * Extracts URLs from string literals embedded in JavaScript text.
 *
 * <p>
 * Each match of <code>STRING_PATTERN</code> in <code>plainText</code>
 * supplies a candidate (group 2) which must also match
 * <code>URI_PATTERN</code> in full. Candidates starting with "www." get an
 * "http://" prefix; all others are resolved against <code>base</code>, and
 * those that cannot be resolved are skipped. Occurrences of "&amp;amp;" in
 * the resulting URL are replaced by "&amp;". Any exception during extraction
 * is logged and ends the scan, keeping the links found so far.
 *
 * @param plainText
 *          JavaScript source to scan
 * @param anchor
 *          anchor text given to every created outlink
 * @param base
 *          base URL for resolving relative candidates; if it cannot be
 *          parsed the error is logged and resolution proceeds with a
 *          <code>null</code> base
 * @return the extracted outlinks, never <code>null</code> (possibly empty)
 */
private Outlink[] getJSLinks(String plainText, String anchor, String base) {
    final List<Outlink> found = new ArrayList<Outlink>();

    URL baseURL = null;
    try {
        baseURL = new URL(base);
    } catch (Exception e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("getJSLinks", e);
        }
    }

    try {
        final int flags = Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK;
        final PatternCompiler compiler = new Perl5Compiler();
        final Pattern literalPattern = compiler.compile(STRING_PATTERN, flags);
        final Pattern uriPattern = compiler.compile(URI_PATTERN, flags);
        final PatternMatcher literalMatcher = new Perl5Matcher();
        final PatternMatcher uriMatcher = new Perl5Matcher();
        final PatternMatcherInput input = new PatternMatcherInput(plainText);

        // Visit every string literal found in the script text.
        while (literalMatcher.contains(input, literalPattern)) {
            MatchResult match = literalMatcher.getMatch();
            String candidate = match.group(2);

            // The literal must look like a URI as a whole, otherwise move on.
            if (!uriMatcher.matches(new PatternMatcherInput(candidate), uriPattern)) {
                continue;
            }

            String link;
            if (candidate.startsWith("www.")) {
                link = "http://" + candidate;
            } else {
                // Resolve against the base; on failure go on to the next match.
                try {
                    link = new URL(baseURL, candidate).toString();
                } catch (MalformedURLException ex) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(" - failed URL parse '" + candidate + "' and baseURL '" + baseURL + "'", ex);
                    }
                    continue;
                }
            }

            link = link.replace("&amp;", "&");
            if (LOG.isTraceEnabled()) {
                LOG.trace(" - outlink from JS: '" + link + "'");
            }
            found.add(new Outlink(link, anchor));
        }
    } catch (Exception ex) {
        // A failure here only ends the extraction; links gathered so far are
        // still returned.
        if (LOG.isErrorEnabled()) {
            LOG.error("getJSLinks", ex);
        }
    }

    // An empty list yields an empty array.
    return found.toArray(new Outlink[0]);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) Perl5Compiler(org.apache.oro.text.regex.Perl5Compiler) Pattern(org.apache.oro.text.regex.Pattern) PatternCompiler(org.apache.oro.text.regex.PatternCompiler) MalformedURLException(java.net.MalformedURLException) ArrayList(java.util.ArrayList) Perl5Matcher(org.apache.oro.text.regex.Perl5Matcher) MatchResult(org.apache.oro.text.regex.MatchResult) URL(java.net.URL) MalformedURLException(java.net.MalformedURLException) PatternMatcherInput(org.apache.oro.text.regex.PatternMatcherInput) PatternMatcher(org.apache.oro.text.regex.PatternMatcher)

Aggregations

Outlink (org.apache.nutch.parse.Outlink)37 ParseData (org.apache.nutch.parse.ParseData)22 ParseImpl (org.apache.nutch.parse.ParseImpl)17 ParseStatus (org.apache.nutch.parse.ParseStatus)16 URL (java.net.URL)13 Text (org.apache.hadoop.io.Text)13 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)11 Test (org.junit.Test)11 Parse (org.apache.nutch.parse.Parse)10 MalformedURLException (java.net.MalformedURLException)9 Inlinks (org.apache.nutch.crawl.Inlinks)9 NutchDocument (org.apache.nutch.indexer.NutchDocument)9 Metadata (org.apache.nutch.metadata.Metadata)9 ArrayList (java.util.ArrayList)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 Configuration (org.apache.hadoop.conf.Configuration)6 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)6 IOException (java.io.IOException)5 ParseText (org.apache.nutch.parse.ParseText)4 Map (java.util.Map)3