Search in sources :

Example 26 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class TestLinksIndexingFilter method testIndexOnlyHostPart.

// Test: with the inlinks-host, outlinks-host and only-hosts options all set to "true",
// the filter is expected to index host names only and to collapse inlinks from the same
// host into a single value (as stated by the assertion messages below).
// NOTE(review): no closing brace for this method is visible in this listing and every URL
// string literal is empty; the original literals appear to have been lost. Confirm against
// the upstream test before relying on this text.
public void testIndexOnlyHostPart() throws Exception {
    // Switch on all three host-related options; conf is declared outside this method.
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    // Outlink fixtures come from a helper that is not part of this listing.
    Outlink[] outlinks = generateOutlinks(true);
    // Three inlinks: two with anchor "test" and one with anchor "example".
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("", "test"));
    inlinks.add(new Inlink("", "test"));
    inlinks.add(new Inlink("", "example"));
    // Run the filter on an empty document whose parse data carries the outlinks above;
    // filter and metadata are declared outside this method.
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", outlinks, metadata)), new Text(""), new CrawlDatum(), inlinks);
    NutchField docOutlinks = doc.getField("outlinks");
    // The first indexed outlink value must equal the host of the expected URL.
    // NOTE(review): if URL is java.net.URL, new URL("") throws MalformedURLException,
    // so the expected URL literal needs restoring here and in the last assertion.
    Assert.assertEquals("Only the host portion of the outlink URL must be indexed", new URL("").getHost(), docOutlinks.getValues().get(0));
    // Exactly one "inlinks" value is expected.
    Assert.assertEquals("The inlinks coming from the same host must count only once", 1, doc.getField("inlinks").getValues().size());
    // That single "inlinks" value must be a bare host as well.
    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", new URL("").getHost(), doc.getFieldValue("inlinks"));
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchField(org.apache.nutch.indexer.NutchField) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 27 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class MimeTypeIndexingFilter method main.

/**
 * Main method for invoking this tool
 *
 * @throws IOException
 * @throws IndexingException
 */
// Command-line entry point: parses the arguments, points a MimeTypeIndexingFilter at the
// given rules file, then reads lines from an input stream and, for each line used as the
// content type, prints "+ " when the filter returns a document and "- " when it returns null.
// NOTE(review): this listing is incomplete. Several closing braces, the bodies of the catch
// clauses and the argument of InputStreamReader are missing; restore them from the upstream
// source before compiling.
public static void main(String[] args) throws IOException, IndexingException {
    // Command-line options: an optional help flag and a required "rules" option taking a file name.
    Option helpOpt = new Option("h", "help", false, "show this help message");
    Option rulesOpt = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
    // NOTE(review): neither helpOpt nor rulesOpt is added to options in the visible lines,
    // so as shown the parser receives an empty option set. Presumably an addOption call was lost.
    Options options = new Options();
    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    String rulesFile;
    try {
        CommandLine line = parser.parse(options, args);
        // Print usage when help is requested or the rules option is absent.
        if (line.hasOption("help") || !line.hasOption("rules")) {
            formatter.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options, true);
        rulesFile = line.getOptionValue("rules");
    } catch (UnrecognizedOptionException e) {
        // An unknown option also results in the usage text being printed.
        formatter.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options, true);
    } catch (Exception e) {
    // Create the filter and a configuration that names the rules file.
    // NOTE(review): no call handing conf to the filter is visible here; confirm upstream.
    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
    Configuration conf = NutchConfiguration.create();
    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
    // NOTE(review): the stream passed to InputStreamReader is missing from this listing.
    BufferedReader in = new BufferedReader(new InputStreamReader(;
    String line;
    // Process one line at a time; stop at end of input or at the first empty line.
    while ((line = in.readLine()) != null && !line.isEmpty()) {
        // The line is stored as the content-type value of otherwise empty metadata.
        Metadata metadata = new Metadata();
        metadata.set(Response.CONTENT_TYPE, line);
        // Minimal parse result: fixed text and title, no outlinks, the metadata built above.
        ParseImpl parse = new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
        NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text(""), new CrawlDatum(), new Inlinks());
        // Non-null result prints the "+ " prefix, null prints "- ".
        if (doc != null) {
            System.out.print("+ ");
        } else {
            System.out.print("- ");
Also used : Outlink(org.apache.nutch.parse.Outlink) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) InputStreamReader(java.io.InputStreamReader) NutchDocument(org.apache.nutch.indexer.NutchDocument) GnuParser(org.apache.commons.cli.GnuParser) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) IOException(java.io.IOException) IndexingException(org.apache.nutch.indexer.IndexingException) HelpFormatter(org.apache.commons.cli.HelpFormatter) ParseStatus(org.apache.nutch.parse.ParseStatus) CommandLine(org.apache.commons.cli.CommandLine) ParseData(org.apache.nutch.parse.ParseData) BufferedReader(java.io.BufferedReader) ParseImpl(org.apache.nutch.parse.ParseImpl) Option(org.apache.commons.cli.Option) CommandLineParser(org.apache.commons.cli.CommandLineParser)

Example 28 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class DOMContentUtils method getOutlinks.

/**
 * This method finds all anchors below the supplied DOM <code>node</code>, and
 * creates appropriate {@link Outlink} records for each (relative to the
 * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
 * {@link ArrayList}.
 * <p>
 * Links without inner structure (tags, text, etc) are discarded, as are links
 * which contain only single nested links and empty text nodes (this is a
 * common DOM-fixup artifact, at least with nekohtml).
 */
// Walks every node under the given node; for element nodes whose lower-cased name has an
// entry in linkParams, gathers link text and the target attribute and builds an Outlink
// resolved against base.
// NOTE(review): this listing is incomplete. Most closing braces, the statements that append
// to linkText, and the statement that adds the new Outlink to outlinks are not visible;
// restore them from the upstream source before compiling.
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
        Node currentNode = walker.nextNode();
        String nodeName = currentNode.getNodeName();
        short nodeType = currentNode.getNodeType();
        NodeList children = currentNode.getChildNodes();
        // A null child list is treated as having zero children.
        int childLen = (children != null) ? children.getLength() : 0;
        if (nodeType == Node.ELEMENT_NODE) {
            // linkParams is looked up by lower-cased element name.
            nodeName = nodeName.toLowerCase();
            LinkParams params = (LinkParams) linkParams.get(nodeName);
            if (params != null) {
                // shouldThrowAwayLink is defined elsewhere; links it rejects are skipped.
                if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
                    StringBuffer linkText = new StringBuffer();
                    getText(linkText, currentNode, true);
                    // Fallback when no non-blank text was collected for the link.
                    if (linkText.toString().trim().length() == 0) {
                        // try harder - use img alt if present
                        NodeWalker subWalker = new NodeWalker(currentNode);
                        while (subWalker.hasNext()) {
                            Node subNode = subWalker.nextNode();
                            if (subNode.getNodeType() == Node.ELEMENT_NODE) {
                                if (subNode.getNodeName().toLowerCase().equals("img")) {
                                    NamedNodeMap subAttrs = subNode.getAttributes();
                                    Node alt = subAttrs.getNamedItem("alt");
                                    if (alt != null) {
                                        String altTxt = alt.getTextContent();
                                        // Only non-blank alt text is considered; a space separates it
                                        // from any text already collected.
                                        // NOTE(review): the append of altTxt itself is not visible here.
                                        if (altTxt != null && altTxt.trim().length() > 0) {
                                            if (linkText.length() > 0)
                                                linkText.append(' ');
                                } else {
                                // ignore other types of elements
                            } else if (subNode.getNodeType() == Node.TEXT_NODE) {
                                String txt = subNode.getTextContent();
                                // Non-empty text nodes are handled the same way as alt text.
                                // NOTE(review): the append of txt itself is not visible here.
                                if (txt != null && txt.length() > 0) {
                                    if (linkText.length() > 0)
                                        linkText.append(' ');
                    // Scan the element's attributes for the link target and for reasons to skip it.
                    NamedNodeMap attrs = currentNode.getAttributes();
                    String target = null;
                    boolean noFollow = false;
                    boolean post = false;
                    for (int i = 0; i < attrs.getLength(); i++) {
                        Node attr = attrs.item(i);
                        String attrName = attr.getNodeName();
                        // All comparisons below are case-insensitive.
                        if (params.attrName.equalsIgnoreCase(attrName)) {
                            // The attribute named by params.attrName carries the link target.
                            target = attr.getNodeValue();
                        } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
                            noFollow = true;
                        } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
                            post = true;
                    // An outlink is built only when a target exists and the element is neither
                    // rel="nofollow" nor method="post".
                    if (target != null && !noFollow && !post)
                        try {
                            // Resolve the target against base; the anchor is the trimmed link text.
                            URL url = URLUtil.resolveURL(base, target);
                            Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
                            // the outlink metadata
                            if (keepNodenames) {
                                // Records the source element name under the srcTagMetaName key.
                                // NOTE(review): attaching metadata to outlink is not visible here.
                                MapWritable metadata = new MapWritable();
                                metadata.put(new Text(srcTagMetaName), new Text(nodeName));
                        } catch (MalformedURLException e) {
                        // don't care
                // this should not have any children, skip them
                // NOTE(review): the statement guarded by this condition is not visible here.
                if (params.childLen == 0)
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) NamedNodeMap(org.w3c.dom.NamedNodeMap) Node(org.w3c.dom.Node) NodeList(org.w3c.dom.NodeList) Text(org.apache.hadoop.io.Text) MapWritable(org.apache.hadoop.io.MapWritable) NodeWalker(org.apache.nutch.util.NodeWalker) URL(java.net.URL)

Example 29 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class JSParseFilter method walk.

// Recursively visits n and its children. For <script> elements it passes script text to
// getJSLinks; for other elements it passes the values of "on*" attributes and of href
// attributes containing "javascript:" to getJSLinks.
// NOTE(review): this listing is incomplete. Closing braces, the statements that fill the
// script buffer and the statements that use a non-empty links array are not visible;
// restore them from the upstream source before compiling.
private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
    if (n instanceof Element) {
        String name = n.getNodeName();
        if (name.equalsIgnoreCase("script")) {
        // NOTE(review): the next three lines look like the interior of a block comment
        // (disabled language-attribute handling) whose opening and closing delimiters were lost.
         * String lang = null; Node lNode =
         * n.getAttributes().getNamedItem("language"); if (lNode == null) lang =
         * "javascript"; else lang = lNode.getNodeValue();
            StringBuffer script = new StringBuffer();
            NodeList nn = n.getChildNodes();
            if (nn.getLength() > 0) {
                // Iterates over the script element's children.
                // NOTE(review): the loop body that appends to script is not visible.
                for (int i = 0; i < nn.getLength(); i++) {
                    if (i > 0)
                // if (LOG.isInfoEnabled()) {
                //"script: language=" + lang + ", text: " +
                // script.toString());
                // }
                // Extract links from the collected script text, with an empty anchor.
                Outlink[] links = getJSLinks(script.toString(), "", base);
                // NOTE(review): the statement guarded by this condition is not visible.
                if (links != null && links.length > 0)
                // no other children of interest here, go one level up.
        } else {
            // process all HTML 4.0 events, if present...
            NamedNodeMap attrs = n.getAttributes();
            int len = attrs.getLength();
            for (int i = 0; i < len; i++) {
                // Window: onload,onunload
                // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
                // Keyboard: onkeydown,onkeypress,onkeyup
                // Mouse:
                // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
                Node anode = attrs.item(i);
                Outlink[] links = null;
                // The "on" prefix check is case-sensitive; the href name check is not.
                if (anode.getNodeName().startsWith("on")) {
                    links = getJSLinks(anode.getNodeValue(), "", base);
                } else if (anode.getNodeName().equalsIgnoreCase("href")) {
                    String val = anode.getNodeValue();
                    // Only href values containing "javascript:" (any case, any position) are scanned.
                    if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
                        links = getJSLinks(val, "", base);
                // NOTE(review): the statement guarded by this condition is not visible.
                if (links != null && links.length > 0)
    // Recurse into every child node with the same arguments.
    NodeList nl = n.getChildNodes();
    for (int i = 0; i < nl.getLength(); i++) {
        walk(nl.item(i), parse, metaTags, base, outlinks);
Also used : Outlink(org.apache.nutch.parse.Outlink) NamedNodeMap(org.w3c.dom.NamedNodeMap) Element(org.w3c.dom.Element) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node)

Example 30 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class JSParseFilter method getJSLinks.

// Alternative pattern, which limits valid url characters.
// private static final String URI_PATTERN =
// "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
/**
 * This method extracts URLs from literals embedded in JavaScript.
 */
// Scans plainText with STRING_PATTERN, checks each candidate against URI_PATTERN, turns
// accepted candidates into absolute URL strings and returns them as Outlinks that all carry
// the given anchor. Always returns an array (empty when nothing was collected), never null.
// NOTE(review): this listing is incomplete. Closing braces, the right-hand side of the
// "url =" assignment and the bodies of several branches are not visible; restore them from
// the upstream source before compiling.
private Outlink[] getJSLinks(String plainText, String anchor, String base) {
    final List<Outlink> outlinks = new ArrayList<Outlink>();
    URL baseURL = null;
    try {
        baseURL = new URL(base);
    } catch (Exception e) {
        // A base that cannot be parsed is logged at error level; baseURL then stays null.
        if (LOG.isErrorEnabled()) {
            LOG.error("getJSLinks", e);
    try {
        // Both patterns are compiled on every call with the same flags:
        // case-insensitive, read-only, multiline.
        final PatternCompiler cp = new Perl5Compiler();
        final Pattern pattern = cp.compile(STRING_PATTERN, Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK);
        final Pattern pattern1 = cp.compile(URI_PATTERN, Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK);
        final PatternMatcher matcher = new Perl5Matcher();
        final PatternMatcher matcher1 = new Perl5Matcher();
        final PatternMatcherInput input = new PatternMatcherInput(plainText);
        MatchResult result;
        String url;
        // loop the matches
        while (matcher.contains(input, pattern)) {
            result = matcher.getMatch();
            // NOTE(review): the expression assigned to url is missing from this listing;
            // presumably it is taken from result. Confirm upstream.
            url =;
            // The candidate must match URI_PATTERN in full.
            // NOTE(review): the body of this rejection branch is not visible.
            PatternMatcherInput input1 = new PatternMatcherInput(url);
            if (!matcher1.matches(input1, pattern1)) {
                // }
            if (url.startsWith("www.")) {
                // Candidates starting with "www." get an http scheme prepended.
                url = "http://" + url;
            } else {
                // the next match.
                // Everything else is resolved against baseURL.
                try {
                    url = new URL(baseURL, url).toString();
                } catch (MalformedURLException ex) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace(" - failed URL parse '" + url + "' and baseURL '" + baseURL + "'", ex);
            // Replace HTML-escaped ampersands with plain ones.
            url = url.replaceAll("&amp;", "&");
            if (LOG.isTraceEnabled()) {
                LOG.trace(" - outlink from JS: '" + url + "'");
            outlinks.add(new Outlink(url, anchor));
    } catch (Exception ex) {
        // extraction.
        // Any other failure is logged at error level; links gathered so far are still returned.
        if (LOG.isErrorEnabled()) {
            LOG.error("getJSLinks", ex);
    final Outlink[] retval;
    // create array of the Outlinks
    if (outlinks != null && outlinks.size() > 0) {
        retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
    } else {
        retval = new Outlink[0];
    return retval;
Also used : Outlink(org.apache.nutch.parse.Outlink) Perl5Compiler(org.apache.oro.text.regex.Perl5Compiler) Pattern(org.apache.oro.text.regex.Pattern) PatternCompiler(org.apache.oro.text.regex.PatternCompiler) MalformedURLException(java.net.MalformedURLException) ArrayList(java.util.ArrayList) Perl5Matcher(org.apache.oro.text.regex.Perl5Matcher) MatchResult(org.apache.oro.text.regex.MatchResult) URL(java.net.URL) MalformedURLException(java.net.MalformedURLException) PatternMatcherInput(org.apache.oro.text.regex.PatternMatcherInput) PatternMatcher(org.apache.oro.text.regex.PatternMatcher)


Outlink (org.apache.nutch.parse.Outlink)37 ParseData (org.apache.nutch.parse.ParseData)22 ParseImpl (org.apache.nutch.parse.ParseImpl)17 ParseStatus (org.apache.nutch.parse.ParseStatus)16 URL (java.net.URL) Text (org.apache.hadoop.io.Text) CrawlDatum (org.apache.nutch.crawl.CrawlDatum)11 Test (org.junit.Test)11 Parse (org.apache.nutch.parse.Parse)10 MalformedURLException (java.net.MalformedURLException) Inlinks (org.apache.nutch.crawl.Inlinks)9 NutchDocument (org.apache.nutch.indexer.NutchDocument)9 Metadata (org.apache.nutch.metadata.Metadata)9 ArrayList (java.util.ArrayList)8 ByteArrayInputStream (java.io.ByteArrayInputStream) Configuration (org.apache.hadoop.conf.Configuration)6 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)6 IOException (java.io.IOException) ParseText (org.apache.nutch.parse.ParseText)4 Map (java.util.Map)3