Search in sources :

Example 6 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class LinksIndexingFilter method filter.

/**
 * Passes the outlinks and inlinks of a page to {@code addFilteredLink} so
 * they can be recorded on the document under the "outlinks" and "inlinks"
 * field names.
 *
 * <p>When {@code indexHost} is set, only the lower-cased host of each link is
 * passed on, and each distinct host is passed at most once per direction.
 * Links whose URL cannot be parsed are logged and skipped.
 *
 * @param doc     the document being built; returned after processing
 * @param parse   the parse whose data supplies the outlinks
 * @param url     the URL of the page being indexed
 * @param datum   the crawl datum of the page (not used here)
 * @param inlinks the inlinks of the page, may be {@code null}
 * @return the same {@code doc} instance
 * @throws IndexingException declared by the overridden method
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // Add the outlinks
    Outlink[] outlinks = parse.getData().getOutlinks();
    if (outlinks != null) {
        Set<String> hosts = new HashSet<String>();
        for (Outlink outlink : outlinks) {
            try {
                String linkUrl = outlink.getToUrl();
                // Locale.ROOT keeps the lower-casing independent of the JVM's
                // default locale (e.g. Turkish would map 'I' to a dotless 'ı').
                String outHost = new URL(linkUrl).getHost().toLowerCase(java.util.Locale.ROOT);
                if (indexHost) {
                    linkUrl = outHost;
                    // Set.add returns false when the host was already seen.
                    if (!hosts.add(linkUrl)) {
                        continue;
                    }
                }
                addFilteredLink("outlinks", url.toString(), linkUrl, outHost, filterOutlinks, doc);
            } catch (MalformedURLException e) {
                LOG.error("Malformed URL in {}: {}", url, e.getMessage());
            }
        }
    }
    // Add the inlinks
    if (null != inlinks) {
        Iterator<Inlink> iterator = inlinks.iterator();
        Set<String> inlinkHosts = new HashSet<String>();
        while (iterator.hasNext()) {
            try {
                Inlink link = iterator.next();
                String linkUrl = link.getFromUrl();
                // Same locale-independent lower-casing as for the outlinks.
                String inHost = new URL(linkUrl).getHost().toLowerCase(java.util.Locale.ROOT);
                if (indexHost) {
                    linkUrl = inHost;
                    if (!inlinkHosts.add(linkUrl)) {
                        continue;
                    }
                }
                addFilteredLink("inlinks", url.toString(), linkUrl, inHost, filterInlinks, doc);
            } catch (MalformedURLException e) {
                LOG.error("Malformed URL in {}: {}", url, e.getMessage());
            }
        }
    }
    return doc;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) HashSet(java.util.HashSet)

Example 7 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class TestLinksIndexingFilter method testIndexHostsOnlyAndFilterOutlinks.

/**
 * Runs the filter with both LINKS_ONLY_HOSTS and LINKS_OUTLINKS_HOST set to
 * "true" and expects exactly one "outlinks" value: the host of
 * http://www.test.com.
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
    // Configure the filter under test.
    conf = NutchConfiguration.create();
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    Outlink[] generated = generateOutlinks(true);
    filter.setConf(conf);

    // Build the inputs of the filter call one by one.
    NutchDocument emptyDoc = new NutchDocument();
    ParseData parseData = new ParseData(new ParseStatus(), "title", generated, metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");
    NutchDocument doc = filter.filter(emptyDoc, parse, pageUrl, new CrawlDatum(), new Inlinks());

    int outlinkCount = doc.getField("outlinks").getValues().size();
    Assert.assertEquals(1, outlinkCount);

    String expectedHost = new URL("http://www.test.com").getHost();
    Assert.assertEquals("Index only the host portion of the outlinks after filtering", expectedHost, doc.getFieldValue("outlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) URL(java.net.URL) Test(org.junit.Test)

Example 8 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class TestMoreIndexingFilter method testContentDispositionTitle.

/**
 * Checks how MoreIndexingFilter derives a title from a Content-Disposition
 * value of "filename=filename.ext": the file name becomes the title of an
 * empty document, while a document that already has a title keeps it.
 */
@Test
public void testContentDispositionTitle() throws IndexingException {
    Configuration configuration = NutchConfiguration.create();
    Metadata headers = new Metadata();
    headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    MoreIndexingFilter moreFilter = new MoreIndexingFilter();
    moreFilter.setConf(configuration);
    Text pageUrl = new Text("http://www.example.com/");
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
    ParseImpl parse = new ParseImpl("text", parseData);

    // Case 1: document without a title field.
    NutchDocument untitled = moreFilter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("content-disposition not detected", "filename.ext", untitled.getFieldValue("title"));

    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
    NutchDocument titled = new NutchDocument();
    titled.add("title", "title");
    titled = moreFilter.filter(titled, parse, pageUrl, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("do not add second title by content-disposition", "title", titled.getFieldValue("title"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 9 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class MimeTypeIndexingFilterTest method setUp.

/**
 * Fills {@code parses} with one ParseImpl per entry of {@code MIME_TYPES};
 * each parse carries that entry as its Content-Type metadata value.
 */
@Before
public void setUp() throws Exception {
    for (int idx = 0; idx < MIME_TYPES.length; idx++) {
        Metadata contentTypeMeta = new Metadata();
        contentTypeMeta.add(Response.CONTENT_TYPE, MIME_TYPES[idx]);
        ParseData data = new ParseData(new ParseStatus(), "title", new Outlink[0], contentTypeMeta);
        parses[idx] = new ParseImpl("text", data);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) Before(org.junit.Before)

Example 10 with Outlink

use of org.apache.nutch.parse.Outlink in project nutch by apache.

the class ExtParser method getParse.

/**
 * Parses content by piping its raw bytes through the external command
 * registered in {@code TYPE_PARAMS_MAP} for the content type.
 *
 * <p>The registered parameters are read as: [0] command, [1] timeout,
 * [2] encoding used to decode the command's standard output. A failed
 * (empty) parse result is returned when no command is registered, when the
 * content length does not match the Content-Length metadata value, when the
 * command exits with a non-zero value, or when any exception is thrown while
 * running it. Otherwise the decoded output becomes the parse text, outlinks
 * are extracted from it, and the title is left empty.
 *
 * @param content the fetched content to parse
 * @return the parse result for {@code content.getUrl()}
 */
public ParseResult getParse(Content content) {
    final String contentType = content.getContentType();
    final String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null) {
        return new ParseStatus(ParseStatus.FAILED, "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
    }

    final String command = params[0];
    final int timeout = Integer.parseInt(params[1]);
    final String encoding = params[2];
    if (LOG.isTraceEnabled()) {
        LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
    }

    String text;
    try {
        final byte[] rawBytes = content.getContent();
        final String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        // A length mismatch means the fetched content is incomplete.
        final boolean truncated = declaredLength != null && rawBytes.length != Integer.parseInt(declaredLength);
        if (truncated) {
            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + rawBytes.length + " bytes. Parser can't handle incomplete " + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
        }

        final ByteArrayOutputStream stdOut = new ByteArrayOutputStream(BUFFER_SIZE);
        final ByteArrayOutputStream stdErr = new ByteArrayOutputStream(BUFFER_SIZE / 4);
        final CommandRunner runner = new CommandRunner();
        runner.setCommand(command + " " + contentType);
        runner.setInputStream(new ByteArrayInputStream(rawBytes));
        runner.setStdOutputStream(stdOut);
        runner.setStdErrorStream(stdErr);
        runner.setTimeout(timeout);
        runner.evaluate();

        if (runner.getExitValue() != 0) {
            return new ParseStatus(ParseStatus.FAILED, "External command " + command + " failed with error: " + stdErr.toString()).getEmptyParseResult(content.getUrl(), getConf());
        }
        text = stdOut.toString(encoding);
    } catch (Exception e) {
        // run time exception
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    if (text == null) {
        text = "";
    }
    // No title is ever produced in this method, so it is always empty.
    final String title = "";

    // collect outlink
    final Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ParseStatus(org.apache.nutch.parse.ParseStatus) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CommandRunner(org.apache.nutch.util.CommandRunner)

Aggregations

Outlink (org.apache.nutch.parse.Outlink)37 ParseData (org.apache.nutch.parse.ParseData)22 ParseImpl (org.apache.nutch.parse.ParseImpl)17 ParseStatus (org.apache.nutch.parse.ParseStatus)16 URL (java.net.URL)13 Text (org.apache.hadoop.io.Text)13 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)11 Test (org.junit.Test)11 Parse (org.apache.nutch.parse.Parse)10 MalformedURLException (java.net.MalformedURLException)9 Inlinks (org.apache.nutch.crawl.Inlinks)9 NutchDocument (org.apache.nutch.indexer.NutchDocument)9 Metadata (org.apache.nutch.metadata.Metadata)9 ArrayList (java.util.ArrayList)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 Configuration (org.apache.hadoop.conf.Configuration)6 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)6 IOException (java.io.IOException)5 ParseText (org.apache.nutch.parse.ParseText)4 Map (java.util.Map)3