Search in sources :

Example 6 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class FeedIndexingFilter method filter.

/**
 * Copies the feed-related entries of the parse metadata onto the document
 * that is being indexed.
 *
 * <p>
 * The following parse-metadata keys are read:
 * </p>
 *
 * <ul>
 * <li>FEED_AUTHOR (every value is added)</li>
 * <li>FEED_TAGS (every value is added)</li>
 * <li>FEED (single value)</li>
 * <li>FEED_PUBLISHED (parsed as a long and added as a {@link Date} under
 * PUBLISHED_DATE)</li>
 * <li>FEED_UPDATED (parsed as a long and added as a {@link Date} under
 * UPDATED_DATE)</li>
 * </ul>
 *
 * Keys that are absent from the metadata are skipped. The <code>url</code>,
 * <code>datum</code> and <code>inlinks</code> arguments are not consulted.
 *
 * @param doc
 *          the document to enrich; it is modified in place
 * @param parse
 *          the parse whose metadata supplies the feed fields
 * @return the same <code>doc</code> instance that was passed in
 * @throws NumberFormatException
 *           if a published/updated value is present but is not a valid long
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    final Metadata feedMeta = parse.getData().getParseMeta();
    final String[] authorValues = feedMeta.getValues(Feed.FEED_AUTHOR);
    final String[] tagValues = feedMeta.getValues(Feed.FEED_TAGS);
    final String publishedMillis = feedMeta.get(Feed.FEED_PUBLISHED);
    final String updatedMillis = feedMeta.get(Feed.FEED_UPDATED);
    final String feedLink = feedMeta.get(Feed.FEED);

    // Multi-valued fields: one document value per metadata value.
    if (authorValues != null) {
        for (int i = 0; i < authorValues.length; i++) {
            doc.add(Feed.FEED_AUTHOR, authorValues[i]);
        }
    }
    if (tagValues != null) {
        for (int i = 0; i < tagValues.length; i++) {
            doc.add(Feed.FEED_TAGS, tagValues[i]);
        }
    }

    if (feedLink != null) {
        doc.add(Feed.FEED, feedLink);
    }

    // Timestamps are stored as strings holding a long; index them as Date objects.
    if (publishedMillis != null) {
        Date publishedOn = new Date(Long.parseLong(publishedMillis));
        doc.add(PUBLISHED_DATE, publishedOn);
    }
    if (updatedMillis != null) {
        Date updatedOn = new Date(Long.parseLong(updatedMillis));
        doc.add(UPDATED_DATE, updatedOn);
    }
    return doc;
}
Also used : ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) Date(java.util.Date)

Example 7 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class FeedParser method main.

/**
 * Runs a command line version of this {@link Parser}.
 *
 * <p>
 * The feed file is read fully into memory, wrapped in a {@link Content} with
 * the MIME type <code>application/rss+xml</code>, parsed, and the key, parse
 * data and parse text of every resulting entry are printed to standard
 * output.
 * </p>
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: FeedParser <feed>");
        System.exit(1);
    }
    String name = args[0];
    String url = "file:" + name;
    Configuration conf = NutchConfiguration.create();
    FeedParser parser = new FeedParser();
    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources guarantees the stream is closed even when
    // readFully fails (e.g. the file shrank or became unreadable).
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        in.readFully(bytes);
    }
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
        System.out.println("key: " + entry.getKey());
        Parse parse = entry.getValue();
        System.out.println("data: " + parse.getData());
        System.out.println("text: " + parse.getText() + "\n");
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) ParseResult(org.apache.nutch.parse.ParseResult) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) Content(org.apache.nutch.protocol.Content) SyndContent(com.rometools.rome.feed.synd.SyndContent) File(java.io.File)

Example 8 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class FeedParser method addToMap.

/**
 * Parses a single feed entry and records the outcome in
 * <code>parseResult</code>, keyed by the entry's normalized and filtered
 * link.
 *
 * <p>
 * The entry is dropped without being recorded when its link normalizes or
 * filters to <code>null</code>, or when normalizing/filtering throws (the
 * stack trace is printed in that case). Otherwise the entry text is taken
 * from the description or, when that yields no value, from the concatenated
 * entry contents, and is handed to the first parser registered for the
 * content type found in the content metadata. If no parser is found, or the
 * parser yields no parse for the link, the raw text is stored with
 * {@link ParseStatus#STATUS_FAILURE}.
 * </p>
 *
 * @param parseResult
 *          receives the parse text and parse data for this entry
 * @param feed
 *          the feed the entry belongs to; forwarded to <code>addFields</code>
 * @param feedLink
 *          link of the feed, stored under the "feed" parse-metadata key when
 *          not <code>null</code>
 * @param entry
 *          the feed entry to process
 * @param content
 *          the fetched content; its metadata is reused for the entry
 */
private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink, SyndEntry entry, Content content) {
    String entryUrl = entry.getLink();
    final Metadata entryParseMeta = new Metadata();
    final Metadata entryContentMeta = content.getMetadata();
    final SyndContent summary = entry.getDescription();

    try {
        entryUrl = normalizers.normalize(entryUrl, URLNormalizers.SCOPE_OUTLINK);
        if (entryUrl != null) {
            entryUrl = filters.filter(entryUrl);
        }
    } catch (Exception e) {
        e.printStackTrace();
        return;
    }
    if (entryUrl == null) {
        // rejected by the normalizers or the URL filters
        return;
    }

    final String entryTitle = stripTags(entry.getTitleEx());
    if (feedLink != null) {
        entryParseMeta.set("feed", feedLink);
    }
    addFields(entryParseMeta, entryContentMeta, feed, entry);

    // Item text may itself contain markup, so it is handed to whichever
    // parser plugin is registered for the content type recorded in the
    // content metadata.
    final String contentType = entryContentMeta.get(Response.CONTENT_TYPE);

    String body = (summary != null) ? summary.getValue() : null;
    if (body == null) {
        // no usable description: fall back to the concatenated entry contents
        StringBuilder joined = new StringBuilder();
        for (Object part : entry.getContents()) {
            joined.append(((SyndContent) part).getValue());
        }
        body = joined.toString();
    }

    Parse delegated = null;
    try {
        Parser chosen = parserFactory.getParsers(contentType, entryUrl)[0];
        Content entryContent = new Content(entryUrl, entryUrl, body.getBytes(), contentType, entryContentMeta, conf);
        delegated = chosen.getParse(entryContent).get(entryUrl);
    } catch (ParserNotFound e) {
    /* ignore */
    }

    if (delegated == null) {
        // nothing could parse the entry text: keep it verbatim and flag the failure
        entryContentMeta.remove(Response.CONTENT_TYPE);
        parseResult.put(entryUrl, new ParseText(body), new ParseData(ParseStatus.STATUS_FAILURE, entryTitle, new Outlink[0], entryContentMeta, entryParseMeta));
        return;
    }

    ParseData delegatedData = delegated.getData();
    delegatedData.getContentMeta().remove(Response.CONTENT_TYPE);
    mergeMetadata(delegatedData.getParseMeta(), entryParseMeta);
    parseResult.put(entryUrl, new ParseText(delegated.getText()), new ParseData(ParseStatus.STATUS_SUCCESS, entryTitle, delegatedData.getOutlinks(), delegatedData.getContentMeta(), delegatedData.getParseMeta()));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParserNotFound(org.apache.nutch.parse.ParserNotFound) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) Parser(org.apache.nutch.parse.Parser) ParseText(org.apache.nutch.parse.ParseText) SyndContent(com.rometools.rome.feed.synd.SyndContent) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) SyndContent(com.rometools.rome.feed.synd.SyndContent)

Example 9 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestHeadingsParseFilter method testExtractHeadingFromNestedNodes.

/**
 * Verifies that the text of an element nested inside a heading (here a
 * <code>span</code> inside an <code>h1</code>) is included in the heading
 * value stored in the parse metadata.
 */
@Test
public void testExtractHeadingFromNestedNodes() throws IOException, SAXException {
    final String pageUrl = "http://www.foo.com/";
    final String html = "<html><head><title>test header with span element</title></head><body><h1>header with <span>span element</span></h1></body></html>";

    // configure the filter under test to collect h1 and h2 headings
    conf.setStrings("headings", "h1", "h2");
    HtmlParseFilter headingsFilter = new HeadingsParseFilter();
    headingsFilter.setConf(conf);

    Content page = new Content(pageUrl, pageUrl, "".getBytes("UTF8"), "text/html; charset=UTF-8", new Metadata(), conf);
    ParseImpl initialParse = new ParseImpl("foo bar", new ParseData());
    ParseResult result = ParseResult.createParseResult(pageUrl, initialParse);
    HTMLMetaTags tags = new HTMLMetaTags();

    // build the DOM fragment that the filter will walk
    DOMFragmentParser domParser = new DOMFragmentParser();
    DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
    InputSource htmlSource = new InputSource(new ByteArrayInputStream(html.getBytes()));
    domParser.parse(htmlSource, fragment);

    result = headingsFilter.filter(page, result, tags, fragment);

    Metadata filteredMeta = result.get(page.getUrl()).getData().getParseMeta();
    String heading = filteredMeta.get("h1");
    Assert.assertEquals("The h1 tag must include the content of the inner span node", "header with span element", heading);
}
Also used : InputSource(org.xml.sax.InputSource) Metadata(org.apache.nutch.metadata.Metadata) DOMFragmentParser(org.cyberneko.html.parsers.DOMFragmentParser) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) ByteArrayInputStream(java.io.ByteArrayInputStream) Content(org.apache.nutch.protocol.Content) DocumentFragment(org.w3c.dom.DocumentFragment) Test(org.junit.Test)

Example 10 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestBasicIndexingFilter method testBasicIndexingFilter.

/**
 * Runs {@link BasicIndexingFilter} over a small synthetic page and checks
 * the title, domain, host, url, content and tstamp fields it produces, with
 * the title limited to 10 and the content to 20 characters.
 */
@Test
public void testBasicIndexingFilter() throws Exception {
    final String pageUrl = "http://nutch.apache.org/index.html";

    // filter configuration: truncate title/content and emit the domain field
    Configuration configuration = NutchConfiguration.create();
    configuration.setInt("indexer.max.title.length", 10);
    configuration.setBoolean("indexer.add.domain", true);
    configuration.setInt("indexer.max.content.length", 20);

    BasicIndexingFilter indexingFilter = new BasicIndexingFilter();
    indexingFilter.setConf(configuration);
    Assert.assertNotNull(indexingFilter);

    // synthetic parse of a page with one outlink and a language entry
    NutchDocument indexed = new NutchDocument();
    String pageTitle = "The Foo Page";
    Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata pageMeta = new Metadata();
    pageMeta.add("Language", "en/us");
    ParseData pageData = new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, pageMeta);
    ParseImpl pageParse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", pageData);

    CrawlDatum fetched = new CrawlDatum();
    fetched.setFetchTime(100L);
    Inlinks noInlinks = new Inlinks();

    try {
        indexingFilter.filter(indexed, pageParse, new Text(pageUrl), fetched, noInlinks);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }

    Assert.assertNotNull(indexed);
    Object titleValue = indexed.getField("title").getValues().get(0);
    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", titleValue);
    Object domainValue = indexed.getField("domain").getValues().get(0);
    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", domainValue);
    Object hostValue = indexed.getField("host").getValues().get(0);
    Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", hostValue);
    Object urlValue = indexed.getField("url").getValues().get(0);
    Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", pageUrl, urlValue);
    Object contentValue = indexed.getField("content").getValues().get(0);
    Assert.assertEquals("test content", "this is a sample foo", contentValue);
    Date fetchStamp = (Date) indexed.getField("tstamp").getValues().get(0);
    Assert.assertEquals("test fetch time", new Date(100L), fetchStamp);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Date(java.util.Date) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) BasicIndexingFilter(org.apache.nutch.indexer.basic.BasicIndexingFilter) Test(org.junit.Test)

Aggregations

Metadata (org.apache.nutch.metadata.Metadata): 42, Configuration (org.apache.hadoop.conf.Configuration): 20, NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20, ParseData (org.apache.nutch.parse.ParseData): 19, Content (org.apache.nutch.protocol.Content): 18, Test (org.junit.Test): 17, Text (org.apache.hadoop.io.Text): 16, Parse (org.apache.nutch.parse.Parse): 16, ParseImpl (org.apache.nutch.parse.ParseImpl): 15, CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14, Inlinks (org.apache.nutch.crawl.Inlinks): 11, Outlink (org.apache.nutch.parse.Outlink): 10, ParseStatus (org.apache.nutch.parse.ParseStatus): 9, NutchDocument (org.apache.nutch.indexer.NutchDocument): 7, ParseResult (org.apache.nutch.parse.ParseResult): 7, FileInputStream (java.io.FileInputStream): 5, IOException (java.io.IOException): 5, File (java.io.File): 4, ArrayList (java.util.ArrayList): 4, ParseUtil (org.apache.nutch.parse.ParseUtil): 4