Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class FeedIndexingFilter, method filter.
/**
 * Copies the feed-related entries of the parse metadata onto the document
 * that is about to be indexed.
 *
 * <ul>
 * <li>{@code Feed.FEED_AUTHOR} - every value is added under the same key</li>
 * <li>{@code Feed.FEED_TAGS} - every value is added under the same key</li>
 * <li>{@code Feed.FEED} - added under the same key when present</li>
 * <li>{@code Feed.FEED_PUBLISHED} - parsed with {@link Long#parseLong(String)}
 * and added as a {@code Date} under {@code PUBLISHED_DATE}</li>
 * <li>{@code Feed.FEED_UPDATED} - parsed with {@link Long#parseLong(String)}
 * and added as a {@code Date} under {@code UPDATED_DATE}</li>
 * </ul>
 *
 * Keys that are absent from the parse metadata are skipped.
 *
 * @param doc
 *          the document to enrich; it is modified in place
 * @param parse
 *          the parse whose parse metadata is read
 * @param url
 *          not used by this method
 * @param datum
 *          not used by this method
 * @param inlinks
 *          not used by this method
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException
 *           declared by the signature; no statement in this body throws it
 *           explicitly
 * @throws NumberFormatException
 *           if a published/updated value is present but is not a parsable
 *           long
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  final Metadata feedMeta = parse.getData().getParseMeta();

  // All lookups happen before the document is touched.
  final String[] authorValues = feedMeta.getValues(Feed.FEED_AUTHOR);
  final String[] tagValues = feedMeta.getValues(Feed.FEED_TAGS);
  final String publishedRaw = feedMeta.get(Feed.FEED_PUBLISHED);
  final String updatedRaw = feedMeta.get(Feed.FEED_UPDATED);
  final String feedValue = feedMeta.get(Feed.FEED);

  if (authorValues != null) {
    for (int i = 0; i < authorValues.length; i++) {
      doc.add(Feed.FEED_AUTHOR, authorValues[i]);
    }
  }

  if (tagValues != null) {
    for (int i = 0; i < tagValues.length; i++) {
      doc.add(Feed.FEED_TAGS, tagValues[i]);
    }
  }

  if (feedValue != null) {
    doc.add(Feed.FEED, feedValue);
  }

  // The two timestamps are stored as strings holding a long value.
  if (publishedRaw != null) {
    doc.add(PUBLISHED_DATE, new Date(Long.parseLong(publishedRaw)));
  }

  if (updatedRaw != null) {
    doc.add(UPDATED_DATE, new Date(Long.parseLong(updatedRaw)));
  }

  return doc;
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class FeedParser, method main.
/**
 * Runs a command line version of this {@link Parser}.
 *
 * <p>
 * The file is read fully into memory, parsed as
 * {@code application/rss+xml}, and the key, parse data and parse text of
 * every resulting entry are printed to standard output.
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws IllegalArgumentException
 *           If the file is larger than {@code Integer.MAX_VALUE} bytes and
 *           therefore cannot be held in a single byte array.
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);

  File file = new File(name);
  long length = file.length();
  // A plain (int) cast would silently truncate and yield a wrong or negative
  // array size for very large files.
  if (length > Integer.MAX_VALUE) {
    throw new IllegalArgumentException("Feed file is too large to load into memory: " + name);
  }
  byte[] bytes = new byte[(int) length];

  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle even when readFully fails.
    in.close();
  }

  ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class FeedParser, method addToMap.
/**
 * Converts a single feed entry into a parse record and stores it in
 * {@code parseResult} under the entry's normalized and filtered link.
 *
 * <p>
 * Nothing is stored when normalizing/filtering the link throws or yields
 * {@code null}. Otherwise the entry text (its description, or failing that
 * the concatenation of its contents) is handed to the first parser registered
 * for the content type found in the content metadata. If no parser is found,
 * or the parser yields no parse for the link, the raw text is stored with
 * {@code ParseStatus.STATUS_FAILURE}.
 *
 * @param parseResult
 *          receives the record for this entry
 * @param feed
 *          the feed the entry belongs to; passed on to {@code addFields}
 * @param feedLink
 *          stored as the "feed" parse-metadata value when non-null
 * @param entry
 *          the entry to convert
 * @param content
 *          the content whose metadata supplies the content type
 */
private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink, SyndEntry entry, Content content) {
  String entryUrl = entry.getLink();
  Metadata entryParseMeta = new Metadata();
  Metadata contentMetadata = content.getMetadata();
  SyndContent summary = entry.getDescription();

  try {
    entryUrl = normalizers.normalize(entryUrl, URLNormalizers.SCOPE_OUTLINK);
    if (entryUrl != null) {
      entryUrl = filters.filter(entryUrl);
    }
  } catch (Exception e) {
    e.printStackTrace();
    return;
  }
  if (entryUrl == null) {
    return;
  }

  String entryTitle = stripTags(entry.getTitleEx());
  if (feedLink != null) {
    entryParseMeta.set("feed", feedLink);
  }
  addFields(entryParseMeta, contentMetadata, feed, entry);

  // Item descriptions may themselves contain markup, so the text is handed
  // to whichever parser plugin is registered for the content type recorded
  // in the content metadata.
  String mimeType = contentMetadata.get(Response.CONTENT_TYPE);

  String body = (summary == null) ? null : summary.getValue();
  if (body == null) {
    // No usable description: fall back to all content parts joined together.
    StringBuilder joined = new StringBuilder();
    for (Object part : entry.getContents()) {
      joined.append(((SyndContent) part).getValue());
    }
    body = joined.toString();
  }

  Parse delegated = null;
  try {
    Parser delegate = parserFactory.getParsers(mimeType, entryUrl)[0];
    Content entryContent = new Content(entryUrl, entryUrl, body.getBytes(), mimeType, contentMetadata, conf);
    delegated = delegate.getParse(entryContent).get(entryUrl);
  } catch (ParserNotFound e) {
    /* ignore */
  }

  if (delegated == null) {
    // No parse available: record the raw text as a failed parse.
    contentMetadata.remove(Response.CONTENT_TYPE);
    parseResult.put(entryUrl, new ParseText(body), new ParseData(ParseStatus.STATUS_FAILURE, entryTitle, new Outlink[0], contentMetadata, entryParseMeta));
    return;
  }

  ParseData delegatedData = delegated.getData();
  delegatedData.getContentMeta().remove(Response.CONTENT_TYPE);
  mergeMetadata(delegatedData.getParseMeta(), entryParseMeta);
  parseResult.put(entryUrl, new ParseText(delegated.getText()), new ParseData(ParseStatus.STATUS_SUCCESS, entryTitle, delegatedData.getOutlinks(), delegatedData.getContentMeta(), delegatedData.getParseMeta()));
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class TestHeadingsParseFilter, method testExtractHeadingFromNestedNodes.
/**
 * Checks that the value stored under "h1" in the parse metadata contains the
 * text of an element nested inside the heading, not just the heading's own
 * direct text.
 */
@Test
public void testExtractHeadingFromNestedNodes() throws IOException, SAXException {
  final String pageUrl = "http://www.foo.com/";
  final String html = "<html><head><title>test header with span element</title></head><body><h1>header with <span>span element</span></h1></body></html>";

  conf.setStrings("headings", "h1", "h2");
  HtmlParseFilter headingsFilter = new HeadingsParseFilter();
  headingsFilter.setConf(conf);

  Content page = new Content(pageUrl, pageUrl, "".getBytes("UTF8"), "text/html; charset=UTF-8", new Metadata(), conf);
  ParseImpl seedParse = new ParseImpl("foo bar", new ParseData());
  ParseResult seedResult = ParseResult.createParseResult(pageUrl, seedParse);
  HTMLMetaTags noMetaTags = new HTMLMetaTags();

  // Build the DOM fragment that the filter walks.
  DOMFragmentParser domParser = new DOMFragmentParser();
  DocumentFragment dom = new HTMLDocumentImpl().createDocumentFragment();
  InputSource htmlSource = new InputSource(new ByteArrayInputStream(html.getBytes()));
  domParser.parse(htmlSource, dom);

  ParseResult filtered = headingsFilter.filter(page, seedResult, noMetaTags, dom);

  String h1Value = filtered.get(page.getUrl()).getData().getParseMeta().get("h1");
  Assert.assertEquals("The h1 tag must include the content of the inner span node", "header with span element", h1Value);
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class TestBasicIndexingFilter, method testBasicIndexingFilter.
/**
 * Runs {@code BasicIndexingFilter} over a small sample page, with title and
 * content length limits of 10 and 20 and domain indexing switched on, and
 * checks the title, domain, host, url, content and tstamp fields of the
 * document the filter returns.
 *
 * @throws Exception
 *           if the filter fails; the exception is left to propagate so the
 *           test report shows the original cause and stack trace
 */
@Test
public void testBasicIndexingFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  conf.setBoolean("indexer.add.domain", true);
  conf.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);

  NutchDocument doc = new NutchDocument();
  String title = "The Foo Page";
  Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata metaData = new Metadata();
  metaData.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
  ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);
  CrawlDatum crawlDatum = new CrawlDatum();
  crawlDatum.setFetchTime(100L);
  Inlinks inlinks = new Inlinks();

  // No try/catch here: converting an exception into Assert.fail(e.getMessage())
  // would drop the cause, and the method already declares "throws Exception".
  NutchDocument filtered = filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);

  // Assert on what the filter returned, since a null check on a local that
  // was just constructed can never fail.
  Assert.assertNotNull("filter must return a document", filtered);
  Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", filtered.getField("title").getValues().get(0));
  Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", filtered.getField("domain").getValues().get(0));
  Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", filtered.getField("host").getValues().get(0));
  Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", filtered.getField("url").getValues().get(0));
  Assert.assertEquals("test content", "this is a sample foo", filtered.getField("content").getValues().get(0));
  Assert.assertEquals("test fetch time", new Date(100L), (Date) filtered.getField("tstamp").getValues().get(0));
}
Aggregations