Use of org.apache.nutch.parse.Outlink in project nutch by apache.
The class LinksIndexingFilter, method filter.
/**
 * Adds the outlinks and inlinks of a page to its index document.
 *
 * <p>For every link the URL is parsed and its host is lower-cased. When {@code indexHost}
 * is set, the host replaces the full URL as the value handed on, and each distinct host is
 * handed on at most once per direction (outlinks and inlinks are de-duplicated separately).
 * The resulting value is passed to {@code addFilteredLink} together with the page URL, the
 * link host and the matching {@code filterOutlinks} / {@code filterInlinks} flag.
 *
 * <p>Links whose URL cannot be parsed are logged and skipped; they do not abort indexing.
 *
 * @param doc     document being populated
 * @param parse   parse result whose data supplies the outlinks
 * @param url     URL of the page being indexed
 * @param datum   crawl datum of the page (not read by this method)
 * @param inlinks inlinks of the page, may be {@code null}
 * @return the {@code doc} instance that was passed in
 * @throws IndexingException declared by the overridden method; not thrown directly here
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  // Add the outlinks
  Outlink[] outlinks = parse.getData().getOutlinks();
  if (outlinks != null) {
    Set<String> hosts = new HashSet<String>();
    for (Outlink outlink : outlinks) {
      try {
        String linkUrl = outlink.getToUrl();
        // Locale.ROOT: host names are locale-independent; the default-locale variant
        // would turn 'I' into a dotless 'ı' under e.g. a Turkish default locale.
        String outHost = new URL(linkUrl).getHost().toLowerCase(java.util.Locale.ROOT);
        if (indexHost) {
          linkUrl = outHost;
          // Set.add returns false when the host was already recorded.
          if (!hosts.add(linkUrl)) {
            continue;
          }
        }
        addFilteredLink("outlinks", url.toString(), linkUrl, outHost, filterOutlinks, doc);
      } catch (MalformedURLException e) {
        LOG.error("Malformed URL in {}: {}", url, e.getMessage());
      }
    }
  }
  // Add the inlinks
  if (null != inlinks) {
    Iterator<Inlink> iterator = inlinks.iterator();
    Set<String> inlinkHosts = new HashSet<String>();
    while (iterator.hasNext()) {
      try {
        Inlink link = iterator.next();
        String linkUrl = link.getFromUrl();
        // Same locale-independent lower-casing as for the outlinks.
        String inHost = new URL(linkUrl).getHost().toLowerCase(java.util.Locale.ROOT);
        if (indexHost) {
          linkUrl = inHost;
          if (!inlinkHosts.add(linkUrl)) {
            continue;
          }
        }
        addFilteredLink("inlinks", url.toString(), linkUrl, inHost, filterInlinks, doc);
      } catch (MalformedURLException e) {
        LOG.error("Malformed URL in {}: {}", url, e.getMessage());
      }
    }
  }
  return doc;
}
Use of org.apache.nutch.parse.Outlink in project nutch by apache.
The class TestLinksIndexingFilter, method testIndexHostsOnlyAndFilterOutlinks.
/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_OUTLINKS_HOST} set to "true", the
 * filter is expected to leave exactly one "outlinks" value: the host of http://www.test.com.
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
  // Configuration under test: both link options switched on.
  conf = NutchConfiguration.create();
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

  Outlink[] generated = generateOutlinks(true);
  filter.setConf(conf);

  // Assemble the inputs of the filter call one piece at a time.
  ParseData parseData = new ParseData(new ParseStatus(), "title", generated, metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument doc = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), new Inlinks());

  int outlinkCount = doc.getField("outlinks").getValues().size();
  Assert.assertEquals(1, outlinkCount);

  String expectedHost = new URL("http://www.test.com").getHost();
  Assert.assertEquals("Index only the host portion of the outlinks after filtering", expectedHost, doc.getFieldValue("outlinks"));
}
Use of org.apache.nutch.parse.Outlink in project nutch by apache.
The class TestMoreIndexingFilter, method testContentDispositionTitle.
/**
 * Checks how the Content-Disposition file name interacts with the "title" field: it is used
 * as the title of an empty document, and it is not added when a title is already present.
 */
@Test
public void testContentDispositionTitle() throws IndexingException {
  // Response metadata carrying only a Content-Disposition file name.
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(NutchConfiguration.create());

  Text pageUrl = new Text("http://www.example.com/");
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parse = new ParseImpl("text", parseData);

  // Case 1: document without a title.
  NutchDocument untitled = moreFilter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), new Inlinks());
  Assert.assertEquals("content-disposition not detected", "filename.ext", untitled.getFieldValue("title"));

  /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
  NutchDocument titled = new NutchDocument();
  titled.add("title", "title");
  titled = moreFilter.filter(titled, parse, pageUrl, new CrawlDatum(), new Inlinks());
  Assert.assertEquals("do not add second title by content-disposition", "title", titled.getFieldValue("title"));
}
Use of org.apache.nutch.parse.Outlink in project nutch by apache.
The class MimeTypeIndexingFilterTest, method setUp.
/**
 * Fills {@code parses} with one parse per entry of {@code MIME_TYPES}; each parse carries
 * that entry as its Content-Type metadata value and has no outlinks.
 */
@Before
public void setUp() throws Exception {
  int index = 0;
  while (index < MIME_TYPES.length) {
    Metadata headers = new Metadata();
    headers.add(Response.CONTENT_TYPE, MIME_TYPES[index]);

    ParseData data = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
    parses[index] = new ParseImpl("text", data);
    index++;
  }
}
Use of org.apache.nutch.parse.Outlink in project nutch by apache.
The class ExtParser, method getParse.
/**
 * Parses content by feeding its raw bytes to the external command registered for its
 * content type in {@code TYPE_PARAMS_MAP} and treating the command's standard output as
 * the extracted text.
 *
 * <p>A failed parse result is returned when no command is registered for the content type,
 * when the raw length disagrees with the Content-Length metadata value, when the command
 * exits with a non-zero value, or when any exception is raised while running it. On success
 * the title is empty and the outlinks are extracted from the produced text.
 *
 * @param content the fetched content to parse
 * @return the parse result for {@code content.getUrl()}
 */
public ParseResult getParse(Content content) {
  final String contentType = content.getContentType();
  final String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
  if (params == null) {
    return new ParseStatus(ParseStatus.FAILED, "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
  }

  // params layout as read here: [0] command, [1] timeout, [2] output encoding.
  final String command = params[0];
  final int timeout = Integer.parseInt(params[1]);
  final String encoding = params[2];
  if (LOG.isTraceEnabled()) {
    LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
  }

  String text = null;
  try {
    final byte[] raw = content.getContent();

    // Refuse content whose size differs from the declared Content-Length.
    final String declaredLength = content.getMetadata().get(Response.CONTENT_LENGTH);
    final boolean truncated = declaredLength != null && raw.length != Integer.parseInt(declaredLength);
    if (truncated) {
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete " + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
    }

    final ByteArrayOutputStream stdout = new ByteArrayOutputStream(BUFFER_SIZE);
    final ByteArrayOutputStream stderr = new ByteArrayOutputStream(BUFFER_SIZE / 4);

    final CommandRunner runner = new CommandRunner();
    runner.setCommand(command + " " + contentType);
    runner.setInputStream(new ByteArrayInputStream(raw));
    runner.setStdOutputStream(stdout);
    runner.setStdErrorStream(stderr);
    runner.setTimeout(timeout);
    runner.evaluate();

    if (cr(runner) != 0) {
      return new ParseStatus(ParseStatus.FAILED, "External command " + command + " failed with error: " + stderr.toString()).getEmptyParseResult(content.getUrl(), getConf());
    }
    text = stdout.toString(encoding);
  } catch (Exception e) {
    // run time exception
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  if (text == null) {
    text = "";
  }
  // No title is extracted from the command output.
  final String title = "";

  // collect outlink
  final Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
  final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata());
  return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}

/** Returns the exit value reported by the given command runner. */
private static int cr(CommandRunner runner) {
  return runner.getExitValue();
}
Aggregations