use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class TestLinksIndexingFilter method testIndexOnlyHostPart.
/**
 * Checks the filter output when the inlinks-host, outlinks-host and
 * only-hosts options are all set to "true": the indexed outlink and inlink
 * values are expected to be bare host names, and inlinks sharing a host are
 * expected to be indexed a single time.
 */
@Test
public void testIndexOnlyHostPart() throws Exception {
  // Switch on the three host-related options and hand them to the filter.
  conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
  conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
  conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
  filter.setConf(conf);

  Outlink[] outlinks = generateOutlinks(true);

  // Two inlinks from www.test.com and one from the document's own host.
  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
  inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
  inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", "example"));

  ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks, metadata);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");
  NutchDocument doc = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), inlinks);

  String expectedHost = new URL("http://www.test.com").getHost();

  NutchField docOutlinks = doc.getField("outlinks");
  Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
      expectedHost, docOutlinks.getValues().get(0));

  NutchField docInlinks = doc.getField("inlinks");
  Assert.assertEquals("The inlinks coming from the same host must count only once",
      1, docInlinks.getValues().size());
  Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
      expectedHost, doc.getFieldValue("inlinks"));
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class MimeTypeIndexingFilter method main.
/**
 * Command-line entry point for trying out a rules file.
 *
 * <p>
 * Parses the <code>rules</code> option (required) and the <code>help</code>
 * option. Usage is printed and the method returns when help is requested, the
 * rules option is absent, or an unrecognized option is given; any other
 * parsing failure is logged and the method returns. Otherwise lines are read
 * from standard input until end of input or an empty line. Each line is set
 * as the content type of an otherwise empty parse and passed through the
 * filter; the line is echoed with a "+ " prefix when the filter returns a
 * document and with a "- " prefix when it returns <code>null</code>.
 *
 * @param args
 *          command-line arguments
 * @throws IOException
 *           if reading from standard input fails
 * @throws IndexingException
 *           if the filter fails while processing a line
 */
public static void main(String[] args) throws IOException, IndexingException {
  final String usageName = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";

  Option helpOpt = new Option("h", "help", false, "show this help message");
  Option rulesOpt = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
  Options options = new Options();
  options.addOption(helpOpt);
  options.addOption(rulesOpt);
  CommandLineParser parser = new GnuParser();
  HelpFormatter formatter = new HelpFormatter();

  String rulesFile;
  try {
    CommandLine cmd = parser.parse(options, args);
    boolean helpRequested = cmd.hasOption("help");
    if (helpRequested || !cmd.hasOption("rules")) {
      formatter.printHelp(usageName, options, true);
      return;
    }
    rulesFile = cmd.getOptionValue("rules");
  } catch (UnrecognizedOptionException e) {
    formatter.printHelp(usageName, options, true);
    return;
  } catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
    e.printStackTrace();
    return;
  }

  // Configure a filter instance with the requested rules file.
  MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
  Configuration conf = NutchConfiguration.create();
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
  filter.setConf(conf);

  BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
  for (String mimeType = in.readLine(); mimeType != null && !mimeType.isEmpty(); mimeType = in.readLine()) {
    Metadata metadata = new Metadata();
    metadata.set(Response.CONTENT_TYPE, mimeType);
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    // "+" marks a line the filter let through, "-" one for which it returned null.
    String marker = (doc != null) ? "+ " : "- ";
    System.out.println(marker + mimeType);
  }
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class DOMContentUtils method getOutlinks.
/**
 * This method finds all anchors below the supplied DOM <code>node</code>, and
 * creates appropriate {@link Outlink} records for each (relative to the
 * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
 * {@link ArrayList}.
 *
 * <p>
 *
 * Links without inner structure (tags, text, etc) are discarded, as are links
 * which contain only single nested links and empty text nodes (this is a
 * common DOM-fixup artifact, at least with nekohtml).
 *
 * <p>
 *
 * A link is also skipped when its <code>rel</code> attribute contains the
 * <code>nofollow</code> token, when its <code>method</code> attribute is
 * <code>post</code>, or when its target cannot be resolved against
 * <code>base</code>.
 *
 * @param base
 *          URL against which link targets are resolved
 * @param outlinks
 *          list that receives the extracted outlinks
 * @param node
 *          root of the DOM subtree to scan
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
  NodeWalker walker = new NodeWalker(node);
  while (walker.hasNext()) {
    Node currentNode = walker.nextNode();
    if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    // Lower-case with Locale.ROOT: with the default locale a Turkish JVM maps
    // "I" to a dotless i, so tag names such as IMG or LINK would miss the
    // lookup below.
    String nodeName = currentNode.getNodeName().toLowerCase(java.util.Locale.ROOT);
    LinkParams params = (LinkParams) linkParams.get(nodeName);
    if (params == null) {
      continue;
    }

    NodeList children = currentNode.getChildNodes();
    int childLen = (children != null) ? children.getLength() : 0;
    if (shouldThrowAwayLink(currentNode, children, childLen, params)) {
      continue;
    }

    StringBuffer linkText = new StringBuffer();
    getText(linkText, currentNode, true);
    if (linkText.toString().trim().length() == 0) {
      // try harder - use img alt if present
      appendImgAltAndText(linkText, currentNode);
    }

    NamedNodeMap attrs = currentNode.getAttributes();
    String target = null;
    boolean noFollow = false;
    boolean post = false;
    for (int i = 0; i < attrs.getLength(); i++) {
      Node attr = attrs.item(i);
      String attrName = attr.getNodeName();
      if (params.attrName.equalsIgnoreCase(attrName)) {
        target = attr.getNodeValue();
      } else if ("rel".equalsIgnoreCase(attrName) && containsRelToken(attr.getNodeValue(), "nofollow")) {
        noFollow = true;
      } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
        post = true;
      }
    }
    if (target == null || noFollow || post) {
      continue;
    }

    try {
      URL url = URLUtil.resolveURL(base, target);
      Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
      outlinks.add(outlink);
      // the outlink metadata
      if (keepNodenames) {
        MapWritable metadata = new MapWritable();
        metadata.put(new Text(srcTagMetaName), new Text(nodeName));
        outlink.setMetadata(metadata);
      }
    } catch (MalformedURLException e) {
      // the target cannot be turned into a URL: deliberately skip this link
    }
  }
}

/**
 * Fallback used when a link has no text of its own: walks the subtree of
 * <code>linkNode</code> and appends, separated by single spaces, the
 * non-blank <code>alt</code> text of every <code>img</code> element and the
 * content of every non-empty text node.
 */
private static void appendImgAltAndText(StringBuffer linkText, Node linkNode) {
  NodeWalker subWalker = new NodeWalker(linkNode);
  while (subWalker.hasNext()) {
    Node subNode = subWalker.nextNode();
    short subType = subNode.getNodeType();
    String addition = null;
    if (subType == Node.ELEMENT_NODE) {
      // equalsIgnoreCase is locale-independent; other elements are ignored
      if ("img".equalsIgnoreCase(subNode.getNodeName())) {
        Node alt = subNode.getAttributes().getNamedItem("alt");
        if (alt != null) {
          String altTxt = alt.getTextContent();
          if (altTxt != null && altTxt.trim().length() > 0) {
            addition = altTxt;
          }
        }
      }
    } else if (subType == Node.TEXT_NODE) {
      String txt = subNode.getTextContent();
      if (txt != null && txt.length() > 0) {
        addition = txt;
      }
    }
    if (addition != null) {
      if (linkText.length() > 0) {
        linkText.append(' ');
      }
      linkText.append(addition);
    }
  }
}

/**
 * Tells whether <code>relValue</code>, read as a whitespace-separated list
 * of tokens, contains <code>token</code> (compared ignoring case). A
 * <code>null</code> value contains no tokens.
 */
private static boolean containsRelToken(String relValue, String token) {
  if (relValue == null) {
    return false;
  }
  for (String candidate : relValue.trim().split("\\s+")) {
    if (token.equalsIgnoreCase(candidate)) {
      return true;
    }
  }
  return false;
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class JSParseFilter method walk.
/**
 * Recursively walks the DOM starting at <code>n</code> and collects the
 * links that {@link #getJSLinks(String, String, String)} finds in JavaScript
 * code.
 *
 * <p>
 * For a <code>script</code> element with children, the child node values are
 * joined with newlines and scanned as one script, and the element's subtree
 * is not descended into any further. For every other element, the values of
 * attributes whose name starts with <code>on</code> (event handlers) and of
 * <code>href</code> attributes containing <code>javascript:</code> are
 * scanned. All name checks ignore case.
 *
 * @param n
 *          node to inspect
 * @param parse
 *          passed through unchanged to recursive calls; not read here
 * @param metaTags
 *          passed through unchanged to recursive calls; not read here
 * @param base
 *          base URL handed to the link extraction
 * @param outlinks
 *          list that receives the extracted outlinks
 */
private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
  if (n instanceof Element) {
    String name = n.getNodeName();
    if (name.equalsIgnoreCase("script")) {
      NodeList nn = n.getChildNodes();
      if (nn.getLength() > 0) {
        StringBuilder script = new StringBuilder();
        for (int i = 0; i < nn.getLength(); i++) {
          if (i > 0) {
            script.append('\n');
          }
          // a child without a node value would otherwise be appended as the
          // literal text "null"
          String value = nn.item(i).getNodeValue();
          if (value != null) {
            script.append(value);
          }
        }
        Outlink[] links = getJSLinks(script.toString(), "", base);
        if (links != null && links.length > 0) {
          outlinks.addAll(Arrays.asList(links));
        }
        // no other children of interest here, go one level up.
        return;
      }
    } else {
      // process all HTML 4.0 events, if present...
      // Window: onload,onunload
      // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
      // Keyboard: onkeydown,onkeypress,onkeyup
      // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
      NamedNodeMap attrs = n.getAttributes();
      int len = attrs.getLength();
      for (int i = 0; i < len; i++) {
        Node anode = attrs.item(i);
        String attrName = anode.getNodeName();
        Outlink[] links = null;
        if (attrName.regionMatches(true, 0, "on", 0, 2)) {
          // case-insensitive prefix test, consistent with the "script" and
          // "href" checks in this method
          links = getJSLinks(anode.getNodeValue(), "", base);
        } else if (attrName.equalsIgnoreCase("href")) {
          String val = anode.getNodeValue();
          // Locale.ROOT keeps the lower-casing of "I" independent of the
          // JVM's default locale
          if (val != null && val.toLowerCase(java.util.Locale.ROOT).contains("javascript:")) {
            links = getJSLinks(val, "", base);
          }
        }
        if (links != null && links.length > 0) {
          outlinks.addAll(Arrays.asList(links));
        }
      }
    }
  }
  NodeList nl = n.getChildNodes();
  for (int i = 0; i < nl.getLength(); i++) {
    walk(nl.item(i), parse, metaTags, base, outlinks);
  }
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class JSParseFilter method getJSLinks.
// Alternative pattern, which limits valid url characters.
// private static final String URI_PATTERN =
// "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
/**
 * This method extracts URLs from literals embedded in JavaScript.
 *
 * <p>
 * Every match of <code>STRING_PATTERN</code> in <code>plainText</code> is a
 * candidate; its second group is kept only if it fully matches
 * <code>URI_PATTERN</code>. Candidates starting with <code>www.</code> get an
 * <code>http://</code> prefix, all others are resolved against
 * <code>base</code>. HTML-escaped ampersands in the result are decoded.
 *
 * @param plainText
 *          JavaScript source to scan
 * @param anchor
 *          anchor text assigned to every extracted outlink
 * @param base
 *          base URL used to resolve candidates; if it is malformed the error
 *          is logged and resolution proceeds without a base
 * @return the extracted outlinks; an empty array (never <code>null</code>) if
 *         none were found or extraction failed
 */
private Outlink[] getJSLinks(String plainText, String anchor, String base) {
  final List<Outlink> outlinks = new ArrayList<Outlink>();
  URL baseURL = null;
  try {
    baseURL = new URL(base);
  } catch (Exception e) {
    // keep going with a null base URL
    if (LOG.isErrorEnabled()) {
      LOG.error("getJSLinks", e);
    }
  }
  try {
    final int flags = Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK;
    final PatternCompiler cp = new Perl5Compiler();
    final Pattern pattern = cp.compile(STRING_PATTERN, flags);
    final Pattern pattern1 = cp.compile(URI_PATTERN, flags);
    final PatternMatcher matcher = new Perl5Matcher();
    final PatternMatcher matcher1 = new Perl5Matcher();
    final PatternMatcherInput input = new PatternMatcherInput(plainText);
    MatchResult result;
    String url;
    // loop the matches
    while (matcher.contains(input, pattern)) {
      result = matcher.getMatch();
      url = result.group(2);
      PatternMatcherInput input1 = new PatternMatcherInput(url);
      if (!matcher1.matches(input1, pattern1)) {
        // the string literal does not look like a URL: skip it
        continue;
      }
      if (url.startsWith("www.")) {
        url = "http://" + url;
      } else {
        // See if the candidate is parseable relative to the base URL. If
        // not, skip it and move on to the next match.
        try {
          url = new URL(baseURL, url).toString();
        } catch (MalformedURLException ex) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(" - failed URL parse '" + url + "' and baseURL '" + baseURL + "'", ex);
          }
          continue;
        }
      }
      // decode HTML-escaped ampersands; a literal replace is used so the
      // argument is not interpreted as a regular expression
      url = url.replace("&amp;", "&");
      if (LOG.isTraceEnabled()) {
        LOG.trace(" - outlink from JS: '" + url + "'");
      }
      outlinks.add(new Outlink(url, anchor));
    }
  } catch (Exception ex) {
    // Any failure ends the extraction; the links collected so far are still
    // returned.
    if (LOG.isErrorEnabled()) {
      LOG.error("getJSLinks", ex);
    }
  }
  // create array of the Outlinks (empty array when nothing was found)
  return outlinks.toArray(new Outlink[0]);
}
Aggregations