use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class HtmlParser method getParse.
/**
 * Parses HTML content into a {@link ParseResult}.
 *
 * <p>The method resolves the base URL, guesses the character encoding,
 * builds a DOM fragment and then extracts text, title and outlinks while
 * honouring the HTML meta directives found in the document:
 * <ul>
 * <li>noindex: text and title are left empty;</li>
 * <li>nofollow: no outlinks are extracted;</li>
 * <li>refresh: the status gets the minor code
 * {@code ParseStatus.SUCCESS_REDIRECT} plus target and delay as args;</li>
 * <li>nocache: every resulting parse is tagged with
 * {@code Nutch.CACHING_FORBIDDEN_KEY}.</li>
 * </ul>
 * Finally the configured HTML parse filters are run on the result.
 *
 * @param content the fetched content to parse
 * @return the filtered parse result, or an empty parse result carrying the
 *         failure status when the base URL is malformed or parsing throws
 */
public ParseResult getParse(Content content) {
  HTMLMetaTags metaTags = new HTMLMetaTags();

  // Without a valid base URL nothing can be resolved, so give up early.
  URL base;
  try {
    base = new URL(content.getBaseUrl());
  } catch (MalformedURLException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  Metadata metadata = new Metadata();

  // Detect the encoding and build the DOM.
  DocumentFragment root;
  try {
    byte[] rawBytes = content.getContent();
    InputSource source = new InputSource(new ByteArrayInputStream(rawBytes));

    EncodingDetector encodingDetector = new EncodingDetector(conf);
    encodingDetector.autoDetectClues(content, true);
    encodingDetector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
    String charset = encodingDetector.guessEncoding(content, defaultCharEncoding);

    // The guessed encoding is recorded under both metadata keys.
    metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, charset);
    metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, charset);
    source.setEncoding(charset);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Parsing...");
    }
    root = parse(source);
  } catch (IOException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  } catch (DOMException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  } catch (SAXException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  } catch (Exception e) {
    // Anything unexpected is additionally logged before failing the parse.
    LOG.error("Error: ", e);
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  // Collect the meta directives and copy the general tags into the parse metadata.
  HTMLMetaProcessor.getMetaTags(metaTags, root, base);
  metadata.addAll(metaTags.getGeneralTags());
  if (LOG.isTraceEnabled()) {
    LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
  }

  // Text and title stay empty unless indexing is allowed.
  String text = "";
  String title = "";
  if (!metaTags.getNoIndex()) {
    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting text...");
    }
    StringBuffer textBuffer = new StringBuffer();
    utils.getText(textBuffer, root);
    text = textBuffer.toString();

    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting title...");
    }
    StringBuffer titleBuffer = new StringBuffer();
    utils.getTitle(titleBuffer, root);
    title = titleBuffer.toString().trim();
  }

  // Outlinks stay empty unless following links is allowed.
  Outlink[] outlinks = new Outlink[0];
  if (!metaTags.getNoFollow()) {
    // Links resolve against the document's base tag when it yields a valid
    // URL, otherwise against the content's base URL.
    URL linkBase = base;
    String baseHref = utils.getBase(root);
    if (baseHref != null) {
      try {
        linkBase = new URL(base, baseHref);
      } catch (MalformedURLException e) {
        linkBase = base;
      }
    }

    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting links...");
    }
    ArrayList<Outlink> links = new ArrayList<Outlink>();
    utils.getOutlinks(linkBase, links, root);
    outlinks = links.toArray(new Outlink[links.size()]);
    if (LOG.isTraceEnabled()) {
      LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
    }
  }

  // A meta refresh is reported as a redirect with target and delay.
  ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
  if (metaTags.getRefresh()) {
    status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
    String refreshTarget = metaTags.getRefreshHref().toString();
    String refreshDelay = Integer.toString(metaTags.getRefreshTime());
    status.setArgs(new String[] { refreshTarget, refreshDelay });
  }

  ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
  ParseImpl parseImpl = new ParseImpl(text, parseData);
  ParseResult unfiltered = ParseResult.createParseResult(content.getUrl(), parseImpl);

  // Run the HTML parse filters on the parse.
  ParseResult filteredParse = this.htmlParseFilters.filter(content, unfiltered, metaTags, root);

  if (metaTags.getNoCache()) {
    // Caching is forbidden: tag every parse in the result accordingly.
    for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
      entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
  }
  return filteredParse;
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class TestDOMContentUtils method setup.
/**
 * Builds the shared test fixtures: the configuration, the
 * {@code DOMContentUtils} under test, one parsed DOM fragment and base URL
 * per entry of {@code testPages}, and the table of expected outlinks.
 *
 * <p>Any failure while parsing a test page or while building the expected
 * outlinks fails the fixture right away instead of leaving a half-initialised
 * state behind for the tests to trip over.
 */
@Before
public void setup() {
  conf = NutchConfiguration.create();
  conf.setBoolean("parser.html.form.use_action", true);
  utils = new DOMContentUtils(conf);

  DOMFragmentParser parser = new DOMFragmentParser();
  try {
    parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true);
  } catch (SAXException ignored) {
    // Best effort: the original code deliberately carries on when the feature
    // cannot be set. NOTE(review): presumably the parser may not know this
    // feature in every version; confirm whether this should fail instead.
  }

  for (int i = 0; i < testPages.length; i++) {
    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
    try {
      // NOTE(review): getBytes() uses the platform default charset; confirm
      // the test pages are ASCII-only or pin an explicit charset.
      parser.parse(new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), node);
      testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
    } catch (Exception e) {
      Assert.fail("caught exception: " + e);
    }
    testDOMs[i] = node;
  }

  try {
    answerOutlinks = new Outlink[][] {
        { new Outlink("http://www.nutch.org", "anchor") },
        { new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html", "bots") },
        { new Outlink("http://www.nutch.org/", "separate this"),
          new Outlink("http://www.nutch.org/docs/ok", "from this") },
        { new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/1", "1"),
          new Outlink("http://www.nutch.org/docs/2", "2") },
        { new Outlink("http://www.nutch.org/frames/top.html", ""),
          new Outlink("http://www.nutch.org/frames/left.html", ""),
          new Outlink("http://www.nutch.org/frames/invalid.html", ""),
          new Outlink("http://www.nutch.org/frames/right.html", "") },
        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
          new Outlink("http://www.nutch.org/index.html", ""),
          new Outlink("http://www.nutch.org/maps/#bottom", ""),
          new Outlink("http://www.nutch.org/bot.html", ""),
          new Outlink("http://www.nutch.org/docs/index.html", "") },
        { new Outlink("http://www.nutch.org/index.html", "whitespace test") },
        {},
        { new Outlink("http://www.nutch.org/dummy.jsp", "test2") },
        {},
        { new Outlink("http://www.nutch.org/;x", "anchor1"),
          new Outlink("http://www.nutch.org/g;x", "anchor2"),
          new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
        { // this is tricky - see RFC3986 section 5.4.1 example 7
          new Outlink("http://www.nutch.org/g", "anchor1"),
          new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
          new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
          new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
          new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") },
        { new Outlink("http://www.nutch.org/g", ""),
          new Outlink("http://www.nutch.org/g1", ""),
          new Outlink("http://www.nutch.org/g2", "bla bla"),
          new Outlink("http://www.nutch.org/test.gif", "bla bla") },
        { new Outlink("http://www.nutch.org/movie.mp4", "") } };
  } catch (MalformedURLException e) {
    // Previously swallowed, which left answerOutlinks null and produced an
    // unrelated NullPointerException later on; fail at the real cause.
    Assert.fail("could not build expected outlinks: " + e);
  }
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class TestHtmlParser method testResolveBaseUrl.
/**
 * Parses {@code resolveBaseUrlTestContent} and checks that exactly one
 * outlink is produced and that it points to
 * {@code http://www.example.com/index.html}.
 */
@Test
public void testResolveBaseUrl() {
  byte[] html = resolveBaseUrlTestContent.getBytes(StandardCharsets.UTF_8);

  // parse using http://example.com/ as "fetch" URL
  Parse parsed = parse(html);
  LOG.info(parsed.getData().toString());

  Outlink[] links = parsed.getData().getOutlinks();
  Assert.assertEquals(1, links.length);
  Assert.assertEquals("http://www.example.com/index.html", links[0].getToUrl());
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class JSParseFilter method getParse.
/**
 * Parses a standalone JavaScript document.
 *
 * <p>Content with a non-blank content type that does not start with
 * {@code application/x-javascript} (compared case-insensitively) is rejected
 * with {@code ParseStatus.FAILED_INVALID_FORMAT}. Otherwise the whole script
 * becomes the parse text, the links found by {@code getJSLinks} become the
 * outlinks, and the first line of the script, cut to at most
 * {@code MAX_TITLE_LEN} characters, becomes the title.
 *
 * @param c the fetched content
 * @return the parse result for the script, or an empty parse result carrying
 *         the failure status when the content type is not JavaScript
 */
public ParseResult getParse(Content c) {
  String type = c.getContentType();
  // Locale.ROOT keeps the comparison independent of the default locale
  // (e.g. the Turkish dotless i would otherwise break the prefix match).
  if (type != null && !type.trim().equals("")
      && !type.toLowerCase(java.util.Locale.ROOT).startsWith("application/x-javascript")) {
    return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT, "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
  }

  // Decode with an explicit charset so the result does not depend on the
  // platform default. NOTE(review): UTF-8 is assumed here; if the content
  // declares another charset it should be honoured instead.
  String script = new String(c.getContent(), java.nio.charset.StandardCharsets.UTF_8);

  Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
  if (outlinks == null) {
    outlinks = new Outlink[0];
  }

  // Title? use the first line of the script, capped at MAX_TITLE_LEN.
  int newline = script.indexOf('\n');
  int firstLineEnd = (newline != -1) ? newline : script.length();
  String title = script.substring(0, Math.min(firstLineEnd, MAX_TITLE_LEN));

  ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, c.getMetadata());
  return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class JSParseFilter method filter.
/**
 * Adds the outlinks collected by {@code walk} from the DOM fragment to the
 * parse stored under the content's URL.
 *
 * <p>When {@code walk} collects nothing the parse result is returned
 * untouched. Otherwise the parse is replaced by one with the same status,
 * title, text and metadata whose outlinks are the newly collected links
 * followed by the previously existing ones.
 *
 * @param content the content being parsed
 * @param parseResult the parse result to extend
 * @param metaTags the HTML meta directives, passed through to {@code walk}
 * @param doc the DOM fragment to scan
 * @return the same {@code parseResult} instance, possibly updated
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
  Parse parse = parseResult.get(content.getUrl());

  ArrayList<Outlink> collected = new ArrayList<Outlink>();
  walk(doc, parse, metaTags, content.getBaseUrl(), collected);
  if (collected.isEmpty()) {
    // Nothing new was found, leave the parse as it is.
    return parseResult;
  }

  // Existing outlinks are appended after the newly collected ones.
  collected.addAll(Arrays.asList(parse.getData().getOutlinks()));
  Outlink[] merged = collected.toArray(new Outlink[collected.size()]);

  ParseData mergedData = new ParseData(
      parse.getData().getStatus(),
      parse.getData().getTitle(),
      merged,
      parse.getData().getContentMeta(),
      parse.getData().getParseMeta());

  // replace original parse obj with new one
  parseResult.put(content.getUrl(), new ParseText(parse.getText()), mergedData);
  return parseResult;
}
Aggregations