use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class MimeTypeIndexingFilterTest method setUp.
/**
 * Builds one {@link ParseImpl} fixture per entry of {@code MIME_TYPES} and
 * stores it at the matching index of {@code parses}.
 *
 * Every fixture has the text {@code "text"}, the title {@code "title"}, no
 * outlinks, a default-constructed {@link ParseStatus}, and a
 * {@link Metadata} holding the MIME type under
 * {@link Response#CONTENT_TYPE}.
 */
@Before
public void setUp() throws Exception {
  for (int idx = 0; idx < MIME_TYPES.length; idx++) {
    // Metadata carrying only the content type of this fixture.
    Metadata typeMeta = new Metadata();
    typeMeta.add(Response.CONTENT_TYPE, MIME_TYPES[idx]);

    ParseStatus status = new ParseStatus();
    ParseData data = new ParseData(status, "title", new Outlink[0], typeMeta);
    parses[idx] = new ParseImpl("text", data);
  }
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class ExtParser method getParse.
/**
 * Parses {@code content} by feeding its raw bytes to the external command
 * registered for its content type in {@code TYPE_PARAMS_MAP}.
 *
 * The registered parameters are read as {@code [command, timeout, encoding]}.
 * The command's standard output, decoded with {@code encoding}, becomes the
 * parse text; outlinks are extracted from that text. The title is always
 * empty because this method does not extract one.
 *
 * A failed parse result (never an exception) is returned when:
 * <ul>
 * <li>no parameters are registered for the content type;</li>
 * <li>the registered timeout is not a valid integer;</li>
 * <li>a {@code Content-Length} value is present in the content metadata and
 * differs from the number of bytes actually held (truncated content);</li>
 * <li>the command exits with a non-zero value;</li>
 * <li>any exception is thrown while running the command or decoding its
 * output.</li>
 * </ul>
 *
 * @param content the fetched content to parse
 * @return the parse result for {@code content.getUrl()}, or an empty parse
 *         result carrying the failure status
 */
public ParseResult getParse(Content content) {
  String contentType = content.getContentType();
  String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
  if (params == null) {
    return new ParseStatus(ParseStatus.FAILED, "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
  }

  String command = params[0];
  String encoding = params[2];
  int timeout;
  try {
    timeout = Integer.parseInt(params[1]);
  } catch (NumberFormatException e) {
    // A malformed timeout must fail this parse like every other error path
    // instead of escaping as an unchecked exception.
    return new ParseStatus(ParseStatus.FAILED, "Invalid timeout '" + params[1] + "' configured for external command " + command).getEmptyParseResult(content.getUrl(), getConf());
  }

  if (LOG.isTraceEnabled()) {
    LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
  }

  String text;
  try {
    byte[] raw = content.getContent();
    String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
    // Refuse to hand incomplete data to the external command.
    if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete " + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
    }

    ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
    ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);

    CommandRunner cr = new CommandRunner();
    // The content type is passed to the command as its argument.
    cr.setCommand(command + " " + contentType);
    cr.setInputStream(new ByteArrayInputStream(raw));
    cr.setStdOutputStream(os);
    cr.setStdErrorStream(es);
    cr.setTimeout(timeout);
    cr.evaluate();

    if (cr.getExitValue() != 0) {
      return new ParseStatus(ParseStatus.FAILED, "External command " + command + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());
    }
    text = os.toString(encoding);
  } catch (Exception e) {
    // Covers, among others, an unparsable Content-Length value and an
    // unsupported encoding name.
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  // No title is extracted from the command output.
  String title = "";

  // collect outlink
  Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, content.getMetadata());
  return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class HtmlParser method getParse.
/**
 * Parses HTML {@code content} into a {@link ParseResult}.
 *
 * Steps, in order:
 * <ol>
 * <li>Build the base URL from {@code content.getBaseUrl()}.</li>
 * <li>Guess the character encoding, record it in the parse metadata and
 * parse the bytes into a DOM fragment.</li>
 * <li>Read the HTML meta directives and copy the general tags into the
 * parse metadata.</li>
 * <li>Unless {@code noindex} is set, extract text and title.</li>
 * <li>Unless {@code nofollow} is set, extract outlinks, resolved against the
 * document's base href when it yields a valid URL.</li>
 * <li>Mark the status as a redirect when a meta refresh is present.</li>
 * <li>Run the HTML parse filters and, when {@code nocache} is set, store the
 * caching policy in every resulting parse's metadata.</li>
 * </ol>
 *
 * @param content the fetched content to parse
 * @return the filtered parse result, or an empty parse result carrying the
 *         exception status when the base URL is malformed or parsing fails
 */
public ParseResult getParse(Content content) {
  HTMLMetaTags metaTags = new HTMLMetaTags();

  URL baseUrl;
  try {
    baseUrl = new URL(content.getBaseUrl());
  } catch (MalformedURLException e) {
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  Metadata parseMeta = new Metadata();

  // parse the content
  DocumentFragment root;
  try {
    byte[] rawBytes = content.getContent();
    InputSource source = new InputSource(new ByteArrayInputStream(rawBytes));

    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
    String charset = detector.guessEncoding(content, defaultCharEncoding);

    parseMeta.set(Metadata.ORIGINAL_CHAR_ENCODING, charset);
    parseMeta.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, charset);
    source.setEncoding(charset);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Parsing...");
    }
    root = parse(source);
  } catch (IOException | DOMException | SAXException e) {
    // Expected parser failures: reported through the status only.
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  } catch (Exception e) {
    // Anything else is additionally logged.
    LOG.error("Error: ", e);
    return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
  }

  // get meta directives and populate Nutch metadata with them
  HTMLMetaProcessor.getMetaTags(metaTags, root, baseUrl);
  parseMeta.addAll(metaTags.getGeneralTags());
  if (LOG.isTraceEnabled()) {
    LOG.trace("Meta tags for " + baseUrl + ": " + metaTags.toString());
  }

  // Text and title stay empty when indexing is forbidden.
  String text = "";
  String title = "";
  boolean indexAllowed = !metaTags.getNoIndex();
  if (indexAllowed) {
    StringBuffer buffer = new StringBuffer();

    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting text...");
    }
    utils.getText(buffer, root);
    text = buffer.toString();

    // Reuse the same buffer for the title.
    buffer.setLength(0);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting title...");
    }
    utils.getTitle(buffer, root);
    title = buffer.toString().trim();
  }

  // Outlinks stay empty when following links is forbidden.
  Outlink[] outlinks = new Outlink[0];
  boolean followAllowed = !metaTags.getNoFollow();
  if (followAllowed) {
    ArrayList<Outlink> links = new ArrayList<Outlink>();

    // Links resolve against the document's base href when it forms a valid
    // URL, otherwise against the content's base URL.
    URL linkBase = baseUrl;
    String baseHref = utils.getBase(root);
    if (baseHref != null) {
      try {
        linkBase = new URL(baseUrl, baseHref);
      } catch (MalformedURLException ignored) {
        // linkBase still holds baseUrl; nothing to restore.
      }
    }

    if (LOG.isTraceEnabled()) {
      LOG.trace("Getting links...");
    }
    utils.getOutlinks(linkBase, links, root);
    outlinks = links.toArray(new Outlink[links.size()]);
    if (LOG.isTraceEnabled()) {
      LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
    }
  }

  ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
  if (metaTags.getRefresh()) {
    status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
    String refreshTarget = metaTags.getRefreshHref().toString();
    String refreshDelay = Integer.toString(metaTags.getRefreshTime());
    status.setArgs(new String[] { refreshTarget, refreshDelay });
  }

  ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), parseMeta);
  String url = content.getUrl();
  ParseResult unfiltered = ParseResult.createParseResult(url, new ParseImpl(text, parseData));

  // run filters on parse
  ParseResult filtered = this.htmlParseFilters.filter(content, unfiltered, metaTags, root);

  if (metaTags.getNoCache()) {
    // not okay to cache
    for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filtered) {
      Parse filteredParse = entry.getValue();
      filteredParse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
  }
  return filtered;
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class TestDOMContentUtils method setup.
/**
 * Prepares the fixtures shared by the tests: the configuration, the
 * {@link DOMContentUtils} under test, one parsed DOM fragment and one base
 * URL per entry of {@code testPages} / {@code testBaseHrefs}, and the
 * expected outlinks in {@code answerOutlinks}.
 *
 * Any exception raised while parsing a test page, building a base URL or
 * building the expected outlinks fails the setup immediately.
 */
@Before
public void setup() {
  conf = NutchConfiguration.create();
  conf.setBoolean("parser.html.form.use_action", true);
  utils = new DOMContentUtils(conf);

  DOMFragmentParser parser = new DOMFragmentParser();
  try {
    parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true);
  } catch (SAXException ignored) {
    // NOTE(review): a failure to set this feature has always been ignored
    // here, presumably so the tests still run with the parser's default
    // behaviour - confirm whether it should fail the setup instead.
  }

  for (int i = 0; i < testPages.length; i++) {
    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
    try {
      parser.parse(new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), node);
      testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
    } catch (Exception e) {
      Assert.fail("caught exception: " + e);
    }
    testDOMs[i] = node;
  }

  try {
    // One row per test page, in the order the outlinks are expected.
    answerOutlinks = new Outlink[][] {
        { new Outlink("http://www.nutch.org", "anchor") },
        { new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots") },
        { new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this") },
        { new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2") },
        { new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", "") },
        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", "") },
        { new Outlink("http://www.nutch.org/index.html", "whitespace test") },
        {},
        { new Outlink("http://www.nutch.org/dummy.jsp", "test2") },
        {},
        { new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
        {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5") },
        { new Outlink("http://www.nutch.org/g", ""),
            new Outlink("http://www.nutch.org/g1", ""),
            new Outlink("http://www.nutch.org/g2", "bla bla"),
            new Outlink("http://www.nutch.org/test.gif", "bla bla") },
        { new Outlink("http://www.nutch.org/movie.mp4", "") } };
  } catch (MalformedURLException e) {
    // Without the expected outlinks every comparison would fail later with
    // an unrelated error, so report the real cause here.
    Assert.fail("could not build expected outlinks: " + e);
  }
}
use of org.apache.nutch.parse.Outlink in project nutch by apache.
the class TestHtmlParser method testResolveBaseUrl.
/**
 * Parses {@code resolveBaseUrlTestContent} and checks that exactly one
 * outlink is produced and that it points to
 * {@code http://www.example.com/index.html}.
 */
@Test
public void testResolveBaseUrl() {
  // parse using http://example.com/ as "fetch" URL
  Parse result = parse(resolveBaseUrlTestContent.getBytes(StandardCharsets.UTF_8));
  LOG.info(result.getData().toString());

  Outlink[] found = result.getData().getOutlinks();
  Assert.assertEquals(1, found.length);

  String target = found[0].getToUrl();
  Assert.assertEquals("http://www.example.com/index.html", target);
}
Aggregations