use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestProtocolHttp method fetchPage.
/**
 * Requests <code>page</code> from the test server listening on 127.0.0.1 and
 * asserts that the HTTP status code of the response equals
 * <code>expectedCode</code>. For every page other than
 * <code>/nonexists.html</code>, <code>/brokenpage.jsp</code> and
 * <code>/redirection</code> the content type of the fetched content must
 * additionally be <code>text/html</code>.
 *
 * @param page
 *          Path of the page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 * @throws Exception
 *           If building the URL or fetching the page fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
  URL target = new URL("http", "127.0.0.1", port, page);
  CrawlDatum datum = new CrawlDatum();

  // The raw response supplies the status code; the protocol output supplies
  // the content whose type is verified below.
  Response httpResponse = http.getResponse(target, datum, true);
  Content fetched = http.getProtocolOutput(new Text(target.toString()), datum).getContent();

  assertEquals("HTTP Status Code for " + target, expectedCode, httpResponse.getCode());

  // These pages are exempt from the content type check.
  boolean exemptFromContentTypeCheck = page.equals("/nonexists.html")
      || page.equals("/brokenpage.jsp")
      || page.equals("/redirection");
  if (exemptFromContentTypeCheck) {
    return;
  }
  assertEquals("ContentType " + target, "text/html", fetched.getContentType());
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestRTFParser method testIt.
/**
 * Loads the sample RTF file through the protocol selected for its
 * <code>file:</code> URL, parses it with the <code>parse-tika</code> parser
 * and checks the extracted text, the title and the Dublin Core subject.
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  String urlString = "file:" + sampleDir + fileSeparator + rtfFile;

  // Fetch the raw document content via the protocol plugin.
  Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
  Content content = protocol
      .getProtocolOutput(new Text(urlString), new CrawlDatum())
      .getContent();

  // Parse explicitly with parse-tika and pick the parse of this document.
  Parse parse = new ParseUtil(conf)
      .parseByExtensionId("parse-tika", content)
      .get(content.getUrl());

  String extractedText = parse.getText();
  Assert.assertEquals("The quick brown fox jumps over the lazy dog", extractedText.trim());

  String documentTitle = parse.getData().getTitle();
  Metadata parseMetadata = parse.getData().getParseMeta();
  Assert.assertEquals("test rft document", documentTitle);
  Assert.assertEquals("tests", parseMetadata.get(DublinCore.SUBJECT));
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class EncodingDetector method main.
/**
 * Command line entry point: reads the file given as the single argument as
 * raw bytes, wraps it into a fake <code>text/html</code> {@link Content} and
 * prints the encoding guessed for it.
 *
 * @param args
 *          A single argument (expected at args[0]): path of the file whose
 *          encoding should be guessed.
 * @throws IOException
 *           If the file cannot be opened or read.
 */
public static void main(String[] args) throws IOException {
  if (args.length != 1) {
    System.err.println("Usage: EncodingDetector <file>");
    System.exit(1);
  }
  Configuration conf = NutchConfiguration.create();
  // Reuse the same configuration for the detector instead of creating a
  // second, identical one.
  EncodingDetector detector = new EncodingDetector(conf);

  // do everything as bytes; don't want any conversion
  ByteArrayOutputStream ostr = new ByteArrayOutputStream();
  // try-with-resources guarantees the file is closed even if reading fails
  try (BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0]))) {
    byte[] bytes = new byte[1000];
    int len;
    // read() may legitimately return fewer bytes than the buffer holds;
    // only -1 signals the end of the stream.
    while ((len = istr.read(bytes)) != -1) {
      ostr.write(bytes, 0, len);
    }
  }
  byte[] data = ostr.toByteArray();

  // make a fake Content
  Content content = new Content("", "", data, "text/html", new Metadata(), conf);
  detector.autoDetectClues(content, true);
  String encoding = detector.guessEncoding(content, conf.get("parser.character.encoding.default"));
  System.out.println("Guessed encoding: " + encoding);
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class FeedParser method main.
/**
 * Runs a command line version of this {@link Parser}: reads the given feed
 * file completely into memory, parses it as <code>application/rss+xml</code>
 * and prints key, parse data and parse text of every resulting entry.
 *
 * @param args
 *          A single argument (expected at args[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);

  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  // try-with-resources closes the stream even when readFully() throws
  try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
    in.readFully(bytes);
  }

  ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class FeedParser method addToMap.
/**
 * Turns one feed entry into a parse record and stores it in
 * <code>parseResult</code> under the entry's link.
 * <p>
 * The link is first normalized (outlink scope) and filtered; the entry is
 * skipped when the link is rejected or when normalizing/filtering throws.
 * The entry text is taken from the description or, if that yields no value,
 * from the concatenated values of the entry's contents. The text is then
 * handed to the first parser registered for the content type found in the
 * content metadata. If a parse is obtained, its text, outlinks and metadata
 * are stored with a success status; otherwise the raw text is stored with a
 * failure status.
 *
 * @param parseResult
 *          Result the entry's parse is added to.
 * @param feed
 *          Feed the entry belongs to; passed on to <code>addFields</code>.
 * @param feedLink
 *          Link of the feed, recorded as <code>feed</code> in the parse
 *          metadata when not null.
 * @param entry
 *          Feed entry to convert.
 * @param content
 *          Content of the feed; its metadata is used (and modified) here.
 */
private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink, SyndEntry entry, Content content) {
  String link = entry.getLink();
  Metadata parseMeta = new Metadata();
  Metadata contentMeta = content.getMetadata();
  SyndContent description = entry.getDescription();

  try {
    link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
    if (link != null) {
      link = filters.filter(link);
    }
  } catch (Exception e) {
    // NOTE(review): the failure is only dumped to stderr and the entry is
    // dropped - consider routing this through the class logger.
    e.printStackTrace();
    return;
  }
  if (link == null) {
    // link was rejected by the normalizers or filters
    return;
  }

  String title = stripTags(entry.getTitleEx());
  if (feedLink != null) {
    parseMeta.set("feed", feedLink);
  }
  addFields(parseMeta, contentMeta, feed, entry);

  // Some item descriptions contain markup, so the content type held in the
  // content metadata at this point (presumably set by addFields - confirm)
  // selects another plugin to parse them; it is removed again below.
  String contentType = contentMeta.get(Response.CONTENT_TYPE);

  String text = (description != null) ? description.getValue() : null;
  if (text == null) {
    // no usable description: fall back to the concatenated entry contents
    StringBuilder joined = new StringBuilder();
    for (Object part : entry.getContents()) {
      joined.append(((SyndContent) part).getValue());
    }
    text = joined.toString();
  }

  Parse parse = null;
  try {
    Parser parser = parserFactory.getParsers(contentType, link)[0];
    // NOTE(review): getBytes() encodes with the default charset.
    Content entryContent = new Content(link, link, text.getBytes(), contentType, contentMeta, conf);
    parse = parser.getParse(entryContent).get(link);
  } catch (ParserNotFound e) {
    /* ignore: the entry is stored unparsed below */
  }

  if (parse == null) {
    // no parse available: keep the raw text and flag the entry as failed
    contentMeta.remove(Response.CONTENT_TYPE);
    parseResult.put(link, new ParseText(text),
        new ParseData(ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta, parseMeta));
    return;
  }

  ParseData data = parse.getData();
  data.getContentMeta().remove(Response.CONTENT_TYPE);
  mergeMetadata(data.getParseMeta(), parseMeta);
  parseResult.put(link, new ParseText(parse.getText()),
      new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data.getContentMeta(), data.getParseMeta()));
}
Aggregations