use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestZipParser method testIt.
/**
 * Fetches each sample file through the protocol resolved for its
 * {@code file:} URL, parses the fetched content with the {@code parse-zip}
 * extension and asserts that the extracted text starts with
 * {@code expectedText}.
 *
 * @throws ProtocolException if the protocol lookup fails
 * @throws ParseException if parsing the fetched content fails
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  // Both helpers are built from conf alone, so create them once instead of
  // once per sample file.
  ProtocolFactory protocolFactory = new ProtocolFactory(conf);
  ParseUtil parseUtil = new ParseUtil(conf);
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = protocolFactory.getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = parseUtil.parseByExtensionId("parse-zip", content).get(content.getUrl());
    // Read the text once; it is used for both the check and the failure message.
    String text = parse.getText();
    Assert.assertTrue("Extracted text does not start with <" + expectedText + ">: <" + text + ">", text.startsWith(expectedText));
  }
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestRegexParseFilter method testPositiveFilter.
/**
 * Applies a {@code RegexParseFilter}, configured with the
 * {@code regex-parsefilter.txt} sample file, to a small HTML document and
 * expects the parse metadata entries {@code first} and {@code second} to
 * both be {@code "true"}.
 *
 * @throws Exception if building the content or running the filter fails
 */
public void testPositiveFilter() throws Exception {
  final String pageUrl = "http://nutch.apache.org/";
  final String markup = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";

  // Point the filter at the sample rule file before handing it the configuration.
  Configuration configuration = NutchConfiguration.create();
  configuration.set("parsefilter.regex.file", SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  RegexParseFilter regexFilter = new RegexParseFilter();
  regexFilter.setConf(configuration);

  Content page = new Content(pageUrl, pageUrl, markup.getBytes("UTF-8"), "text/html", new Metadata(), configuration);
  Parse parsed = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());
  ParseResult parseResult = ParseResult.createParseResult(pageUrl, parsed);

  // The returned result is not inspected; the assertions below read the
  // metadata of the parse object that was passed in.
  regexFilter.filter(page, parseResult, null, null);

  Metadata parseMeta = parsed.getData().getParseMeta();
  assertEquals("true", parseMeta.get("first"));
  assertEquals("true", parseMeta.get("second"));
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class TestRegexParseFilter method testNegativeFilter.
/**
 * Applies a {@code RegexParseFilter}, configured with the
 * {@code regex-parsefilter.txt} sample file, to a small HTML document and
 * expects the parse metadata entries {@code first} and {@code second} to
 * both be {@code "false"}.
 *
 * @throws Exception if building the content or running the filter fails
 */
public void testNegativeFilter() throws Exception {
  final String pageUrl = "http://nutch.apache.org/";
  final String markup = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";

  // Point the filter at the sample rule file before handing it the configuration.
  Configuration configuration = NutchConfiguration.create();
  configuration.set("parsefilter.regex.file", SAMPLES + SEPARATOR + "regex-parsefilter.txt");
  RegexParseFilter regexFilter = new RegexParseFilter();
  regexFilter.setConf(configuration);

  Content page = new Content(pageUrl, pageUrl, markup.getBytes("UTF-8"), "text/html", new Metadata(), configuration);
  Parse parsed = new ParseImpl("nutch this is the extracted text bla", new ParseData());
  ParseResult parseResult = ParseResult.createParseResult(pageUrl, parsed);

  // The returned result is not inspected; the assertions below read the
  // metadata of the parse object that was passed in.
  regexFilter.filter(page, parseResult, null, null);

  Metadata parseMeta = parsed.getData().getParseMeta();
  assertEquals("false", parseMeta.get("first"));
  assertEquals("false", parseMeta.get("second"));
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class Ftp method main.
/**
 * For debugging. Parses the command line, fetches the given URL with a
 * fresh {@code Ftp} instance and prints the content type, content length
 * and last-modified metadata to standard error. With {@code -dumpContent}
 * the fetched bytes are also printed to standard output.
 *
 * <p>The usage message is printed and the JVM exits with status -1 when no
 * arguments are given, when an option that needs a value is the last
 * argument, when a non-option argument is not the last argument, or when
 * no URL was supplied.
 *
 * @param args run with no args for help
 * @throws Exception if there is an error running this program
 */
public static void main(String[] args) throws Exception {
  int timeout = Integer.MIN_VALUE;
  int maxContentLength = Integer.MIN_VALUE;
  @SuppressWarnings("unused") String logLevel = "info";
  boolean followTalk = false;
  boolean keepConnection = false;
  boolean dumpContent = false;
  String urlString = null;
  String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    boolean needsValue = arg.equals("-logLevel") || arg.equals("-timeout") || arg.equals("-maxContentLength");
    if (needsValue && i == args.length - 1) {
      // The option's value is missing; reading args[i + 1] would run past
      // the end of the array.
      System.err.println(usage);
      System.exit(-1);
    }
    if (arg.equals("-logLevel")) {
      logLevel = args[++i];
    } else if (arg.equals("-followTalk")) {
      followTalk = true;
    } else if (arg.equals("-keepConnection")) {
      keepConnection = true;
    } else if (arg.equals("-timeout")) {
      // Given in seconds on the command line; stored multiplied by 1000.
      timeout = Integer.parseInt(args[++i]) * 1000;
    } else if (arg.equals("-maxContentLength")) {
      maxContentLength = Integer.parseInt(args[++i]);
    } else if (arg.equals("-dumpContent")) {
      dumpContent = true;
    } else if (i != args.length - 1) {
      // Only the final argument may be a non-option (the URL).
      System.err.println(usage);
      System.exit(-1);
    } else {
      urlString = arg;
    }
  }
  if (urlString == null) {
    // Every argument was consumed as an option, so there is nothing to fetch.
    System.err.println(usage);
    System.exit(-1);
  }
  Ftp ftp = new Ftp();
  ftp.setFollowTalk(followTalk);
  ftp.setKeepConnection(keepConnection);
  // Integer.MIN_VALUE marks "not given on the command line"; only override
  // the instance's settings when a value was supplied.
  if (timeout != Integer.MIN_VALUE) {
    ftp.setTimeout(timeout);
  }
  if (maxContentLength != Integer.MIN_VALUE) {
    ftp.setMaxContentLength(maxContentLength);
  }
  // set log level (currently disabled, so -logLevel is parsed but not applied)
  // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
  Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  System.err.println("Content-Type: " + content.getContentType());
  System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
  System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
  if (dumpContent) {
    System.out.print(new String(content.getContent()));
  }
}
use of org.apache.nutch.protocol.Content in project nutch by apache.
the class File method main.
/**
 * Quick way for running this class. Useful for debugging. Parses the
 * command line, fetches the given URL with a fresh {@code File} instance
 * and prints the URL, status, content type, content length, last-modified
 * value and (when present) the {@code Location} metadata entry to standard
 * error. With {@code -dumpContent} the fetched bytes are also printed to
 * standard output.
 *
 * <p>The usage message is printed and the JVM exits with status -1 when no
 * arguments are given, when {@code -maxContentLength} is the last argument
 * (its value is missing), when a non-option argument is not the last
 * argument, or when no URL was supplied.
 *
 * @param args run with no args to print help
 * @throws Exception if there is a fatal error running this class
 * with the given input
 */
public static void main(String[] args) throws Exception {
  int maxContentLength = Integer.MIN_VALUE;
  boolean dumpContent = false;
  String urlString = null;
  String usage = "Usage: File [-maxContentLength L] [-dumpContent] url";
  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    if (arg.equals("-maxContentLength")) {
      if (i == args.length - 1) {
        // The option's value is missing; reading args[i + 1] would run
        // past the end of the array.
        System.err.println(usage);
        System.exit(-1);
      }
      maxContentLength = Integer.parseInt(args[++i]);
    } else if (arg.equals("-dumpContent")) {
      dumpContent = true;
    } else if (i != args.length - 1) {
      // Only the final argument may be a non-option (the URL).
      System.err.println(usage);
      System.exit(-1);
    } else {
      urlString = arg;
    }
  }
  if (urlString == null) {
    // Every argument was consumed as an option, so there is nothing to fetch.
    System.err.println(usage);
    System.exit(-1);
  }
  File file = new File();
  file.setConf(NutchConfiguration.create());
  // Integer.MIN_VALUE marks "not given on the command line"; only override
  // the instance's setting when a value was supplied.
  if (maxContentLength != Integer.MIN_VALUE) {
    file.setMaxContentLength(maxContentLength);
  }
  // set log level (currently disabled)
  // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
  ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
  Content content = output.getContent();
  System.err.println("URL: " + content.getUrl());
  System.err.println("Status: " + output.getStatus());
  System.err.println("Content-Type: " + content.getContentType());
  System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
  System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
  String redirectLocation = content.getMetadata().get("Location");
  if (redirectLocation != null) {
    System.err.println("Location: " + redirectLocation);
  }
  if (dumpContent) {
    System.out.print(new String(content.getContent()));
  }
}
Aggregations