use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class MimeTypeIndexingFilterTest method testMissingConfigFile.
@Test
public void testMissingConfigFile() throws Exception {
  String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
  Assert.assertEquals(String.format(
      "Property %s must not be present in the configuration file",
      MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
  filter.setConf(conf);
  // property not set, so in this case all documents must pass the filter
  for (int i = 0; i < parses.length; i++) {
    NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertNotNull("All documents must be allowed by default", doc);
  }
}
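For contrast, a minimal sketch of the opposite case (not part of the snippet above; the rules-file name and its contents are assumptions): when MIMEFILTER_REGEX_FILE does point to a rules file, the filter is expected to drop documents whose MIME type matches a deny rule by returning null from filter().

// Sketch only: "mimetype-filter.txt" is a hypothetical rules file.
conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "mimetype-filter.txt");
filter.setConf(conf);
NutchDocument doc = filter.filter(new NutchDocument(), parses[0],
    new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
// doc is expected to be null if the document's MIME type is denied by the rules,
// non-null otherwise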
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class TestExtParser method setUp.
@Before
public void setUp() throws ProtocolException, IOException {
  // prepare a temp file with expectedText as its content
  // This system property is defined in ./src/plugin/build-plugin.xml
  String path = System.getProperty("test.data");
  if (path != null) {
    File tempDir = new File(path);
    if (!tempDir.exists())
      tempDir.mkdir();
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", tempDir);
  } else {
    // otherwise create it in java.io.tmpdir
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
  }
  urlString = tempFile.toURI().toURL().toString();
  FileOutputStream fos = new FileOutputStream(tempFile);
  fos.write(expectedText.getBytes());
  fos.close();
  // get nutch content
  Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  protocol = null;
}
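A companion tearDown is not shown in this snippet; a minimal sketch (an assumption about the surrounding test class, which holds the tempFile field used above) would simply remove the temp file again:

@After
public void tearDown() {
  // clean up the temp file created in setUp (sketch; assumes the tempFile field above)
  if (tempFile != null && tempFile.exists()) {
    tempFile.delete();
  }
}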
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class TestRTFParser method testIt.
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  urlString = "file:" + sampleDir + fileSeparator + rtfFile;
  protocol = new ProtocolFactory(conf).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
  parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
  String text = parse.getText();
  Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
  String title = parse.getData().getTitle();
  Metadata meta = parse.getData().getParseMeta();
  Assert.assertEquals("test rft document", title);
  Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
}
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class TestZipParser method testIt.
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(content.getUrl());
    Assert.assertTrue("Extracted text does not start with <" + expectedText + ">: <"
        + parse.getText() + ">", parse.getText().startsWith(expectedText));
  }
}
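The TestExtParser, TestRTFParser, and TestZipParser snippets above all follow the same fetch-then-parse pattern around CrawlDatum. Purely for illustration, it could be factored into a helper such as the sketch below (the method name is ours, not Nutch's; everything it calls appears in the snippets above):

// Sketch of the shared pattern: fetch a local file through the protocol layer
// (an empty CrawlDatum is enough for tests) and parse it with a given plugin id.
private static Parse fetchAndParse(Configuration conf, String urlString, String pluginId)
    throws ProtocolException, ParseException {
  Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
  Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
      .getContent();
  return new ParseUtil(conf).parseByExtensionId(pluginId, content).get(content.getUrl());
}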
use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
the class File method main.
/**
* Quick way for running this class. Useful for debugging.
*/
public static void main(String[] args) throws Exception {
  int maxContentLength = Integer.MIN_VALUE;
  boolean dumpContent = false;
  String urlString = null;
  String usage = "Usage: File [-maxContentLength L] [-dumpContent] url";
  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
  for (int i = 0; i < args.length; i++) {
    if (args[i].equals("-maxContentLength")) {
      maxContentLength = Integer.parseInt(args[++i]);
    } else if (args[i].equals("-dumpContent")) {
      dumpContent = true;
    } else if (i != args.length - 1) {
      System.err.println(usage);
      System.exit(-1);
    } else {
      urlString = args[i];
    }
  }
  File file = new File();
  file.setConf(NutchConfiguration.create());
  // set maxContentLength
  if (maxContentLength != Integer.MIN_VALUE)
    file.setMaxContentLength(maxContentLength);
  // set log level
  // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
  ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
  Content content = output.getContent();
  System.err.println("URL: " + content.getUrl());
  System.err.println("Status: " + output.getStatus());
  System.err.println("Content-Type: " + content.getContentType());
  System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
  System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
  String redirectLocation = content.getMetadata().get("Location");
  if (redirectLocation != null) {
    System.err.println("Location: " + redirectLocation);
  }
  if (dumpContent) {
    System.out.print(new String(content.getContent()));
  }
  file = null;
}
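As the Javadoc notes, main is only a quick debugging entry point. A possible invocation (a sketch; the file path is a placeholder, and we assume this is the protocol-file plugin's File class on the classpath):

// From the command line (placeholder path):
//   java org.apache.nutch.protocol.file.File -dumpContent file:///tmp/sample.txt
// Or programmatically, using the fully qualified name to avoid clashing with java.io.File:
org.apache.nutch.protocol.file.File.main(
    new String[] { "-dumpContent", "file:///tmp/sample.txt" });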