use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.
the class TestMSWordParser method getTextContent.
/**
 * Loads a sample file through the protocol resolved for its {@code file:}
 * URL, parses the fetched content with the {@code parse-tika} extension and
 * returns the text of the resulting parse.
 *
 * @param fileName
 *          name of the sample file, resolved against {@code sampleDir}
 * @return the text of the parse stored under the content's own URL
 * @throws ProtocolException
 *           declared for failures raised by the protocol layer
 * @throws ParseException
 *           declared for failures raised by the parser layer
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  final String location = "file:" + sampleDir + fileSeparator + fileName;

  // Fetch the raw content of the sample file.
  final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
  final Content fetched = fetcher.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();

  // Parse it and pick the entry registered under the content's URL.
  final ParseUtil parsers = new ParseUtil(conf);
  final Parse parsed = parsers.parseByExtensionId("parse-tika", fetched).get(fetched.getUrl());
  return parsed.getText();
}
use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.
the class TestPdfParser method testIt.
/**
 * Fetches every entry of {@code sampleFiles} through the protocol resolved
 * for its {@code file:} URL, parses it with the {@code parse-tika} extension
 * and asserts that the extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException
 *           declared for failures raised by the protocol layer
 * @throws ParseException
 *           declared for failures raised by the parser layer
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  // The configuration does not depend on the sample file, so build it once.
  Configuration conf = NutchConfiguration.create();
  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    int index = parse.getText().indexOf(expectedText);
    // indexOf returns 0 when the text starts with the expected string, which
    // is still a match; only -1 means the expected text is absent.
    Assert.assertTrue("Expected text not found in parsed text of " + urlString, index >= 0);
  }
}
use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.
the class TestProtocolFile method setContentType.
/**
 * Fetches the given sample file via its {@code file:} URL and verifies that
 * the fetch status is {@code ProtocolStatus.SUCCESS} and that
 * {@code expectedMimeType} is reported both as the content type of the
 * fetched content and under the <code>Response.CONTENT_TYPE</code> key of its
 * metadata.
 *
 * @param testTextFile
 *          name of the sample file, resolved against {@code sampleDir}
 * @throws ProtocolException
 *           declared for failures raised by the protocol layer
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  Assert.assertNotNull(urlString);

  final ProtocolFactory factory = new ProtocolFactory(conf);
  final ProtocolOutput output = factory.getProtocol(urlString).getProtocolOutput(new Text(urlString), datum);
  Assert.assertNotNull(output);

  // The fetch itself must have succeeded before the content is inspected.
  final ProtocolStatus status = output.getStatus();
  final String statusMismatch = "Status code: [" + status.getCode() + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: [" + status.getArgs() + "]";
  Assert.assertEquals(statusMismatch, ProtocolStatus.SUCCESS, status.getCode());

  // Content type as exposed directly on the content object.
  Assert.assertNotNull(output.getContent());
  final String contentType = output.getContent().getContentType();
  Assert.assertNotNull(contentType);
  Assert.assertEquals(expectedMimeType, contentType);

  // Content type as recorded in the content's metadata.
  Assert.assertNotNull(output.getContent().getMetadata());
  final String metadataType = output.getContent().getMetadata().get(Response.CONTENT_TYPE);
  Assert.assertEquals(expectedMimeType, metadataType);
}
use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.
the class IndexingFiltersChecker method getProtocolOutput.
/**
 * Resolves the protocol for {@code url} from a {@link ProtocolFactory} built
 * on this object's configuration and asks it for the protocol output of that
 * URL.
 *
 * @param url
 *          the URL to fetch
 * @param datum
 *          the crawl datum handed to the protocol together with the URL
 * @return whatever the resolved protocol returns for the URL
 * @throws Exception
 *           propagated from protocol lookup or from the fetch
 */
protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
  final Protocol handler = new ProtocolFactory(getConf()).getProtocol(url);
  return handler.getProtocolOutput(new Text(url), datum);
}
use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.
the class TestFeedParser method testParseFetchChannel.
/**
 * Runs the {@code feed} parser extension over each sample RSS file and checks
 * that the {@link ParseResult} holds exactly 3 entries, with keys covering:
 * <ul>
 * <li>http://www-scf.usc.edu/~mattmann/</li>
 * <li>http://www.nutch.org/</li>
 * <li>the {@code file:} URL of the sample file itself</li>
 * </ul>
 * Every entry must also carry a non-null {@link Parse} with non-null data.
 *
 * @throws ProtocolNotFound
 *           If the {@link Protocol} layer cannot be loaded (required to fetch
 *           the {@link Content} for the RSS file).
 * @throws ParseException
 *           If the parser layer cannot be loaded.
 */
@Test
public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int fileIdx = 0; fileIdx < sampleFiles.length; fileIdx++) {
    // Normalise Windows separators so the path forms a usable file: URL.
    final String urlString = ("file:" + sampleDir + fileSeparator + sampleFiles[fileIdx]).replace('\\', '/');

    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final ParseResult parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
    Assert.assertEquals(3, parseResult.size());

    boolean sawMattmannLink = false;
    boolean sawNutchLink = false;
    boolean sawFeedUrl = false;

    final Iterator<Map.Entry<Text, Parse>> entries = parseResult.iterator();
    while (entries.hasNext()) {
      final Map.Entry<Text, Parse> entry = entries.next();
      final String key = entry.getKey().toString();

      // The three candidates never coincide (urlString always starts with
      // "file:"), so each key can raise at most one flag.
      sawMattmannLink |= key.equals("http://www-scf.usc.edu/~mattmann/");
      sawNutchLink |= key.equals("http://www.nutch.org/");
      sawFeedUrl |= key.equals(urlString);

      final Parse parsed = entry.getValue();
      Assert.assertNotNull(parsed);
      Assert.assertNotNull(parsed.getData());
    }

    Assert.assertTrue("Outlinks read from sample rss file are not correct!", sawMattmannLink && sawNutchLink && sawFeedUrl);
  }
}
Aggregations