Usage of org.apache.nutch.protocol.Protocol in the Apache Nutch project.
Class TestImageMetadata, method testIt.
/**
 * For every entry in {@code sampleFiles}, fetches the file through the
 * protocol resolved for its {@code file:} URL, parses the fetched content
 * with the {@code parse-tika} extension and checks that the parse metadata
 * reports a width of {@code "121"} and a height of {@code "48"}.
 *
 * @throws ProtocolException declared for the protocol lookup/fetch calls
 * @throws ParseException declared for the parsing calls
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String sampleUrl = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // A fresh configuration is created for each sample file.
    final Configuration nutchConf = NutchConfiguration.create();

    // Fetch the raw content of the sample file.
    final ProtocolFactory protocolFactory = new ProtocolFactory(nutchConf);
    final Protocol fileProtocol = protocolFactory.getProtocol(sampleUrl);
    final Content fetched = fileProtocol
        .getProtocolOutput(new Text(sampleUrl), new CrawlDatum())
        .getContent();

    // Parse it with the Tika plugin and pick the parse stored under the content URL.
    final ParseUtil parseUtil = new ParseUtil(nutchConf);
    final Parse parsed = parseUtil
        .parseByExtensionId("parse-tika", fetched)
        .get(fetched.getUrl());

    Assert.assertEquals("121", parsed.getData().getMeta("width"));
    Assert.assertEquals("48", parsed.getData().getMeta("height"));
  }
}
Usage of org.apache.nutch.protocol.Protocol in the Apache Nutch project.
Class TestMSWordParser, method getTextContent.
/**
 * Fetches the named sample file through the protocol resolved for its
 * {@code file:} URL, parses it with the {@code parse-tika} extension and
 * returns the text of the resulting parse.
 *
 * @param fileName name of the file, appended to {@code sampleDir}
 * @return the text of the parse stored under the content URL
 * @throws ProtocolException declared for the protocol lookup/fetch calls
 * @throws ParseException declared for the parsing calls
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  final String sampleUrl = "file:" + sampleDir + fileSeparator + fileName;

  // Fetch the raw content of the sample file.
  final ProtocolFactory protocolFactory = new ProtocolFactory(conf);
  final Protocol fileProtocol = protocolFactory.getProtocol(sampleUrl);
  final Content fetched = fileProtocol
      .getProtocolOutput(new Text(sampleUrl), new CrawlDatum())
      .getContent();

  // Parse with the Tika plugin and select the parse keyed by the content URL.
  final ParseUtil parseUtil = new ParseUtil(conf);
  final Parse parsed = parseUtil
      .parseByExtensionId("parse-tika", fetched)
      .get(fetched.getUrl());

  return parsed.getText();
}
Usage of org.apache.nutch.protocol.Protocol in the Apache Nutch project.
Class TestPdfParser, method testIt.
/**
 * For every entry in {@code sampleFiles}, fetches the file through the
 * protocol resolved for its {@code file:} URL, parses the fetched content
 * with the {@code parse-tika} extension and checks that the parsed text
 * contains {@code expectedText}.
 *
 * @throws ProtocolException declared for the protocol lookup/fetch calls
 * @throws ParseException declared for the parsing calls
 */
@Test
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;
  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
    Configuration conf = NutchConfiguration.create();
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
    int index = parse.getText().indexOf(expectedText);
    // indexOf returns -1 when the text is absent; a match at offset 0 is
    // still a match, so the check must be ">= 0" rather than "> 0".
    Assert.assertTrue("Expected text [" + expectedText + "] not found in text parsed from ["
        + urlString + "]", index >= 0);
  }
}
Usage of org.apache.nutch.protocol.Protocol in the Apache Nutch project.
Class TestProtocolFile, method setContentType.
/**
 * Fetches the given sample file via the protocol resolved for its
 * {@code file:} URL and verifies that the fetch succeeded and that both the
 * content type of the returned content and its
 * <code>Response.CONTENT_TYPE</code> metadata entry equal
 * {@code expectedMimeType}.
 *
 * @param testTextFile name of the file, appended to {@code sampleDir}
 * @throws ProtocolException declared for the protocol lookup/fetch calls
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String fileUrl = "file:" + sampleDir + fileSeparator + testTextFile;
  Assert.assertNotNull(fileUrl);

  // Resolve the protocol for the URL and fetch the file.
  final ProtocolFactory protocolFactory = new ProtocolFactory(conf);
  final Protocol fileProtocol = protocolFactory.getProtocol(fileUrl);
  final ProtocolOutput fetchResult = fileProtocol.getProtocolOutput(new Text(fileUrl), datum);
  Assert.assertNotNull(fetchResult);

  // The fetch must report success; the message carries the actual status details.
  final String statusMessage = "Status code: [" + fetchResult.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS
      + "]: args: [" + fetchResult.getStatus().getArgs() + "]";
  Assert.assertEquals(statusMessage, ProtocolStatus.SUCCESS, fetchResult.getStatus().getCode());

  // Content type as exposed directly on the content object.
  Assert.assertNotNull(fetchResult.getContent());
  Assert.assertNotNull(fetchResult.getContent().getContentType());
  Assert.assertEquals(expectedMimeType, fetchResult.getContent().getContentType());

  // Content type as recorded in the content metadata.
  Assert.assertNotNull(fetchResult.getContent().getMetadata());
  Assert.assertEquals(expectedMimeType,
      fetchResult.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
Usage of org.apache.nutch.protocol.Protocol in the Apache Nutch project.
Class IndexingFiltersChecker, method getProtocolOutput.
/**
 * Resolves the protocol for the given URL using a {@link ProtocolFactory}
 * built from {@code getConf()} and returns the output of fetching that URL.
 *
 * @param url the URL to fetch
 * @param datum the crawl datum passed through to the protocol
 * @return the output produced by the protocol for the URL
 * @throws Exception if protocol lookup or fetching fails
 */
protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
  final Protocol resolved = new ProtocolFactory(getConf()).getProtocol(url);
  return resolved.getProtocolOutput(new Text(url), datum);
}
Aggregations