Use of org.apache.nutch.protocol.Content in project Nutch by Apache.
From the class TestHeadingsParseFilter, the method testExtractHeadingFromNestedNodes.
@Test
public void testExtractHeadingFromNestedNodes() throws IOException, SAXException {
  // Configure the headings plugin to extract h1 and h2 tags.
  conf.setStrings("headings", "h1", "h2");
  HtmlParseFilter filter = new HeadingsParseFilter();
  filter.setConf(conf);
  Content content = new Content("http://www.foo.com/", "http://www.foo.com/",
      "".getBytes("UTF-8"), "text/html; charset=UTF-8", new Metadata(), conf);
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  ParseResult parseResult = ParseResult.createParseResult("http://www.foo.com/", parse);
  HTMLMetaTags metaTags = new HTMLMetaTags();
  // Build a DOM fragment containing an h1 with a nested span.
  DOMFragmentParser parser = new DOMFragmentParser();
  DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
  parser.parse(new InputSource(new ByteArrayInputStream(
      ("<html><head><title>test header with span element</title></head>"
          + "<body><h1>header with <span>span element</span></h1></body></html>").getBytes())), node);
  parseResult = filter.filter(content, parseResult, metaTags, node);
  Assert.assertEquals("The h1 tag must include the content of the inner span node",
      "header with span element",
      parseResult.get(content.getUrl()).getData().getParseMeta().get("h1"));
}
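All of these examples construct Content through the same six-argument constructor: the page URL, the base URL (used downstream to resolve relative links), the raw bytes, the MIME type, a Metadata container, and the Hadoop Configuration. A minimal sketch with placeholder values (the URL and markup are arbitrary):

  Content content = new Content(
      "http://example.com/page.html",                           // page URL
      "http://example.com/",                                    // base URL for resolving relative links
      "<html><body/></html>".getBytes(StandardCharsets.UTF_8),  // raw page bytes
      "text/html",                                              // MIME type
      new Metadata(),                                           // empty content metadata
      conf);                                                    // Hadoop Configuration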
Use of org.apache.nutch.protocol.Content in project Nutch by Apache.
From the class TestAny23ParseFilter, the method extract.
public String[] extract(String urlString, File file, String contentType) {
  try {
    System.out.println(urlString);
    // Fetch the raw content through the protocol layer.
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    content.setContentType(contentType);
    // Parse the content and return the triples extracted by the Any23 plugin.
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
  } catch (Exception e) {
    e.printStackTrace();
    Assert.fail(e.toString());
  }
  return null;
}
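For context, a call site for this helper might look like the sketch below; the sample directory, file name, and assertions are hypothetical stand-ins, not quoted from the actual test class:

  @Test
  public void testExtractTriples() throws Exception {
    // Hypothetical sample document containing embedded RDFa or microdata;
    // sampleDir is an assumed field pointing at the plugin's test resources.
    File file = new File(sampleDir, "embedded_rdfa.html");
    String[] triples = extract("file://" + file.getAbsolutePath(), file, "text/html");
    Assert.assertNotNull("expected at least one extracted triple", triples);
    Assert.assertTrue(triples.length > 0);
  }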
Use of org.apache.nutch.protocol.Content in project Nutch by Apache.
From the class TestCCParseFilter, the method pageTest.
public void pageTest(File file, String url, String license, String location, String type) throws Exception {
  String contentType = "text/html";
  // Read the whole sample file into a byte array.
  InputStream in = new FileInputStream(file);
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  byte[] buffer = new byte[1024];
  int i;
  while ((i = in.read(buffer)) != -1) {
    out.write(buffer, 0, i);
  }
  in.close();
  byte[] bytes = out.toByteArray();
  // Parse the page and verify the extracted Creative Commons metadata.
  Configuration conf = NutchConfiguration.create();
  Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
  Metadata metadata = parse.getData().getParseMeta();
  Assert.assertEquals(license, metadata.get("License-Url"));
  Assert.assertEquals(location, metadata.get("License-Location"));
  Assert.assertEquals(type, metadata.get("Work-Type"));
}
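A driver for this helper could look like the following sketch; the file name and the expected license values are illustrative assumptions, not quoted from the real TestCCParseFilter:

  public void testPages() throws Exception {
    // Hypothetical sample page carrying a Creative Commons license link in an anchor tag;
    // testDir is assumed to point at the plugin's sample data directory.
    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
  }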
Use of org.apache.nutch.protocol.Content in project Nutch by Apache.
From the class TestFetcher, the method testFetch.
@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
  // Generate the seed list.
  ArrayList<String> urls = new ArrayList<String>();
  addUrl(urls, "index.html");
  addUrl(urls, "pagea.html");
  addUrl(urls, "pageb.html");
  addUrl(urls, "dup_of_pagea.html");
  addUrl(urls, "nested_spider_trap.html");
  addUrl(urls, "exception.html");
  CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);

  // Inject the seeds into the CrawlDb.
  Injector injector = new Injector(conf);
  injector.inject(crawldbPath, urlPath);

  // Generate a fetch segment.
  Generator g = new Generator(conf);
  Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE, false, false);

  long time = System.currentTimeMillis();

  // Fetch with parsing enabled (fetcher.parse set to true).
  Fetcher fetcher = new Fetcher(conf);
  conf.setBoolean("fetcher.parse", true);
  fetcher.fetch(generatedSegment[0], 1);
  time = System.currentTimeMillis() - time;

  // Verify politeness: the total time taken should exceed (number_of_pages + 1) * delay.
  int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay", 5));
  Assert.assertTrue(time > minimumTime);

  // Verify the fetched content.
  Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME), "part-r-00000/data");
  @SuppressWarnings("resource")
  SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
  ArrayList<String> handledurls = new ArrayList<String>();
  READ_CONTENT: do {
    Text key = new Text();
    Content value = new Content();
    if (!reader.next(key, value))
      break READ_CONTENT;
    String contentString = new String(value.getContent());
    if (contentString.indexOf("Nutch fetcher test page") != -1) {
      handledurls.add(key.toString());
    }
  } while (true);
  reader.close();

  Collections.sort(urls);
  Collections.sort(handledurls);
  // Verify that enough pages were handled.
  Assert.assertEquals(urls.size(), handledurls.size());
  // Verify that the correct pages were handled.
  Assert.assertTrue(handledurls.containsAll(urls));
  Assert.assertTrue(urls.containsAll(handledurls));
  handledurls.clear();

  // Verify the parse data.
  Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
  reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
  READ_PARSE_DATA: do {
    Text key = new Text();
    ParseData value = new ParseData();
    if (!reader.next(key, value))
      break READ_PARSE_DATA;
    // Make sure every record carries the "nutch.segment.name" and
    // "nutch.content.digest" keys in its content metadata.
    Metadata contentMeta = value.getContentMeta();
    if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
      handledurls.add(key.toString());
    }
  } while (true);
  Collections.sort(handledurls);
  Assert.assertEquals(urls.size(), handledurls.size());
  Assert.assertTrue(handledurls.containsAll(urls));
  Assert.assertTrue(urls.containsAll(handledurls));
}
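To make the politeness check concrete: with the six seed URLs above, and assuming the configured delay equals the fallback default of 5 seconds used in the getFloat call, minimumTime works out to (6 + 1) * 1000 * 5 = 35,000 ms. The assertion therefore only passes if the fetcher really waited between successive requests to the same server.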
Use of org.apache.nutch.protocol.Content in project Nutch by Apache.
From the class TestIndexerMapReduce, the method testBinaryContentBase64.
/**
 * Test indexing of base64-encoded binary content.
 */
@Test
public void testBinaryContentBase64() {
  configuration = NutchConfiguration.create();
  configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);
  Charset[] testCharsets = { StandardCharsets.UTF_8,
      Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") };
  for (Charset charset : testCharsets) {
    LOG.info("Testing indexing binary content as base64 for charset {}", charset.name());
    String htmlDoc = testHtmlDoc;
    if (charset != StandardCharsets.UTF_8) {
      htmlDoc = htmlDoc.replaceAll("utf-8", charset.name());
      if (charset.name().equalsIgnoreCase("iso-8859-1")) {
        // Western-European character set: remove the Czech content.
        htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", "");
      } else if (charset.name().equalsIgnoreCase("iso-8859-2")) {
        // Eastern-European character set: remove the French content.
        htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", "");
      }
    }
    Content content = new Content(testUrl, testUrl, htmlDoc.getBytes(charset),
        htmlContentType, htmlMeta, configuration);
    NutchDocument doc = runIndexer(crawlDatumDbFetched, crawlDatumFetchSuccess,
        parseText, parseData, content);
    assertNotNull("No NutchDocument indexed", doc);
    String binaryContentBase64 = (String) doc.getField("binaryContent").getValues().get(0);
    LOG.info("binary content (base64): {}", binaryContentBase64);
    String binaryContent = new String(Base64.decodeBase64(binaryContentBase64), charset);
    LOG.info("binary content (decoded): {}", binaryContent);
    assertEquals("Binary content (" + charset + ") not correctly saved as base64",
        htmlDoc, binaryContent);
  }
}
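The final assertion relies on base64 round-tripping being lossless for arbitrary bytes, independent of the source charset. A minimal standalone sketch of that property, using the same commons-codec Base64 helpers (the sample string is arbitrary; java.util.Arrays and org.apache.commons.codec.binary.Base64 are assumed to be imported):

  // Round trip: bytes -> base64 string -> bytes is lossless for any input.
  byte[] original = "žluťoučký kůň".getBytes(Charset.forName("iso-8859-2"));
  String encoded = Base64.encodeBase64String(original);
  byte[] decoded = Base64.decodeBase64(encoded);
  assertTrue(Arrays.equals(original, decoded));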