Example 6 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

The class TestHeadingsParseFilter, method testExtractHeadingFromNestedNodes.

@Test
public void testExtractHeadingFromNestedNodes() throws IOException, SAXException {
    conf.setStrings("headings", "h1", "h2");
    HtmlParseFilter filter = new HeadingsParseFilter();
    filter.setConf(conf);
    Content content = new Content("http://www.foo.com/", "http://www.foo.com/", "".getBytes("UTF8"), "text/html; charset=UTF-8", new Metadata(), conf);
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    ParseResult parseResult = ParseResult.createParseResult("http://www.foo.com/", parse);
    HTMLMetaTags metaTags = new HTMLMetaTags();
    DOMFragmentParser parser = new DOMFragmentParser();
    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
    parser.parse(new InputSource(new ByteArrayInputStream(("<html><head><title>test header with span element</title></head><body><h1>header with <span>span element</span></h1></body></html>").getBytes())), node);
    parseResult = filter.filter(content, parseResult, metaTags, node);
    Assert.assertEquals("The h1 tag must include the content of the inner span node", "header with span element", parseResult.get(content.getUrl()).getData().getParseMeta().get("h1"));
}
Also used: InputSource (org.xml.sax.InputSource), Metadata (org.apache.nutch.metadata.Metadata), DOMFragmentParser (org.cyberneko.html.parsers.DOMFragmentParser), HTMLDocumentImpl (org.apache.html.dom.HTMLDocumentImpl), ByteArrayInputStream (java.io.ByteArrayInputStream), Content (org.apache.nutch.protocol.Content), DocumentFragment (org.w3c.dom.DocumentFragment), Test (org.junit.Test)
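Note that the Content object here is created with an empty byte array: HeadingsParseFilter operates on the DocumentFragment handed to filter(), so only the DOM built by the DOMFragmentParser matters for the assertion on the "h1" parse metadata.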

Example 7 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

The class TestAny23ParseFilter, method extract.

public String[] extract(String urlString, File file, String contentType) {
    try {
        System.out.println(urlString);
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        content.setContentType(contentType);
        Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    return null;
}
Also used: ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory), ParseUtil (org.apache.nutch.parse.ParseUtil), Content (org.apache.nutch.protocol.Content), Parse (org.apache.nutch.parse.Parse), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Protocol (org.apache.nutch.protocol.Protocol), IOException (java.io.IOException), ParseException (org.apache.nutch.parse.ParseException)
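A hypothetical invocation of the extract() helper above; the file name, location, and assertions are assumptions rather than the actual test fixtures, and the protocol-file plugin is assumed to be enabled so the file: URL can be fetched:

// Hypothetical usage of extract() (file name and expected results are illustrative):
File sample = new File("src/test/resources/sample-microdata.html");
String url = "file://" + sample.getAbsolutePath();
String[] triples = extract(url, sample, "text/html");
Assert.assertNotNull("No triples returned", triples);
Assert.assertTrue("Expected at least one RDF triple", triples.length > 0);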

Example 8 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

The class TestCCParseFilter, method pageTest.

public void pageTest(File file, String url, String license, String location, String type) throws Exception {
    String contentType = "text/html";
    InputStream in = new FileInputStream(file);
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
        out.write(buffer, 0, i);
    }
    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();
    Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();
    Assert.assertEquals(license, metadata.get("License-Url"));
    Assert.assertEquals(location, metadata.get("License-Location"));
    Assert.assertEquals(type, metadata.get("Work-Type"));
}
Also used: NutchConfiguration (org.apache.nutch.util.NutchConfiguration), Configuration (org.apache.hadoop.conf.Configuration), ParseUtil (org.apache.nutch.parse.ParseUtil), Content (org.apache.nutch.protocol.Content), Parse (org.apache.nutch.parse.Parse), Metadata (org.apache.nutch.metadata.Metadata)
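A hypothetical call to the pageTest() helper above; the sample file name, URL, and expected license metadata values are assumptions, not the actual test fixtures:

// Hypothetical usage of pageTest() (file and expected values are illustrative):
pageTest(new File("src/test/resources/cc-anchor.html"), "http://www.example.com/",
        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", "text");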

Example 9 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

The class TestFetcher, method testFetch.

@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
    // generate seedlist
    ArrayList<String> urls = new ArrayList<String>();
    addUrl(urls, "index.html");
    addUrl(urls, "pagea.html");
    addUrl(urls, "pageb.html");
    addUrl(urls, "dup_of_pagea.html");
    addUrl(urls, "nested_spider_trap.html");
    addUrl(urls, "exception.html");
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
    // inject
    Injector injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);
    // generate
    Generator g = new Generator(conf);
    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE, false, false);
    long time = System.currentTimeMillis();
    // fetch
    Fetcher fetcher = new Fetcher(conf);
    // Set fetcher.parse to true
    conf.setBoolean("fetcher.parse", true);
    fetcher.fetch(generatedSegment[0], 1);
    time = System.currentTimeMillis() - time;
    // verify politeness, time taken should be more than (num_of_pages +1)*delay
    int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay", 5));
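    // e.g. with the 6 seed URLs above and a 5 s delay (the default used here):
    // (6 + 1) * 1000 * 5 = 35000 ms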
    Assert.assertTrue(time > minimumTime);
    // verify content
    Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME), "part-r-00000/data");
    @SuppressWarnings("resource") SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
    ArrayList<String> handledurls = new ArrayList<String>();
    READ_CONTENT: do {
        Text key = new Text();
        Content value = new Content();
        if (!reader.next(key, value))
            break READ_CONTENT;
        String contentString = new String(value.getContent());
        if (contentString.indexOf("Nutch fetcher test page") != -1) {
            handledurls.add(key.toString());
        }
    } while (true);
    reader.close();
    Collections.sort(urls);
    Collections.sort(handledurls);
    // verify that enough pages were handled
    Assert.assertEquals(urls.size(), handledurls.size());
    // verify that correct pages were handled
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
    handledurls.clear();
    // verify parse data
    Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
    reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
    READ_PARSE_DATA: do {
        Text key = new Text();
        ParseData value = new ParseData();
        if (!reader.next(key, value))
            break READ_PARSE_DATA;
        // make sure they all contain "nutch.segment.name" and
        // "nutch.content.digest"
        // keys in parse metadata
        Metadata contentMeta = value.getContentMeta();
        if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
            handledurls.add(key.toString());
        }
    } while (true);
    Collections.sort(handledurls);
    Assert.assertEquals(urls.size(), handledurls.size());
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), Metadata (org.apache.nutch.metadata.Metadata), Text (org.apache.hadoop.io.Text), SequenceFile (org.apache.hadoop.io.SequenceFile), ParseData (org.apache.nutch.parse.ParseData), Injector (org.apache.nutch.crawl.Injector), Content (org.apache.nutch.protocol.Content), Generator (org.apache.nutch.crawl.Generator), Test (org.junit.Test)

Example 10 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

The class TestIndexerMapReduce, method testBinaryContentBase64.

/**
 * Test indexing of base64-encoded binary content.
 */
@Test
public void testBinaryContentBase64() {
    configuration = NutchConfiguration.create();
    configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);
    Charset[] testCharsets = { StandardCharsets.UTF_8, Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") };
    for (Charset charset : testCharsets) {
        LOG.info("Testing indexing binary content as base64 for charset {}", charset.name());
        String htmlDoc = testHtmlDoc;
        if (charset != StandardCharsets.UTF_8) {
            htmlDoc = htmlDoc.replaceAll("utf-8", charset.name());
            if (charset.name().equalsIgnoreCase("iso-8859-1")) {
                // Western-European character set: remove Czech content
                htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", "");
            } else if (charset.name().equalsIgnoreCase("iso-8859-2")) {
                // Eastern-European character set: remove French content
                htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", "");
            }
        }
        Content content = new Content(testUrl, testUrl, htmlDoc.getBytes(charset), htmlContentType, htmlMeta, configuration);
        NutchDocument doc = runIndexer(crawlDatumDbFetched, crawlDatumFetchSuccess, parseText, parseData, content);
        assertNotNull("No NutchDocument indexed", doc);
        String binaryContentBase64 = (String) doc.getField("binaryContent").getValues().get(0);
        LOG.info("binary content (base64): {}", binaryContentBase64);
        String binaryContent = new String(Base64.decodeBase64(binaryContentBase64), charset);
        LOG.info("binary content (decoded): {}", binaryContent);
        assertEquals("Binary content (" + charset + ") not correctly saved as base64", htmlDoc, binaryContent);
    }
}
Also used: Content (org.apache.nutch.protocol.Content), Charset (java.nio.charset.Charset), Test (org.junit.Test)
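A minimal sketch of the base64 round-trip the final assertion depends on, using the same commons-codec Base64 class; the charset and sample string are illustrative:

// Round-trip sketch: encoding and decoding must use the same charset.
Charset cs = Charset.forName("iso-8859-2");
String original = "<p lang=\"cs\">Příliš žluťoučký kůň</p>";
String encodedBase64 = Base64.encodeBase64String(original.getBytes(cs));
String decoded = new String(Base64.decodeBase64(encodedBase64), cs);
assertEquals(original, decoded);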

Aggregations

Content (org.apache.nutch.protocol.Content): 51
Text (org.apache.hadoop.io.Text): 30
Parse (org.apache.nutch.parse.Parse): 29
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 27
Configuration (org.apache.hadoop.conf.Configuration): 23
Metadata (org.apache.nutch.metadata.Metadata): 23
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 22
ParseUtil (org.apache.nutch.parse.ParseUtil): 20
Test (org.junit.Test): 19
Protocol (org.apache.nutch.protocol.Protocol): 17
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 16
ParseData (org.apache.nutch.parse.ParseData): 8
ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput): 8
ParseResult (org.apache.nutch.parse.ParseResult): 7
URL (java.net.URL): 6
File (java.io.File): 5
FileInputStream (java.io.FileInputStream): 5
IOException (java.io.IOException): 5
Outlink (org.apache.nutch.parse.Outlink): 5
HashMap (java.util.HashMap): 4