Search in sources :

Example 36 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestMetadataScoringFilter method passScoreBeforeParsing.

/**
 * Checks that, with the filter configured for the {@code parent} and {@code depth} keys, the
 * values stored under those keys in the crawl datum's metadata are readable from the content's
 * metadata after {@code passScoreBeforeParsing} has run.
 */
@Test
public void passScoreBeforeParsing() {
    final String parentKey = "parent";
    final String depthKey = "depth";
    final String expectedParent = "https://nutch.apache.org/";
    final String expectedDepth = "1";

    // Arrange: a filter configured with the two metadata keys under test.
    Configuration configuration = NutchConfiguration.create();
    configuration.set(MetadataScoringFilter.METADATA_DATUM, "parent,depth");
    MetadataScoringFilter filter = new MetadataScoringFilter();
    filter.setConf(configuration);

    // Arrange: a datum carrying one value per configured key.
    CrawlDatum datum = new CrawlDatum();
    datum.getMetaData().put(new Text(parentKey), new Text(expectedParent));
    datum.getMetaData().put(new Text(depthKey), new Text(expectedDepth));

    // Act: hand the datum and an empty Content to the filter.
    Text sourceUrl = new Text("https://nutch.apache.org/");
    Content fetched = new Content();
    filter.passScoreBeforeParsing(sourceUrl, datum, fetched);

    // Assert: both values are now present on the content metadata.
    Assert.assertEquals(expectedParent, fetched.getMetadata().get(parentKey));
    Assert.assertEquals(expectedDepth, fetched.getMetadata().get(depthKey));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Test(org.junit.Test)

Example 37 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class HtmlParser method main.

/**
 * Command-line entry point: reads the file named by {@code args[0]} into memory, parses it as
 * {@code text/html} with a newly configured {@link HtmlParser}, and prints the resulting parse
 * data and text to standard output.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to parse
 * @throws Exception if reading the file or parsing its content throws
 */
public static void main(String[] args) throws Exception {
    String name = args[0];
    String url = "file:" + name;
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources guarantees the stream is closed, even if readFully throws.
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        in.readFully(bytes);
    }
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    // The same "file:" URL is used as both url and base, and as the key into the parse result.
    Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Parse(org.apache.nutch.parse.Parse) Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) DataInputStream(java.io.DataInputStream) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 38 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestDOMContentUtils method setup.

/**
 * Builds the fixtures shared by the tests: for each entry of {@code testPages} it records the
 * base URL in {@code testBaseHrefURLs}, parses the page with a {@link TikaParser} into a
 * {@link DocumentFragment} stored in {@code testDOMs}, and finally fills {@code answerOutlinks}
 * with the expected outlinks (one row per test page).
 *
 * @throws Exception if constructing the expected outlinks throws
 * @throws AssertionError if preparing any test page throws; the original exception is attached
 *         as the cause
 */
@Before
public void setup() throws Exception {
    conf = NutchConfiguration.create();
    utils = new DOMContentUtils(conf);
    conf.set("plugin.includes", "parse-tika");
    TikaParser parser = new TikaParser();
    parser.setConf(conf);
    for (int i = 0; i < testPages.length; i++) {
        try {
            String url = testBaseHrefs[i];
            testBaseHrefURLs[i] = new URL(url);
            Content content = new Content(url, url, testPages[i].getBytes(StandardCharsets.UTF_8), "text/html", new Metadata(), conf);
            HTMLDocumentImpl doc = new HTMLDocumentImpl();
            doc.setErrorChecking(false);
            DocumentFragment root = doc.createDocumentFragment();
            parser.getParse(content, doc, root);
            testDOMs[i] = root;
        } catch (Exception e) {
            // Same error type and message as a failed assertion, but keep the cause so the
            // stack trace of the underlying failure is not lost.
            throw new AssertionError("caught exception: " + e, e);
        }
    }
    answerOutlinks = new Outlink[][] {
        { new Outlink("http://www.nutch.org", "anchor") },
        {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots")
        },
        {
            new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this")
        },
        {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2")
        },
        {
            new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", "")
        },
        {
            new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", "")
        },
        { new Outlink("http://www.nutch.org/index.html", "whitespace test") },
        {},
        {},
        {},
        {
            new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
        },
        {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
        },
        {}
    };
}
Also used : Outlink(org.apache.nutch.parse.Outlink) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) URL(java.net.URL) DocumentFragment(org.w3c.dom.DocumentFragment) Before(org.junit.Before)

Example 39 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestJSParseFilter method getOutlinks.

/**
 * Fetches the named sample file through the protocol registered for its {@code file:} URL,
 * parses the fetched content, logs the parse data, and returns the outlinks it contains.
 *
 * @param sampleFile file name, resolved against {@code sampleDir}
 * @return the outlinks reported by the parse data of the sample file
 * @throws ProtocolException declared for the protocol lookup/fetch step
 * @throws ParseException declared for the parsing step
 * @throws IOException declared for I/O performed while fetching or parsing
 */
public Outlink[] getOutlinks(String sampleFile) throws ProtocolException, ParseException, IOException {
    final String sampleUrl = "file:" + sampleDir + fileSeparator + sampleFile;
    LOG.info("Parsing {}", sampleUrl);

    // Fetch the raw content via the protocol plugin that handles this URL.
    ProtocolFactory protocols = new ProtocolFactory(conf);
    Protocol fetcher = protocols.getProtocol(sampleUrl);
    Content fetched = fetcher.getProtocolOutput(new Text(sampleUrl), new CrawlDatum()).getContent();

    // Parse it and pick the result keyed by the content's own URL.
    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parsed = parseUtil.parse(fetched).get(fetched.getUrl());

    LOG.info(parsed.getData().toString());
    return parsed.getData().getOutlinks();
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol)

Example 40 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestSWFParser method testIt.

/**
 * For every entry of {@code sampleFiles}: fetches the file through its protocol, parses it, and
 * checks that the extracted text, with whitespace runs collapsed to single spaces and trimmed,
 * equals the matching entry of {@code sampleTexts}.
 *
 * @throws ProtocolException declared for the protocol lookup/fetch step
 * @throws ParseException declared for the parsing step
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        // Normalize whitespace so the comparison is insensitive to layout differences.
        String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
        // assertEquals reports expected vs. actual text on failure; the message names the sample.
        Assert.assertEquals("unexpected text for " + urlString, sampleTexts[i], text);
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Aggregations

Content (org.apache.nutch.protocol.Content): 51; Text (org.apache.hadoop.io.Text): 30; Parse (org.apache.nutch.parse.Parse): 29; CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 27; Configuration (org.apache.hadoop.conf.Configuration): 23; Metadata (org.apache.nutch.metadata.Metadata): 23; NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 22; ParseUtil (org.apache.nutch.parse.ParseUtil): 20; Test (org.junit.Test): 19; Protocol (org.apache.nutch.protocol.Protocol): 17; ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 16; ParseData (org.apache.nutch.parse.ParseData): 8; ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput): 8; ParseResult (org.apache.nutch.parse.ParseResult): 7; URL (java.net.URL): 6; File (java.io.File): 5; FileInputStream (java.io.FileInputStream): 5; IOException (java.io.IOException): 5; Outlink (org.apache.nutch.parse.Outlink): 5; HashMap (java.util.HashMap): 4