Search in sources :

Example 6 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestIndexReplace method parseAndFilterFile.

/**
 * Run a test file through the Nutch parser and index filters.
 *
 * <p>
 * The file is fetched via a {@code file:} URL built from {@code sampleDir},
 * parsed, and the resulting parse is passed in turn through the basic,
 * metadata and replace indexing filters. Any exception raised along the way
 * fails the calling test.
 * </p>
 *
 * @param fileName
 *          name of the sample file, relative to {@code sampleDir}
 * @param conf
 *          configuration handed to every filter, to the protocol factory and
 *          to the parser
 * @return the Nutch document with the replace indexer applied
 */
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
    NutchDocument doc = new NutchDocument();
    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
    basicIndexer.setConf(conf);
    Assert.assertNotNull(basicIndexer);
    MetadataIndexer metaIndexer = new MetadataIndexer();
    metaIndexer.setConf(conf);
    // Check the filter that was just configured (was a copy-paste re-check of
    // basicIndexer).
    Assert.assertNotNull(metaIndexer);
    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
    replaceIndexer.setConf(conf);
    Assert.assertNotNull(replaceIndexer);
    try {
        String urlString = "file:" + sampleDir + fileSeparator + fileName;
        Text text = new Text(urlString);
        CrawlDatum crawlDatum = new CrawlDatum();
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
        Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        // The fetch time is set only after fetching/parsing, right before the
        // datum is handed to the index filters.
        crawlDatum.setFetchTime(100L);
        Inlinks inlinks = new Inlinks();
        // Filters are chained: each one receives the document produced by the
        // previous one.
        doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    } catch (Exception e) {
        // Print the full trace because Assert.fail only carries e.toString().
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    return doc;
}
Also used : NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) MetadataIndexer(org.apache.nutch.indexer.metadata.MetadataIndexer) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) BasicIndexingFilter(org.apache.nutch.indexer.basic.BasicIndexingFilter) Protocol(org.apache.nutch.protocol.Protocol)

Example 7 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestMetatagParser method parseMeta.

/**
 * Fetch and parse a sample file and return the parse metadata.
 *
 * <p>
 * Any exception thrown while fetching or parsing is printed and turned into
 * a test failure.
 * </p>
 *
 * @param fileName
 *          name of the sample file, relative to {@code sampleDir}
 * @param conf
 *          configuration used for the protocol factory and the parser
 * @return the parse metadata of the parsed file
 */
public Metadata parseMeta(String fileName, Configuration conf) {
    Metadata parseMetadata = null;
    try {
        final String fileUrl = "file:" + sampleDir + fileSeparator + fileName;

        // Fetch the raw content through the protocol registered for the URL.
        final ProtocolFactory protocolFactory = new ProtocolFactory(conf);
        final Protocol fileProtocol = protocolFactory.getProtocol(fileUrl);
        final Text urlKey = new Text(fileUrl);
        final CrawlDatum datum = new CrawlDatum();
        final Content fetched = fileProtocol.getProtocolOutput(urlKey, datum).getContent();

        // Parse it and pick the parse that belongs to the content's own URL.
        final ParseUtil parser = new ParseUtil(conf);
        final Parse parsed = parser.parse(fetched).get(fetched.getUrl());

        parseMetadata = parsed.getData().getParseMeta();
    } catch (Exception failure) {
        failure.printStackTrace();
        Assert.fail(failure.toString());
    }
    return parseMetadata;
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol)

Example 8 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestSWFParser method testIt.

/**
 * Parses every file in {@code sampleFiles} and checks that the extracted
 * text, with runs of whitespace collapsed to a single space and trimmed,
 * equals the entry of {@code sampleTexts} at the same index.
 *
 * @throws ProtocolException
 *           declared for the fetch step
 * @throws ParseException
 *           declared for the parse step
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        // Normalize whitespace so the comparison does not depend on layout.
        String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
        // assertEquals reports expected vs. actual text on failure, which
        // assertTrue(a.equals(b)) does not.
        Assert.assertEquals(sampleTexts[i], text);
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 9 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestFeedParser method testIt.

/**
 * <p>
 * The test method: tests out the following 2 asserts:
 * </p>
 *
 * <ul>
 * <li>There are 2 outlinks read from the sample rss file</li>
 * <li>The 2 outlinks read are in fact the correct outlinks from the sample
 * file</li>
 * </ul>
 *
 * <p>
 * Each sample file is parsed explicitly with the {@code parse-tika} parser.
 * </p>
 *
 * @throws ProtocolException
 *           declared for the fetch step
 * @throws ParseException
 *           declared for the parse step
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
        // check that there are 2 outlinks:
        // unlike the original parse-rss
        // tika ignores the URL and description of the channel
        // (presumably http://test.channel.com -- not checked below), leaving:
        // http://www-scf.usc.edu/~mattmann/
        // http://www.nutch.org/
        ParseData theParseData = parse.getData();
        Outlink[] theOutlinks = theParseData.getOutlinks();
        // assertEquals also reports the actual count on failure.
        Assert.assertEquals("There aren't 2 outlinks read!", 2, theOutlinks.length);
        // now check to make sure that those are the two outlinks
        boolean hasLink1 = false;
        boolean hasLink2 = false;
        for (Outlink outlink : theOutlinks) {
            if (outlink.getToUrl().equals("http://www-scf.usc.edu/~mattmann/")) {
                hasLink1 = true;
            }
            if (outlink.getToUrl().equals("http://www.nutch.org/")) {
                hasLink2 = true;
            }
        }
        Assert.assertTrue("Outlinks read from sample rss file are not correct!", hasLink1 && hasLink2);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 10 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestImageMetadata method testIt.

/**
 * Parses every file in {@code sampleFiles} with the {@code parse-tika}
 * parser and checks that the parse metadata reports a {@code width} of
 * {@code "121"} and a {@code height} of {@code "48"}.
 *
 * @throws ProtocolException
 *           declared for the fetch step
 * @throws ParseException
 *           declared for the parse step
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    // The configuration does not depend on the sample file, so build it once
    // instead of once per iteration.
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
        Assert.assertEquals("121", parse.getData().getMeta("width"));
        Assert.assertEquals("48", parse.getData().getMeta("height"));
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Aggregations

Text (org.apache.hadoop.io.Text)15 Protocol (org.apache.nutch.protocol.Protocol)15 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)15 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)13 Content (org.apache.nutch.protocol.Content)12 Parse (org.apache.nutch.parse.Parse)11 ParseUtil (org.apache.nutch.parse.ParseUtil)11 Configuration (org.apache.hadoop.conf.Configuration)7 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)7 Test (org.junit.Test)7 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)3 Map (java.util.Map)2 Metadata (org.apache.nutch.metadata.Metadata)2 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 Inlinks (org.apache.nutch.crawl.Inlinks)1 NutchDocument (org.apache.nutch.indexer.NutchDocument)1 BasicIndexingFilter (org.apache.nutch.indexer.basic.BasicIndexingFilter)1