Search in sources :

Example 6 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class ParserChecker method run.

/**
 * Fetches a single URL, parses the fetched content and reports the parse
 * result.
 *
 * <p>
 * Recognized arguments: {@code -dumpText} (also print the parse text),
 * {@code -forceAs mimeType} (override the content type of the fetched
 * content) and {@code -md key=value} (add a metadata entry to the
 * {@link CrawlDatum} passed to the protocol; a missing {@code =value} is
 * stored as an empty string). The last argument is taken as the URL.
 * </p>
 *
 * <p>
 * Section headers are written through {@code LOG.info}; the URL, parse data
 * and (optionally) parse text are written to {@code System.out}.
 * </p>
 *
 * @param args
 *          command-line arguments as described above
 * @return 0 on success; -1 on a usage error, a failed fetch, missing content
 *         or content type, or a failed parse
 * @throws Exception
 *           anything thrown by the protocol, parser or signature
 *           implementation that is not handled here
 */
public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;
    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
    if (args.length == 0) {
        LOG.error(usage);
        return -1;
    }
    // used to simulate the metadata propagated from injection
    HashMap<String, String> metadata = new HashMap<>();
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-forceAs")) {
            // the option needs a value; without this check args[++i] would
            // throw ArrayIndexOutOfBoundsException when it is the last argument
            if (i + 1 >= args.length) {
                LOG.error(usage);
                return -1;
            }
            force = true;
            contentType = args[++i];
        } else if (args[i].equals("-dumpText")) {
            dumpText = true;
        } else if (args[i].equals("-md")) {
            if (i + 1 >= args.length) {
                LOG.error(usage);
                return -1;
            }
            String k;
            String v = null;
            String nextOne = args[++i];
            int firstEquals = nextOne.indexOf("=");
            if (firstEquals != -1) {
                k = nextOne.substring(0, firstEquals);
                v = nextOne.substring(firstEquals + 1);
            } else {
                k = nextOne;
            }
            metadata.put(k, v);
        } else if (i != args.length - 1) {
            // unknown argument that is not in the URL position: report it the
            // same way as every other failure instead of terminating the JVM
            LOG.error(usage);
            return -1;
        } else {
            url = URLUtil.toASCII(args[i]);
        }
    }
    // the last argument was consumed by an option, so there is no URL to check
    if (url == null) {
        LOG.error(usage);
        return -1;
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("fetching: " + url);
    }
    CrawlDatum cd = new CrawlDatum();
    for (Map.Entry<String, String> entry : metadata.entrySet()) {
        // a key given without "=value" is stored with an empty value
        String value = entry.getValue() == null ? "" : entry.getValue();
        cd.getMetaData().put(new Text(entry.getKey()), new Text(value));
    }
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
    // if the configuration permits, handle redirects until we either run
    // out of allowed redirects or we stop getting redirect statuses.
    int maxRedirects = conf.getInt("http.redirect.max", 0);
    int numRedirects = 0;
    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
        LOG.info("Handling redirect to " + newURL);
        protocol = factory.getProtocol(newURL);
        turl = new Text(newURL);
        output = protocol.getProtocolOutput(turl, cd);
        numRedirects++;
    }
    if (!output.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + output.getStatus());
        if (output.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return -1;
    }
    Content content = output.getContent();
    if (content == null) {
        LOG.error("No content for " + url);
        return -1;
    }
    if (force) {
        content.setContentType(contentType);
    } else {
        contentType = content.getContentType();
    }
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return -1;
    }
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(conf);
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, cd, content);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    if (parseResult == null) {
        LOG.error("Parsing content failed!");
        return -1;
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("parsing: " + url);
        LOG.info("contentType: " + contentType);
    }
    // Look the parse up once, under the URL that was actually fetched (turl
    // is replaced when a redirect is followed), and verify it exists before
    // it is handed to the signature implementation.
    Parse parse = parseResult.get(turl);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + turl);
        return -1;
    }
    // calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    if (LOG.isInfoEnabled()) {
        LOG.info("signature: " + StringUtil.toHexString(signature));
    }
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    // report every parse contained in the result, not only the one for turl
    for (Map.Entry<Text, Parse> entry : parseResult) {
        parse = entry.getValue();
        LOG.info("---------\nUrl\n---------------\n");
        System.out.print(entry.getKey());
        LOG.info("\n---------\nParseData\n---------\n");
        System.out.print(parse.getData().toString());
        if (dumpText) {
            LOG.info("---------\nParseText\n---------\n");
            System.out.print(parse.getText());
        }
    }
    return 0;
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) HashMap(java.util.HashMap) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) Protocol(org.apache.nutch.protocol.Protocol) HashMap(java.util.HashMap) Map(java.util.Map)

Example 7 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestIndexReplace method parseAndFilterFile.

/**
 * Run a test file through the Nutch parser and index filters.
 *
 * <p>
 * The file is fetched through the protocol registered for a {@code file:}
 * URL built from {@code sampleDir} and {@code fileName}, parsed, and then
 * passed through the basic, metadata and replace indexing filters in that
 * order. Any exception fails the calling test.
 * </p>
 *
 * @param fileName
 *          name of the sample file, relative to {@code sampleDir}
 * @param conf
 *          configuration handed to the indexing filters, the protocol
 *          factory and the parser
 * @return the Nutch document with the replace indexer applied
 */
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
    NutchDocument doc = new NutchDocument();
    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
    basicIndexer.setConf(conf);
    Assert.assertNotNull(basicIndexer);
    MetadataIndexer metaIndexer = new MetadataIndexer();
    metaIndexer.setConf(conf);
    // check the metadata indexer itself (previously basicIndexer was checked twice)
    Assert.assertNotNull(metaIndexer);
    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
    replaceIndexer.setConf(conf);
    Assert.assertNotNull(replaceIndexer);
    try {
        String urlString = "file:" + sampleDir + fileSeparator + fileName;
        Text text = new Text(urlString);
        CrawlDatum crawlDatum = new CrawlDatum();
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
        Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        crawlDatum.setFetchTime(100L);
        Inlinks inlinks = new Inlinks();
        // each filter receives the document produced by the previous one
        doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    return doc;
}
Also used : NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) MetadataIndexer(org.apache.nutch.indexer.metadata.MetadataIndexer) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) BasicIndexingFilter(org.apache.nutch.indexer.basic.BasicIndexingFilter) Protocol(org.apache.nutch.protocol.Protocol)

Example 8 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestMetatagParser method parseMeta.

/**
 * Fetches and parses a sample file and returns the parse metadata.
 *
 * @param fileName
 *          name of the sample file, relative to {@code sampleDir}
 * @param conf
 *          configuration handed to the protocol factory and the parser
 * @return the parse metadata of the parsed file; any exception fails the
 *         calling test instead of being propagated
 */
public Metadata parseMeta(String fileName, Configuration conf) {
    final String location = "file:" + sampleDir + fileSeparator + fileName;
    try {
        ProtocolFactory protocols = new ProtocolFactory(conf);
        Protocol fetcher = protocols.getProtocol(location);
        Text key = new Text(location);
        CrawlDatum datum = new CrawlDatum();
        Content fetched = fetcher.getProtocolOutput(key, datum).getContent();
        ParseUtil parser = new ParseUtil(conf);
        Parse parsed = parser.parse(fetched).get(fetched.getUrl());
        return parsed.getData().getParseMeta();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    // only reached if Assert.fail returned normally, which it does not
    return null;
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol)

Example 9 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestSWFParser method testIt.

/**
 * Fetches and parses every sample file and checks that the
 * whitespace-normalized parse text equals the expected text at the same
 * index of {@code sampleTexts}.
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        // collapse runs of whitespace so only the textual content is compared
        String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
        // assertEquals reports expected and actual text on failure, which
        // assertTrue(a.equals(b)) does not
        Assert.assertEquals("Unexpected parse text for " + sampleFiles[i], sampleTexts[i], text);
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 10 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestFeedParser method testIt.

/**
 * <p>
 * The test method: tests out the following 2 asserts:
 * </p>
 *
 * <ul>
 * <li>There are 2 outlinks read from the sample rss file</li>
 * <li>The 2 outlinks read are in fact the correct outlinks from the sample
 * file</li>
 * </ul>
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
        // check that there are 2 outlinks:
        // unlike the original parse-rss
        // tika ignores the URL and description of the channel
        // http://test.channel.com
        // http://www-scf.usc.edu/~mattmann/
        // http://www.nutch.org
        ParseData theParseData = parse.getData();
        Outlink[] theOutlinks = theParseData.getOutlinks();
        // assertEquals reports the actual number of outlinks on failure
        Assert.assertEquals("There aren't 2 outlinks read!", 2, theOutlinks.length);
        // now check to make sure that those are the two outlinks
        boolean hasLink1 = false;
        boolean hasLink2 = false;
        for (Outlink outlink : theOutlinks) {
            if (outlink.getToUrl().equals("http://www-scf.usc.edu/~mattmann/")) {
                hasLink1 = true;
            }
            if (outlink.getToUrl().equals("http://www.nutch.org/")) {
                hasLink2 = true;
            }
        }
        Assert.assertTrue("Outlinks read from sample rss file are not correct!", hasLink1 && hasLink2);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Aggregations

- Text (org.apache.hadoop.io.Text): 16
- Protocol (org.apache.nutch.protocol.Protocol): 16
- ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 15
- CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 13
- Content (org.apache.nutch.protocol.Content): 13
- Parse (org.apache.nutch.parse.Parse): 11
- ParseUtil (org.apache.nutch.parse.ParseUtil): 11
- Configuration (org.apache.hadoop.conf.Configuration): 7
- NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 7
- Test (org.junit.Test): 7
- ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput): 4
- IOException (java.io.IOException): 2
- Map (java.util.Map): 2
- Metadata (org.apache.nutch.metadata.Metadata): 2
- BaseRobotRules (crawlercommons.robots.BaseRobotRules): 1
- File (java.io.File): 1
- FileOutputStream (java.io.FileOutputStream): 1
- MalformedURLException (java.net.MalformedURLException): 1
- URL (java.net.URL): 1
- HashMap (java.util.HashMap): 1