Search in sources :

Example 1 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestAny23ParseFilter method extract.

/**
 * Fetches {@code urlString}, forces the given content type on the fetched
 * content, parses it and returns the values stored in the parse metadata
 * under {@link Any23ParseFilter#ANY23_TRIPLES}.
 *
 * <p>Any exception raised while fetching or parsing is printed and turned
 * into a test failure via {@code Assert.fail}.
 *
 * @param urlString   URL of the document to fetch; also printed to stdout
 * @param file        not used by this method
 * @param contentType content type to set on the fetched content before parsing
 * @return the metadata values found under {@code ANY23_TRIPLES}
 */
public String[] extract(String urlString, File file, String contentType) {
    String[] triples = null;
    try {
        System.out.println(urlString);
        ProtocolFactory protocolFactory = new ProtocolFactory(conf);
        Protocol fetcher = protocolFactory.getProtocol(urlString);
        Text fetchKey = new Text(urlString);
        CrawlDatum datum = new CrawlDatum();
        Content fetched = fetcher.getProtocolOutput(fetchKey, datum).getContent();
        // Override whatever type the protocol reported so the parser sees the requested one.
        fetched.setContentType(contentType);
        ParseUtil parser = new ParseUtil(conf);
        Parse parsed = parser.parse(fetched).get(fetched.getUrl());
        triples = parsed.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
    } catch (Exception e) {
        e.printStackTrace();
        // Assert.fail throws, so the null initial value is never returned from this path.
        Assert.fail(e.toString());
    }
    return triples;
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) ParseException(org.apache.nutch.parse.ParseException) IOException(java.io.IOException)

Example 2 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestExtParser method setUp.

/**
 * Creates a temporary text file containing {@code expectedText}, then fetches
 * it through the protocol registered for its URL and stores the result in
 * {@code content}.
 *
 * <p>The file is created under the directory named by the {@code test.data}
 * system property when that property is set, otherwise in the default
 * temporary-file directory.
 *
 * @throws ProtocolException declared for the protocol lookup and fetch
 * @throws IOException if the temp file cannot be created or written
 */
@Before
protected void setUp() throws ProtocolException, IOException {
    // prepare a temp file with expectedText as its content
    // This system property is defined in ./src/plugin/build-plugin.xml
    String path = System.getProperty("test.data");
    if (path != null) {
        File tempDir = new File(path);
        if (!tempDir.exists()) {
            // mkdirs (not mkdir) so a nested test.data path is created in full
            tempDir.mkdirs();
        }
        tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", tempDir);
    } else {
        // otherwise in java.io.tmpdir
        tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
    }
    urlString = tempFile.toURI().toURL().toString();
    // try-with-resources closes the stream even when write() throws
    try (FileOutputStream fos = new FileOutputStream(tempFile)) {
        fos.write(expectedText.getBytes());
    }
    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) FileOutputStream(java.io.FileOutputStream) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) File(java.io.File) Before(org.junit.Before)

Example 3 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestRTFParser method testIt.

/**
 * Fetches the sample RTF file, parses it with the {@code parse-tika}
 * extension and checks the extracted text, title and Dublin Core subject.
 * Currently disabled via {@code @Ignore}.
 *
 * @throws ProtocolException declared for the protocol lookup and fetch
 * @throws ParseException declared for the parse step
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration configuration = NutchConfiguration.create();
    String rtfUrl = "file:" + sampleDir + fileSeparator + rtfFile;

    // Fetch the sample document through the protocol registered for its URL.
    Protocol fetcher = new ProtocolFactory(configuration).getProtocol(rtfUrl);
    Content fetched = fetcher.getProtocolOutput(new Text(rtfUrl), new CrawlDatum()).getContent();

    // Parse with the Tika parser explicitly rather than by content type.
    Parse parsed = new ParseUtil(configuration)
            .parseByExtensionId("parse-tika", fetched)
            .get(fetched.getUrl());

    String extractedText = parsed.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", extractedText.trim());

    String documentTitle = parsed.getData().getTitle();
    Metadata parseMeta = parsed.getData().getParseMeta();
    Assert.assertEquals("test rft document", documentTitle);
    Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 4 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class TestZipParser method testIt.

/**
 * For every entry in {@code sampleFiles}, fetches the file, parses it with
 * the {@code parse-zip} extension and asserts that the extracted text starts
 * with {@code expectedText}.
 *
 * @throws ProtocolException declared for the protocol lookup and fetch
 * @throws ParseException declared for the parse step
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration configuration = NutchConfiguration.create();
    int index = 0;
    while (index < sampleFiles.length) {
        String sampleUrl = "file:" + sampleDir + fileSeparator + sampleFiles[index];
        Protocol fetcher = new ProtocolFactory(configuration).getProtocol(sampleUrl);
        Content fetched = fetcher.getProtocolOutput(new Text(sampleUrl), new CrawlDatum()).getContent();
        Parse parsed = new ParseUtil(configuration)
                .parseByExtensionId("parse-zip", fetched)
                .get(fetched.getUrl());

        // Failure message shows both the expected prefix and what was actually extracted.
        String failureMessage =
                "Extracted text does not start with <" + expectedText + ">: <" + parsed.getText() + ">";
        boolean startsAsExpected = parsed.getText().startsWith(expectedText);
        Assert.assertTrue(failureMessage, startsAsExpected);
        index++;
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 5 with ProtocolFactory

use of org.apache.nutch.protocol.ProtocolFactory in project nutch by apache.

the class ParserChecker method run.

/**
 * Fetches a single URL, parses the fetched content and reports the result.
 *
 * <p>Recognised arguments (see the usage string): {@code -dumpText} to also
 * output the parsed text, {@code -forceAs mimeType} to override the content
 * type reported by the protocol, and {@code -md key=value} to add an entry to
 * the {@code CrawlDatum} metadata used for the fetch. The URL must be the
 * last argument.
 *
 * <p>Redirects are followed up to the number configured under
 * {@code http.redirect.max} (default 0).
 *
 * @param args command-line arguments as described above
 * @return 0 on success; -1 when the arguments are invalid, the fetch does not
 *         succeed, no content or content type is available, or the parse
 *         result has no entry for the fetched URL
 * @throws Exception propagated from the fetch, parse or signature steps
 */
public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;
    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
    if (args.length == 0) {
        LOG.error(usage);
        return (-1);
    }
    // used to simulate the metadata propagated from injection
    HashMap<String, String> metadata = new HashMap<>();
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-forceAs")) {
            // the option needs a value; without this check args[++i] would overrun the array
            if (i + 1 >= args.length) {
                LOG.error(usage);
                return (-1);
            }
            force = true;
            contentType = args[++i];
        } else if (args[i].equals("-dumpText")) {
            dumpText = true;
        } else if (args[i].equals("-md")) {
            if (i + 1 >= args.length) {
                LOG.error(usage);
                return (-1);
            }
            String k = null, v = null;
            String nextOne = args[++i];
            int firstEquals = nextOne.indexOf("=");
            if (firstEquals != -1) {
                k = nextOne.substring(0, firstEquals);
                v = nextOne.substring(firstEquals + 1);
            } else {
                // no '=': treat the whole argument as a key with no value
                k = nextOne;
            }
            metadata.put(k, v);
        } else if (i != args.length - 1) {
            // a non-option argument anywhere but the last position is an error;
            // return like every other failure path instead of terminating the JVM
            LOG.error(usage);
            return (-1);
        } else {
            url = URLUtil.toASCII(args[i]);
        }
    }
    if (url == null) {
        // no usable URL was supplied (e.g. only options were given)
        LOG.error(usage);
        return (-1);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("fetching: " + url);
    }
    CrawlDatum cd = new CrawlDatum();
    for (Map.Entry<String, String> md : metadata.entrySet()) {
        String value = md.getValue();
        if (value == null) {
            value = "";
        }
        cd.getMetaData().put(new Text(md.getKey()), new Text(value));
    }
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
    // if the configuration permits, handle redirects until we either run
    // out of allowed redirects or we stop getting redirect statuses.
    int maxRedirects = conf.getInt("http.redirect.max", 0);
    int numRedirects = 0;
    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
        LOG.info("Handling redirect to " + newURL);
        protocol = factory.getProtocol(newURL);
        turl = new Text(newURL);
        output = protocol.getProtocolOutput(turl, cd);
        numRedirects++;
    }
    if (!output.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + output.getStatus());
        if (output.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return (-1);
    }
    Content content = output.getContent();
    if (content == null) {
        LOG.error("No content for " + url);
        return (-1);
    }
    if (force) {
        content.setContentType(contentType);
    } else {
        contentType = content.getContentType();
    }
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return (-1);
    }
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(conf);
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, cd, content);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    if (parseResult == null) {
        LOG.error("Parsing content failed!");
        return (-1);
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("parsing: " + url);
        LOG.info("contentType: " + contentType);
    }
    // look up the parse under the URL that was actually fetched (after any redirects)
    Parse parse = parseResult.get(turl);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + turl);
        return -1;
    }
    // calculate the signature from the null-checked parse, so the signature
    // implementation is never handed a missing parse
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    if (LOG.isInfoEnabled()) {
        LOG.info("signature: " + StringUtil.toHexString(signature));
    }
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    // report every parse in the result, not only the one for the fetched URL
    for (Map.Entry<Text, Parse> entry : parseResult) {
        Parse entryParse = entry.getValue();
        LOG.info("---------\nUrl\n---------------\n");
        System.out.print(entry.getKey());
        LOG.info("\n---------\nParseData\n---------\n");
        System.out.print(entryParse.getData().toString());
        if (dumpText) {
            LOG.info("---------\nParseText\n---------\n");
            System.out.print(entryParse.getText());
        }
    }
    return 0;
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) HashMap(java.util.HashMap) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) Protocol(org.apache.nutch.protocol.Protocol) Map(java.util.Map)

Aggregations

Text (org.apache.hadoop.io.Text)15 Protocol (org.apache.nutch.protocol.Protocol)15 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)15 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)13 Content (org.apache.nutch.protocol.Content)12 Parse (org.apache.nutch.parse.Parse)11 ParseUtil (org.apache.nutch.parse.ParseUtil)11 Configuration (org.apache.hadoop.conf.Configuration)7 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)7 Test (org.junit.Test)7 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)3 Map (java.util.Map)2 Metadata (org.apache.nutch.metadata.Metadata)2 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 Inlinks (org.apache.nutch.crawl.Inlinks)1 NutchDocument (org.apache.nutch.indexer.NutchDocument)1 BasicIndexingFilter (org.apache.nutch.indexer.basic.BasicIndexingFilter)1