Search in sources :

Example 21 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestZipParser method testIt.

/**
 * Fetches each sample file through the protocol plugin resolved for its
 * {@code file:} URL, parses the fetched content with the {@code parse-zip}
 * parser and asserts that the extracted text starts with the expected text.
 *
 * @throws ProtocolException declared by the protocol lookup/fetch calls
 * @throws ParseException declared by the parse call
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    final Configuration conf = NutchConfiguration.create();
    for (int idx = 0; idx < sampleFiles.length; idx++) {
        // Build a file: URL pointing at the current sample.
        final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

        final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        final Content content = protocol
            .getProtocolOutput(new Text(urlString), new CrawlDatum())
            .getContent();

        // Force the zip parser by extension id and pick the parse registered under the content URL.
        final Parse parse = new ParseUtil(conf)
            .parseByExtensionId("parse-zip", content)
            .get(content.getUrl());

        final String extracted = parse.getText();
        final String failureMessage = "Extracted text does not start with <" + expectedText + ">: <" + extracted + ">";
        Assert.assertTrue(failureMessage, extracted.startsWith(expectedText));
    }
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 22 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestRegexParseFilter method testPositiveFilter.

/**
 * Runs a {@code RegexParseFilter} configured with the
 * {@code regex-parsefilter.txt} sample rules over a small HTML document and
 * expects the parse metadata fields {@code first} and {@code second} to both
 * be {@code "true"}.
 *
 * @throws Exception if building the content or running the filter fails
 */
public void testPositiveFilter() throws Exception {
    final String url = "http://nutch.apache.org/";
    final String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>";
    final String rulesFile = SAMPLES + SEPARATOR + "regex-parsefilter.txt";

    // Point the filter at the sample rules before handing it the configuration.
    final Configuration conf = NutchConfiguration.create();
    conf.set("parsefilter.regex.file", rulesFile);
    final RegexParseFilter regexFilter = new RegexParseFilter();
    regexFilter.setConf(conf);

    final Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData());
    final Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);

    // The assertions below read the metadata of the parse object handed to the
    // filter, so the returned ParseResult itself is not needed.
    regexFilter.filter(content, ParseResult.createParseResult(url, parse), null, null);

    final Metadata parseMeta = parse.getData().getParseMeta();
    assertEquals("true", parseMeta.get("first"));
    assertEquals("true", parseMeta.get("second"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 23 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestRegexParseFilter method testNegativeFilter.

/**
 * Runs a {@code RegexParseFilter} configured with the
 * {@code regex-parsefilter.txt} sample rules over a small HTML document and
 * expects the parse metadata fields {@code first} and {@code second} to both
 * be {@code "false"}.
 *
 * @throws Exception if building the content or running the filter fails
 */
public void testNegativeFilter() throws Exception {
    final String url = "http://nutch.apache.org/";
    final String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>";
    final String rulesFile = SAMPLES + SEPARATOR + "regex-parsefilter.txt";

    // Point the filter at the sample rules before handing it the configuration.
    final Configuration conf = NutchConfiguration.create();
    conf.set("parsefilter.regex.file", rulesFile);
    final RegexParseFilter regexFilter = new RegexParseFilter();
    regexFilter.setConf(conf);

    final Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData());
    final Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);

    // The assertions below read the metadata of the parse object handed to the
    // filter, so the returned ParseResult itself is not needed.
    regexFilter.filter(content, ParseResult.createParseResult(url, parse), null, null);

    final Metadata parseMeta = parse.getData().getParseMeta();
    assertEquals("false", parseMeta.get("first"));
    assertEquals("false", parseMeta.get("second"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 24 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class Ftp method main.

/**
 * For debugging: fetches a single URL with a fresh {@code Ftp} instance and
 * prints the content type, content length and last-modified metadata to
 * standard error; with {@code -dumpContent} the fetched bytes are also
 * written to standard output.
 *
 * <p>The usage message is printed and the program exits with status -1 when
 * no arguments are given, when an unrecognized argument appears anywhere but
 * in the last position, when an option that takes a value is the last
 * argument, or when no URL is supplied.
 *
 * @param args run with no args for help
 * @throws Exception if there is an error running this program
 */
public static void main(String[] args) throws Exception {
    // Integer.MIN_VALUE marks "not given on the command line".
    int timeout = Integer.MIN_VALUE;
    int maxContentLength = Integer.MIN_VALUE;
    // Parsed so that the option and its value are consumed, but not applied anywhere below.
    @SuppressWarnings("unused") String logLevel = "info";
    boolean followTalk = false;
    boolean keepConnection = false;
    boolean dumpContent = false;
    String urlString = null;
    String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {
        String arg = args[i];
        boolean takesValue = arg.equals("-logLevel") || arg.equals("-timeout") || arg.equals("-maxContentLength");
        if (takesValue && i == args.length - 1) {
            // The option's value is missing; reading args[++i] would run past the array.
            System.err.println(usage);
            System.exit(-1);
        }
        if (arg.equals("-logLevel")) {
            logLevel = args[++i];
        } else if (arg.equals("-followTalk")) {
            followTalk = true;
        } else if (arg.equals("-keepConnection")) {
            keepConnection = true;
        } else if (arg.equals("-timeout")) {
            // Given in seconds on the command line, stored multiplied by 1000.
            timeout = Integer.parseInt(args[++i]) * 1000;
        } else if (arg.equals("-maxContentLength")) {
            maxContentLength = Integer.parseInt(args[++i]);
        } else if (arg.equals("-dumpContent")) {
            dumpContent = true;
        } else if (i != args.length - 1) {
            // Only the last argument may be a non-option (the URL).
            System.err.println(usage);
            System.exit(-1);
        } else {
            urlString = arg;
        }
    }
    if (urlString == null) {
        // The last argument was an option, so no URL was supplied.
        System.err.println(usage);
        System.exit(-1);
    }
    Ftp ftp = new Ftp();
    ftp.setFollowTalk(followTalk);
    ftp.setKeepConnection(keepConnection);
    if (timeout != Integer.MIN_VALUE) {
        ftp.setTimeout(timeout);
    }
    if (maxContentLength != Integer.MIN_VALUE) {
        ftp.setMaxContentLength(maxContentLength);
    }
    Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
    if (dumpContent) {
        System.out.print(new String(content.getContent()));
    }
}
Also used : Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text)

Example 25 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class File method main.

/**
 * Quick way for running this class. Useful for debugging: fetches a single
 * URL with a fresh {@code File} instance and prints the URL, status, content
 * type, content length, last-modified value and (when present) the
 * {@code Location} metadata to standard error; with {@code -dumpContent} the
 * fetched bytes are also written to standard output.
 *
 * <p>The usage message is printed and the program exits with status -1 when
 * no arguments are given, when an unrecognized argument appears anywhere but
 * in the last position, when {@code -maxContentLength} is the last argument,
 * or when no URL is supplied.
 *
 * @param args run with no args to print help
 * @throws Exception if there is a fatal error running this class
 * with the given input
 */
public static void main(String[] args) throws Exception {
    // Integer.MIN_VALUE marks "not given on the command line".
    int maxContentLength = Integer.MIN_VALUE;
    boolean dumpContent = false;
    String urlString = null;
    String usage = "Usage: File [-maxContentLength L] [-dumpContent] url";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {
        String arg = args[i];
        if (arg.equals("-maxContentLength")) {
            if (i == args.length - 1) {
                // The option's value is missing; reading args[++i] would run past the array.
                System.err.println(usage);
                System.exit(-1);
            }
            maxContentLength = Integer.parseInt(args[++i]);
        } else if (arg.equals("-dumpContent")) {
            dumpContent = true;
        } else if (i != args.length - 1) {
            // Only the last argument may be a non-option (the URL).
            System.err.println(usage);
            System.exit(-1);
        } else {
            urlString = arg;
        }
    }
    if (urlString == null) {
        // The last argument was an option, so no URL was supplied.
        System.err.println(usage);
        System.exit(-1);
    }
    File file = new File();
    file.setConf(NutchConfiguration.create());
    if (maxContentLength != Integer.MIN_VALUE) {
        file.setMaxContentLength(maxContentLength);
    }
    ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
    Content content = output.getContent();
    if (content == null) {
        // NOTE(review): assumes getContent() can be null when the fetch failed - confirm.
        // Report the status instead of failing on the dereferences below.
        System.err.println("Status: " + output.getStatus());
        return;
    }
    System.err.println("URL: " + content.getUrl());
    System.err.println("Status: " + output.getStatus());
    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
    String redirectLocation = content.getMetadata().get("Location");
    if (redirectLocation != null) {
        System.err.println("Location: " + redirectLocation);
    }
    if (dumpContent) {
        System.out.print(new String(content.getContent()));
    }
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text)

Aggregations

Content (org.apache.nutch.protocol.Content): 51, Text (org.apache.hadoop.io.Text): 30, Parse (org.apache.nutch.parse.Parse): 29, CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 27, Configuration (org.apache.hadoop.conf.Configuration): 23, Metadata (org.apache.nutch.metadata.Metadata): 23, NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 22, ParseUtil (org.apache.nutch.parse.ParseUtil): 20, Test (org.junit.Test): 19, Protocol (org.apache.nutch.protocol.Protocol): 17, ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 16, ParseData (org.apache.nutch.parse.ParseData): 8, ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput): 8, ParseResult (org.apache.nutch.parse.ParseResult): 7, URL (java.net.URL): 6, File (java.io.File): 5, FileInputStream (java.io.FileInputStream): 5, IOException (java.io.IOException): 5, Outlink (org.apache.nutch.parse.Outlink): 5, HashMap (java.util.HashMap): 4