Search in sources :

Example 1 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestProtocolHttp method fetchPage.

/**
 * Requests <code>page</code> from the local Jetty server and asserts that
 * the HTTP status code of the response equals <code>expectedCode</code>.
 * jsp pages are also used for redirection. For every page other than
 * <code>/nonexists.html</code>, <code>/brokenpage.jsp</code> and
 * <code>/redirection</code> the content type is additionally asserted to be
 * <code>text/html</code>.
 *
 * @param page
 *          Path of the page to fetch.
 * @param expectedCode
 *          HTTP status code the response is expected to carry.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    URL target = new URL("http", "127.0.0.1", port, page);
    CrawlDatum datum = new CrawlDatum();
    Response httpResponse = http.getResponse(target, datum, true);
    ProtocolOutput protocolOutput = http.getProtocolOutput(new Text(target.toString()), datum);
    Content fetched = protocolOutput.getContent();
    assertEquals("HTTP Status Code for " + target, expectedCode, httpResponse.getCode());
    // These pages are excluded from the content-type assertion.
    boolean skipContentTypeCheck = page.equals("/nonexists.html")
        || page.equals("/brokenpage.jsp")
        || page.equals("/redirection");
    if (skipContentTypeCheck) {
        return;
    }
    assertEquals("ContentType " + target, "text/html", fetched.getContentType());
}
Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) URL(java.net.URL)

Example 2 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class TestRTFParser method testIt.

/**
 * Fetches the sample RTF file through the protocol resolved for its
 * <code>file:</code> URL, parses it with the <code>parse-tika</code>
 * extension and checks the extracted text, the title and the Dublin Core
 * subject stored in the parse metadata.
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();
    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;

    // Fetch the raw document via whichever protocol handles the URL.
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol
        .getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();

    // Parse explicitly with parse-tika and pick the parse keyed by the content URL.
    Parse parse = new ParseUtil(conf)
        .parseByExtensionId("parse-tika", content)
        .get(content.getUrl());

    String extractedText = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", extractedText.trim());

    String docTitle = parse.getData().getTitle();
    Metadata parseMeta = parse.getData().getParseMeta();
    Assert.assertEquals("test rft document", docTitle);
    Assert.assertEquals("tests", parseMeta.get(DublinCore.SUBJECT));
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 3 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class EncodingDetector method main.

/**
 * Command-line entry point: reads the file named by the single argument
 * fully into memory, wraps the bytes in a placeholder {@link Content}
 * declared as <code>text/html</code>, and prints the encoding guessed for
 * it.
 *
 * @param args
 *          Exactly one element: the path of the file to inspect. Any other
 *          argument count prints a usage message and exits with status 1.
 * @throws IOException
 *           If the file cannot be opened or read.
 */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.err.println("Usage: EncodingDetector <file>");
        System.exit(1);
    }
    // One configuration is shared by the detector and the fake Content.
    Configuration conf = NutchConfiguration.create();
    EncodingDetector detector = new EncodingDetector(conf);
    // do everything as bytes; don't want any conversion
    ByteArrayOutputStream ostr = new ByteArrayOutputStream();
    // try-with-resources guarantees the file handle is released even when a
    // read fails part-way through.
    try (BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0]))) {
        byte[] bytes = new byte[1000];
        int len;
        // A short read does not imply end of stream; only -1 does.
        while ((len = istr.read(bytes)) != -1) {
            ostr.write(bytes, 0, len);
        }
    }
    byte[] data = ostr.toByteArray();
    // make a fake Content
    Content content = new Content("", "", data, "text/html", new Metadata(), conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, conf.get("parser.character.encoding.default"));
    System.out.println("Guessed encoding: " + encoding);
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) BufferedInputStream(java.io.BufferedInputStream) Content(org.apache.nutch.protocol.Content) Metadata(org.apache.nutch.metadata.Metadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) FileInputStream(java.io.FileInputStream)

Example 4 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class FeedParser method main.

/**
 * Runs a command line version of this {@link Parser}: reads the feed file
 * into memory, parses it as <code>application/rss+xml</code> and prints the
 * key, parse data and parse text of every entry in the result.
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file. Any other argument
 *          count prints a usage message and exits with status 1.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("Usage: FeedParser <feed>");
        System.exit(1);
    }
    String name = args[0];
    String url = "file:" + name;
    Configuration conf = NutchConfiguration.create();
    FeedParser parser = new FeedParser();
    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    // try-with-resources closes the stream even if readFully throws (for
    // example when the file is shorter than the length reported above).
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
        in.readFully(bytes);
    }
    // The same file: URL is passed for both URL arguments of Content.
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
        System.out.println("key: " + entry.getKey());
        Parse parse = entry.getValue();
        System.out.println("data: " + parse.getData());
        System.out.println("text: " + parse.getText() + "\n");
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) ParseResult(org.apache.nutch.parse.ParseResult) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream) Content(org.apache.nutch.protocol.Content) SyndContent(com.rometools.rome.feed.synd.SyndContent) File(java.io.File)

Example 5 with Content

use of org.apache.nutch.protocol.Content in project nutch by apache.

the class FeedParser method addToMap.

/**
 * Adds one feed entry to <code>parseResult</code>.
 * <p>
 * The entry link is normalized (outlink scope) and filtered; the entry is
 * silently skipped when the link ends up <code>null</code> or when
 * normalizing/filtering throws. The entry text is taken from the
 * description, or, when that yields no text, from the concatenated values
 * of the entry contents. The text is then handed to the first parser
 * registered for the feed's content type. If a parse is obtained, its text,
 * outlinks and metadata are stored with a success status; otherwise the raw
 * text is stored with a failure status and no outlinks.
 *
 * @param parseResult
 *          Result the entry is added to, keyed by the processed link.
 * @param feed
 *          Feed the entry belongs to; passed on to <code>addFields</code>.
 * @param feedLink
 *          Link of the feed, recorded under the <code>feed</code> key of the
 *          parse metadata when not <code>null</code>.
 * @param entry
 *          The feed entry to add.
 * @param content
 *          Content of the feed; its metadata supplies the content type and
 *          is passed along with the entry.
 */
private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink, SyndEntry entry, Content content) {
    String link = entry.getLink(), text = null, title = null;
    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
    Parse parse = null;
    SyndContent description = entry.getDescription();
    try {
        link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
        if (link != null) {
            link = filters.filter(link);
        }
    } catch (Exception e) {
        // A link that cannot be normalized or filtered drops the entry.
        e.printStackTrace();
        return;
    }
    if (link == null) {
        return;
    }
    title = stripTags(entry.getTitleEx());
    if (feedLink != null) {
        parseMeta.set("feed", feedLink);
    }
    addFields(parseMeta, contentMeta, feed, entry);
    // some item descriptions contain markup text in them,
    // so the entry text is parsed by whichever plugin handles the
    // content type recorded in the feed's metadata
    String contentType = contentMeta.get(Response.CONTENT_TYPE);
    if (description != null) {
        text = description.getValue();
    }
    if (text == null) {
        // No usable description: fall back to the concatenated entry contents.
        List<?> contents = entry.getContents();
        StringBuilder buf = new StringBuilder();
        for (Object syndContent : contents) {
            buf.append(((SyndContent) syndContent).getValue());
        }
        text = buf.toString();
    }
    try {
        Parser parser = parserFactory.getParsers(contentType, link)[0];
        // Encode with an explicit charset so the bytes handed to the parser do
        // not depend on the platform default encoding.
        // NOTE(review): UTF-8 is assumed to suit the delegate parser - confirm.
        byte[] textBytes = text.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        parse = parser.getParse(new Content(link, link, textBytes, contentType, contentMeta, conf)).get(link);
    } catch (ParserNotFound e) {
        /* ignore: parse stays null and the failure branch below records the raw text */
    }
    if (parse != null) {
        ParseData data = parse.getData();
        // The content type is stripped from the metadata stored with the entry.
        data.getContentMeta().remove(Response.CONTENT_TYPE);
        mergeMetadata(data.getParseMeta(), parseMeta);
        parseResult.put(link, new ParseText(parse.getText()), new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data.getContentMeta(), data.getParseMeta()));
    } else {
        // NOTE(review): contentMeta comes from content.getMetadata(); if that is
        // the live object, this removal also affects the feed's Content - confirm.
        contentMeta.remove(Response.CONTENT_TYPE);
        parseResult.put(link, new ParseText(text), new ParseData(ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta, parseMeta));
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParserNotFound(org.apache.nutch.parse.ParserNotFound) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) Parser(org.apache.nutch.parse.Parser) ParseText(org.apache.nutch.parse.ParseText) SyndContent(com.rometools.rome.feed.synd.SyndContent) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) SyndContent(com.rometools.rome.feed.synd.SyndContent)

Aggregations

Content (org.apache.nutch.protocol.Content)51 Text (org.apache.hadoop.io.Text)30 Parse (org.apache.nutch.parse.Parse)29 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)27 Configuration (org.apache.hadoop.conf.Configuration)23 Metadata (org.apache.nutch.metadata.Metadata)23 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)22 ParseUtil (org.apache.nutch.parse.ParseUtil)20 Test (org.junit.Test)19 Protocol (org.apache.nutch.protocol.Protocol)17 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)16 ParseData (org.apache.nutch.parse.ParseData)8 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)8 ParseResult (org.apache.nutch.parse.ParseResult)7 URL (java.net.URL)6 File (java.io.File)5 FileInputStream (java.io.FileInputStream)5 IOException (java.io.IOException)5 Outlink (org.apache.nutch.parse.Outlink)5 HashMap (java.util.HashMap)4