Search in sources :

Example 16 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class MimeTypeIndexingFilterTest method setUp.

/**
 * Prepares one parse fixture per entry of {@code MIME_TYPES}.
 * <p>
 * Each fixture is a {@link ParseImpl} with the text {@code "text"}, the title
 * {@code "title"}, no outlinks, and a metadata object holding the MIME type
 * under {@code Response.CONTENT_TYPE}. Fixtures are stored in {@code parses}
 * at the index of the MIME type they were built from.
 */
@Before
public void setUp() throws Exception {
    int slot = 0;
    for (String mimeType : MIME_TYPES) {
        Metadata contentTypeMeta = new Metadata();
        contentTypeMeta.add(Response.CONTENT_TYPE, mimeType);
        ParseData data = new ParseData(new ParseStatus(), "title", new Outlink[0], contentTypeMeta);
        parses[slot++] = new ParseImpl("text", data);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) Before(org.junit.Before)

Example 17 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class HtmlParser method getParse.

/**
 * Parses the given HTML content into a {@link ParseResult}.
 * <p>
 * The method proceeds in these steps:
 * <ol>
 * <li>build the base URL from {@code content.getBaseUrl()};</li>
 * <li>guess the character encoding, record it in the parse metadata and
 * parse the bytes into a DOM fragment;</li>
 * <li>collect the HTML meta directives and copy the general tags into the
 * parse metadata;</li>
 * <li>extract text and title unless the directives say "noindex";</li>
 * <li>extract outlinks unless the directives say "nofollow";</li>
 * <li>mark the status as a redirect when a meta refresh is present;</li>
 * <li>run the HTML parse filters and, when the directives say "nocache",
 * store the caching policy in every resulting parse.</li>
 * </ol>
 * If the base URL is malformed, or parsing throws, the empty parse result
 * derived from that exception is returned instead.
 *
 * @param content the content to parse
 * @return the filtered parse result, or an empty parse result on failure
 */
public ParseResult getParse(Content content) {
    HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException badBaseUrl) {
        return failedParse(content, badBaseUrl);
    }

    Metadata parseMeta = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
        byte[] rawBytes = content.getContent();
        InputSource source = new InputSource(new ByteArrayInputStream(rawBytes));

        EncodingDetector detector = new EncodingDetector(conf);
        detector.autoDetectClues(content, true);
        detector.addClue(sniffCharacterEncoding(rawBytes), "sniffed");
        String charset = detector.guessEncoding(content, defaultCharEncoding);

        // the guessed encoding is recorded under both keys and used for parsing
        parseMeta.set(Metadata.ORIGINAL_CHAR_ENCODING, charset);
        parseMeta.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, charset);
        source.setEncoding(charset);

        if (LOG.isTraceEnabled()) {
            LOG.trace("Parsing...");
        }
        root = parse(source);
    } catch (IOException ioFailure) {
        return failedParse(content, ioFailure);
    } catch (DOMException domFailure) {
        return failedParse(content, domFailure);
    } catch (SAXException saxFailure) {
        return failedParse(content, saxFailure);
    } catch (Exception unexpected) {
        // only unexpected failures are logged before giving up
        LOG.error("Error: ", unexpected);
        return failedParse(content, unexpected);
    }

    // get meta directives and copy them into the Nutch metadata
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    parseMeta.addAll(metaTags.getGeneralTags());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // text and title stay empty when indexing is forbidden
    String text = "";
    String title = "";
    if (!metaTags.getNoIndex()) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        StringBuffer textBuffer = new StringBuffer();
        utils.getText(textBuffer, root);
        text = textBuffer.toString();

        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        StringBuffer titleBuffer = new StringBuffer();
        utils.getTitle(titleBuffer, root);
        title = titleBuffer.toString().trim();
    }

    // outlinks stay empty when following links is forbidden
    Outlink[] outlinks = new Outlink[0];
    if (!metaTags.getNoFollow()) {
        ArrayList<Outlink> collected = new ArrayList<Outlink>();

        // links resolve against the <base> href when it yields a valid URL,
        // otherwise against the document base
        URL linkBase = base;
        String baseHref = utils.getBase(root);
        if (baseHref != null) {
            try {
                linkBase = new URL(base, baseHref);
            } catch (MalformedURLException ignored) {
                // linkBase still holds the document base
            }
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(linkBase, collected, root);
        outlinks = collected.toArray(new Outlink[collected.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        String refreshTarget = metaTags.getRefreshHref().toString();
        String refreshDelay = Integer.toString(metaTags.getRefreshTime());
        status.setArgs(new String[] { refreshTarget, refreshDelay });
    }

    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), parseMeta);
    ParseImpl parse = new ParseImpl(text, parseData);
    ParseResult unfiltered = ParseResult.createParseResult(content.getUrl(), parse);

    // run filters on parse
    ParseResult filtered = this.htmlParseFilters.filter(content, unfiltered, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache: record the caching policy on every parse
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> parsed : filtered) {
            parsed.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
        }
    }
    return filtered;
}

/**
 * Builds the empty parse result for content whose parsing failed.
 *
 * @param content the content that could not be parsed
 * @param cause   the exception that stopped parsing
 * @return the empty parse result derived from a status wrapping {@code cause}
 */
private ParseResult failedParse(Content content, Throwable cause) {
    return new ParseStatus(cause).getEmptyParseResult(content.getUrl(), getConf());
}
Also used : MalformedURLException(java.net.MalformedURLException) InputSource(org.xml.sax.InputSource) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) SAXException(org.xml.sax.SAXException) ParseStatus(org.apache.nutch.parse.ParseStatus) DOMException(org.w3c.dom.DOMException) EncodingDetector(org.apache.nutch.util.EncodingDetector) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) IOException(java.io.IOException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map)

Example 18 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestMetatagParser method testIt.

/**
 * Test defaults: keywords and description.
 * <p>
 * Parses {@code sampleFile} with a freshly created configuration and checks
 * that the {@code metatag.description} and {@code metatag.keywords} entries
 * of the resulting metadata equal the expected values.
 */
@Test
public void testIt() {
    Metadata extracted = parseMeta(sampleFile, NutchConfiguration.create());
    // check that we get the same values
    Assert.assertEquals(description, extracted.get("metatag.description"));
    Assert.assertEquals(keywords, extracted.get("metatag.keywords"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Metadata(org.apache.nutch.metadata.Metadata) Test(org.junit.Test)

Example 19 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class SmallStack method main.

/**
 * Arguments are: 0. Name of input SWF file.
 * <p>
 * Reads the whole file into memory, parses it with {@code SWFParser} as
 * {@code application/x-shockwave-flash} content and prints the parse text and
 * parse data to standard output.
 *
 * @param args command-line arguments; {@code args[0]} is the input file name
 * @throws IOException if the input file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    // Read until end-of-stream: available() is only an estimate and a single
    // read(byte[]) may return fewer bytes than requested, so neither can be
    // relied on to load the complete file.
    java.io.ByteArrayOutputStream fileBytes = new java.io.ByteArrayOutputStream();
    FileInputStream in = new FileInputStream(args[0]);
    try {
        byte[] chunk = new byte[8192];
        int count;
        while ((count = in.read(chunk)) != -1) {
            fileBytes.write(chunk, 0, count);
        }
    } finally {
        // release the file handle even when reading fails
        in.close();
    }
    byte[] buf = fileBytes.toByteArray();

    // the same URL serves as content URL, base URL and lookup key
    String url = "file:" + args[0];
    SWFParser parser = new SWFParser();
    ParseResult parseResult = parser.getParse(new Content(url, url, buf, "application/x-shockwave-flash", new Metadata(), NutchConfiguration.create()));
    Parse p = parseResult.get(url);
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
}
Also used : ParseResult(org.apache.nutch.parse.ParseResult) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) FileInputStream(java.io.FileInputStream)

Example 20 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestRTFParser method testIt.

/**
 * Parses the sample RTF file with the {@code parse-tika} extension and checks
 * the extracted text, title and Dublin Core subject.
 * <p>
 * The content is obtained through the {@link Protocol} that
 * {@link ProtocolFactory} returns for the sample file's {@code file:} URL.
 * The test is currently disabled via {@code @Ignore}.
 */
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
    Configuration conf = NutchConfiguration.create();
    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;

    // fetch the sample file
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    // parse it and pick the parse stored under the content's own URL
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    Assert.assertEquals("The quick brown fox jumps over the lazy dog", parse.getText().trim());

    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();
    Assert.assertEquals("test rft document", title);
    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Metadata (org.apache.nutch.metadata.Metadata)42 Configuration (org.apache.hadoop.conf.Configuration)20 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)20 ParseData (org.apache.nutch.parse.ParseData)19 Content (org.apache.nutch.protocol.Content)18 Test (org.junit.Test)17 Text (org.apache.hadoop.io.Text)16 Parse (org.apache.nutch.parse.Parse)16 ParseImpl (org.apache.nutch.parse.ParseImpl)15 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)14 Inlinks (org.apache.nutch.crawl.Inlinks)11 Outlink (org.apache.nutch.parse.Outlink)10 ParseStatus (org.apache.nutch.parse.ParseStatus)9 NutchDocument (org.apache.nutch.indexer.NutchDocument)7 ParseResult (org.apache.nutch.parse.ParseResult)7 FileInputStream (java.io.FileInputStream)5 IOException (java.io.IOException)5 File (java.io.File)4 ArrayList (java.util.ArrayList)4 ParseUtil (org.apache.nutch.parse.ParseUtil)4