Search in sources :

Example 26 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

The getParse method of the class SmallStack:

/**
 * Parses an SWF (Flash) document: extracts its text (including action-script
 * text) and harvests outlinks both from the SWF's embedded URLs and from the
 * extracted text.
 *
 * @param content the fetched content to parse; must carry the raw SWF bytes
 * @return a successful {@link ParseResult} with text and outlinks, or an
 *         empty result with a FAILED status when the content is truncated or
 *         the SWF reader throws
 */
@Override
public ParseResult getParse(Content content) {
    String text = null;
    Vector<Outlink> outlinks = new Vector<>();
    try {
        byte[] raw = content.getContent();
        // Refuse truncated downloads: the SWF reader cannot recover from an
        // incomplete file. Only checked when the protocol layer reported a
        // Content-Length header.
        String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                "Content truncated at " + raw.length
                    + " bytes. Parser can't handle incomplete files.")
                .getEmptyParseResult(content.getUrl(), getConf());
        }
        ExtractText extractor = new ExtractText();
        // TagParser implements SWFTags and drives a SWFTagTypes interface
        TagParser parser = new TagParser(extractor);
        // use this instead to debug the file
        // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
        // SWFReader reads an input file and drives a SWFTags interface
        SWFReader reader = new SWFReader(parser, new InStream(raw));
        // read the input SWF file and pass it through the interface pipeline
        reader.readFile();
        text = extractor.getText();
        // Append text harvested from ActionScript, if any, separated from the
        // main text by a marker line.
        String atext = extractor.getActionText();
        if (atext != null && atext.length() > 0) {
            text += "\n--------\n" + atext;
        }
        // harvest outlinks embedded directly in the SWF
        for (String link : extractor.getUrls()) {
            outlinks.add(new Outlink(link, ""));
        }
        // also harvest outlinks appearing as plain URLs in the extracted text
        // (use getConf() consistently with the rest of this method)
        Outlink[] olinks = OutlinkExtractor.getOutlinks(text, getConf());
        if (olinks != null) {
            for (Outlink olink : olinks) {
                outlinks.add(olink);
            }
        }
    } catch (Exception e) {
        // run time exception
        LOG.error("Error, runtime exception: ", e);
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }
    if (text == null) {
        text = "";
    }
    Outlink[] links = outlinks.toArray(new Outlink[0]);
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) TagParser(com.anotherbigidea.flash.readers.TagParser) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) SWFReader(com.anotherbigidea.flash.readers.SWFReader) ParseData(org.apache.nutch.parse.ParseData) InStream(com.anotherbigidea.io.InStream) ParseImpl(org.apache.nutch.parse.ParseImpl) Vector(java.util.Vector)

Example 27 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

The getParse method of the class TikaParser:

/**
 * Parses fetched content with Apache Tika, converts the SAX event stream into
 * a DOM fragment, then applies Nutch's standard HTML processing to it: meta
 * directives (noindex/nofollow/nocache/refresh), text and title extraction,
 * outlink harvesting, and the configured HTML parse filters.
 *
 * NOTE(review): relies on instance state (tikaConfig, upperCaseElementNames,
 * HTMLMapper, utils, htmlParseFilters, cachingPolicy) initialized elsewhere
 * in this class — assumed configured before this method runs.
 */
@SuppressWarnings("deprecation")
public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();
    // Boilerpipe (main-content extraction) is opt-in via tika.extractor.
    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        // No valid base URL -> cannot resolve links; fail the parse early.
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        LOG.error(message);
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
    }
    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);
    Metadata tikamd = new Metadata();
    // Build an HTML DOM that Tika's SAX events will be written into; error
    // checking is disabled because Tika may emit non-well-formed structure.
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    ContentHandler domHandler;
    // Check whether to use Tika's BoilerplateContentHandler
    if (useBoilerpipe) {
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
        bpHandler.setIncludeMarkup(true);
        domHandler = (ContentHandler) bpHandler;
    } else {
        DOMBuilder domBuilder = new DOMBuilder(doc, root);
        domBuilder.setUpperCaseElementNames(upperCaseElementNames);
        domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        domHandler = (ContentHandler) domBuilder;
    }
    // Tee the SAX stream: one branch builds the DOM, the other collects links.
    LinkContentHandler linkContentHandler = new LinkContentHandler();
    ParseContext context = new ParseContext();
    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
    if (HTMLMapper != null)
        context.set(HtmlMapper.class, HTMLMapper);
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
        parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
    } catch (Exception e) {
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
    }
    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        // extract title
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    }
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<Outlink>();
        URL baseTag = base;
        // Honour a <base href> equivalent reported by Tika (Content-Location)
        // when resolving relative links; fall back to the fetch base URL.
        String baseTagHref = tikamd.get("Content-Location");
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links (base URL = {}) ...", baseTag);
        }
        // pre-1233 outlink extraction
        // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        // Get outlinks from Tika
        List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
        utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }
    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        // Title is already carried in ParseData; skip it to avoid duplication.
        if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
            continue;
        String[] values = tikamd.getValues(tikaMDName);
        for (String v : values) nutchMetadata.add(tikaMDName, v);
    }
    // Fall back to regex-based outlink extraction from plain text when the
    // DOM/Tika link harvesting produced nothing.
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    // A meta-refresh directive is reported as a redirect so the crawler can
    // follow it.
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ParseStatus(org.apache.nutch.parse.ParseStatus) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) MalformedURLException(java.net.MalformedURLException) Parser(org.apache.tika.parser.Parser) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) HtmlMapper(org.apache.tika.parser.html.HtmlMapper) ParseContext(org.apache.tika.parser.ParseContext) ParseImpl(org.apache.nutch.parse.ParseImpl) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Map(java.util.Map) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) Link(org.apache.tika.sax.Link)

Example 28 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

The getParse method of the class ZipParser:

/**
 * Parses a ZIP archive: extracts the text of its entries and any outlinks
 * found while doing so.
 *
 * @param content the fetched content carrying the raw ZIP bytes
 * @return a successful {@link ParseResult} with extracted text and outlinks,
 *         or an empty result with a FAILED status on truncation or extraction
 *         failure
 */
public ParseResult getParse(final Content content) {
    String resultText = null;
    String resultTitle = null;
    Outlink[] outlinks = null;
    List<Outlink> outLinksList = new ArrayList<Outlink>();
    try {
        final byte[] contentInBytes = content.getContent();
        // Validate against the reported length only when the protocol layer
        // actually supplied a Content-Length header. (The previous version
        // called Integer.parseInt(contentLen) BEFORE the null check, so any
        // response without the header failed with NumberFormatException.)
        final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (contentLen != null) {
            final int len = Integer.parseInt(contentLen);
            if (LOG.isDebugEnabled()) {
                LOG.debug("ziplen: " + len);
            }
            if (contentInBytes.length != len) {
                return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                    "Content truncated at " + contentInBytes.length
                        + " bytes. Parser can't handle incomplete zip file.")
                    .getEmptyParseResult(content.getUrl(), getConf());
            }
        }
        ZipTextExtractor extractor = new ZipTextExtractor(getConf());
        // extract text; outlinks discovered inside the archive are appended
        // to outLinksList by the extractor
        resultText = extractor.extractText(new ByteArrayInputStream(contentInBytes), content.getUrl(), outLinksList);
    } catch (Exception e) {
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }
    if (resultText == null) {
        resultText = "";
    }
    if (resultTitle == null) {
        resultTitle = "";
    }
    outlinks = outLinksList.toArray(new Outlink[0]);
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, resultTitle, outlinks, content.getMetadata());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Zip file parsed successfully!");
    }
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 29 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

The testNutchDocumentNullIndexingFilter method of the class TestIndexingFilters:

/**
 * Verify that filtering a null NutchDocument yields null.
 */
@Test
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
    Configuration configuration = NutchConfiguration.create();
    configuration.addResource("nutch-default.xml");
    configuration.addResource("crawl-tests.xml");
    // Minimal parse object: empty status, no outlinks, no metadata.
    ParseImpl parse = new ParseImpl("text",
        new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata()));
    IndexingFilters indexingFilters = new IndexingFilters(configuration);
    NutchDocument filtered = indexingFilters.filter(null, parse,
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertNull(filtered);
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 30 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

The testFilterCacheIndexingFilter method of the class TestIndexingFilters:

/**
 * Verify that re-setting the indexing filter order does not take effect:
 * both filter chains produce documents with the same number of fields.
 *
 * @throws IndexingException
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
    Configuration config = NutchConfiguration.create();
    config.addResource("nutch-default.xml");
    config.addResource("crawl-tests.xml");
    String basicFilterClass = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    config.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilterClass);
    IndexingFilters firstChain = new IndexingFilters(config);
    NutchDocument firstDoc = firstChain.filter(new NutchDocument(),
        new ParseImpl("text",
            new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    // Append a second index filter to the order.
    String metadataFilterClass = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // Content metadata consumed by MetadataIndexer.
    Metadata contentMetadata = new Metadata();
    contentMetadata.add("example", "data");
    // Content metadata property defined in MetadataIndexer.
    config.set("index.content.md", "example");
    // Add the MetadataIndexer filter to the chain.
    config.set(IndexingFilters.INDEXINGFILTER_ORDER,
        basicFilterClass + " " + metadataFilterClass);
    IndexingFilters secondChain = new IndexingFilters(config);
    NutchDocument secondDoc = secondChain.filter(new NutchDocument(),
        new ParseImpl("text",
            new ParseData(new ParseStatus(), "title", new Outlink[0], contentMetadata)),
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals(firstDoc.getFieldNames().size(), secondDoc.getFieldNames().size());
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Aggregations

ParseImpl (org.apache.nutch.parse.ParseImpl)31 ParseData (org.apache.nutch.parse.ParseData)29 Text (org.apache.hadoop.io.Text)21 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)21 ParseStatus (org.apache.nutch.parse.ParseStatus)21 Inlinks (org.apache.nutch.crawl.Inlinks)20 Outlink (org.apache.nutch.parse.Outlink)17 Test (org.junit.Test)17 NutchDocument (org.apache.nutch.indexer.NutchDocument)16 Metadata (org.apache.nutch.metadata.Metadata)15 Configuration (org.apache.hadoop.conf.Configuration)13 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)13 URL (java.net.URL)6 IOException (java.io.IOException)5 Inlink (org.apache.nutch.crawl.Inlink)5 Parse (org.apache.nutch.parse.Parse)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 ArrayList (java.util.ArrayList)4 ParseResult (org.apache.nutch.parse.ParseResult)4 MalformedURLException (java.net.MalformedURLException)3