Examples with ParseData - org.apache.nutch.parse.ParseData

Example 31 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class SmallStack method getParse.

@Override
public ParseResult getParse(Content content) {
    String text = null;
    Vector<Outlink> outlinks = new Vector<>();
    try {
        byte[] raw = content.getContent();
        String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), getConf());
        }
        ExtractText extractor = new ExtractText();
        // TagParser implements SWFTags and drives a SWFTagTypes interface
        TagParser parser = new TagParser(extractor);
        // use this instead to debug the file
        // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
        // SWFReader reads an input file and drives a SWFTags interface
        SWFReader reader = new SWFReader(parser, new InStream(raw));
        // read the input SWF file and pass it through the interface pipeline
        reader.readFile();
        text = extractor.getText();
        String atext = extractor.getActionText();
        if (atext != null && atext.length() > 0)
            text += "\n--------\n" + atext;
        // harvest potential outlinks
        String[] links = extractor.getUrls();
        for (int i = 0; i < links.length; i++) {
            Outlink out = new Outlink(links[i], "");
            outlinks.add(out);
        }
        Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
        if (olinks != null)
            for (int i = 0; i < olinks.length; i++) {
                outlinks.add(olinks[i]);
            }
    } catch (Exception e) {
        // run time exception
        LOG.error("Error, runtime exception: ", e);
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }
    if (text == null)
        text = "";
    Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}

Also used : Outlink(org.apache.nutch.parse.Outlink) TagParser(com.anotherbigidea.flash.readers.TagParser) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) SWFReader(com.anotherbigidea.flash.readers.SWFReader) ParseData(org.apache.nutch.parse.ParseData) InStream(com.anotherbigidea.io.InStream) ParseImpl(org.apache.nutch.parse.ParseImpl) Vector(java.util.Vector)

Example 32 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TikaParser method getParse.

@SuppressWarnings("deprecation")
public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();
    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        LOG.error(message);
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
    }
    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);
    Metadata tikamd = new Metadata();
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    ContentHandler domHandler;
    // Check whether to use Tika's BoilerplateContentHandler
    if (useBoilerpipe) {
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
        bpHandler.setIncludeMarkup(true);
        domHandler = (ContentHandler) bpHandler;
    } else {
        DOMBuilder domBuilder = new DOMBuilder(doc, root);
        domBuilder.setUpperCaseElementNames(upperCaseElementNames);
        domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        domHandler = (ContentHandler) domBuilder;
    }
    LinkContentHandler linkContentHandler = new LinkContentHandler();
    ParseContext context = new ParseContext();
    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
    if (HTMLMapper != null)
        context.set(HtmlMapper.class, HTMLMapper);
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
        parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
    } catch (Exception e) {
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
    }
    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        // extract title
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    }
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<Outlink>();
        URL baseTag = base;
        String baseTagHref = tikamd.get("Content-Location");
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links (base URL = {}) ...", baseTag);
        }
        // pre-1233 outlink extraction
        // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        // Get outlinks from Tika
        List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
        utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }
    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
            continue;
        String[] values = tikamd.getValues(tikaMDName);
        for (String v : values) nutchMetadata.add(tikaMDName, v);
    }
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
}

Also used : MalformedURLException(java.net.MalformedURLException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ParseStatus(org.apache.nutch.parse.ParseStatus) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) MalformedURLException(java.net.MalformedURLException) Parser(org.apache.tika.parser.Parser) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) HtmlMapper(org.apache.tika.parser.html.HtmlMapper) ParseContext(org.apache.tika.parser.ParseContext) ParseImpl(org.apache.nutch.parse.ParseImpl) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Map(java.util.Map) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) Link(org.apache.tika.sax.Link)

Example 33 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestFeedParser method testIt.

/**
 * <p>
 * The test method: tests out the following 2 asserts:
 * </p>
 *
 * <ul>
 * <li>There are 3 outlinks read from the sample rss file</li>
 * <li>The 3 outlinks read are in fact the correct outlinks from the sample
 * file</li>
 * </ul>
 */
@Test
public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
        // check that there are 2 outlinks:
        // unlike the original parse-rss
        // tika ignores the URL and description of the channel
        // http://test.channel.com
        // http://www-scf.usc.edu/~mattmann/
        // http://www.nutch.org
        ParseData theParseData = parse.getData();
        Outlink[] theOutlinks = theParseData.getOutlinks();
        Assert.assertTrue("There aren't 2 outlinks read!", theOutlinks.length == 2);
        // now check to make sure that those are the two outlinks
        boolean hasLink1 = false, hasLink2 = false;
        for (int j = 0; j < theOutlinks.length; j++) {
            if (theOutlinks[j].getToUrl().equals("http://www-scf.usc.edu/~mattmann/")) {
                hasLink1 = true;
            }
            if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
                hasLink2 = true;
            }
        }
        if (!hasLink1 || !hasLink2) {
            Assert.fail("Outlinks read from sample rss file are not correct!");
        }
    }
}

Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 34 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class ZipParser method getParse.

public ParseResult getParse(final Content content) {
    String resultText = null;
    String resultTitle = null;
    Outlink[] outlinks = null;
    List<Outlink> outLinksList = new ArrayList<Outlink>();
    try {
        final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH);
        final int len = Integer.parseInt(contentLen);
        if (LOG.isDebugEnabled()) {
            LOG.debug("ziplen: " + len);
        }
        final byte[] contentInBytes = content.getContent();
        if (contentLen != null && contentInBytes.length != len) {
            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + contentInBytes.length + " bytes. Parser can't handle incomplete zip file.").getEmptyParseResult(content.getUrl(), getConf());
        }
        ZipTextExtractor extractor = new ZipTextExtractor(getConf());
        // extract text
        resultText = extractor.extractText(new ByteArrayInputStream(contentInBytes), content.getUrl(), outLinksList);
    } catch (Exception e) {
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }
    if (resultText == null) {
        resultText = "";
    }
    if (resultTitle == null) {
        resultTitle = "";
    }
    outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, resultTitle, outlinks, content.getMetadata());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Zip file parsed sucessfully !!");
    }
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));
}

Also used : Outlink(org.apache.nutch.parse.Outlink) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 35 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestIndexingFilters method testNutchDocumentNullIndexingFilter.

/**
 * Test behaviour when NutchDOcument is null
 */
@Test
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    IndexingFilters filters = new IndexingFilters(conf);
    NutchDocument doc = filters.filter(null, new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertNull(doc);
}

Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Aggregations

ParseData (org.apache.nutch.parse.ParseData)37 ParseImpl (org.apache.nutch.parse.ParseImpl)29 Text (org.apache.hadoop.io.Text)23 ParseStatus (org.apache.nutch.parse.ParseStatus)23 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)22 Outlink (org.apache.nutch.parse.Outlink)22 Inlinks (org.apache.nutch.crawl.Inlinks)19 Metadata (org.apache.nutch.metadata.Metadata)19 Test (org.junit.Test)19 NutchDocument (org.apache.nutch.indexer.NutchDocument)16 Configuration (org.apache.hadoop.conf.Configuration)14 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)14 Parse (org.apache.nutch.parse.Parse)9 URL (java.net.URL)7 ArrayList (java.util.ArrayList)6 ParseResult (org.apache.nutch.parse.ParseResult)6 ByteArrayInputStream (java.io.ByteArrayInputStream)5 IOException (java.io.IOException)5 Inlink (org.apache.nutch.crawl.Inlink)5 Content (org.apache.nutch.protocol.Content)5