Search in sources :

Example 31 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class SmallStack method getParse.

/**
 * Extracts text and outlinks from the raw bytes of {@code content} using the
 * {@code ExtractText} / {@code TagParser} / {@code SWFReader} pipeline and
 * wraps them in a {@link ParseResult}.
 *
 * <p>If a Content-Length header is present and differs from the number of
 * bytes actually held, an empty parse result with status
 * {@code FAILED / FAILED_TRUNCATED} is returned. Any exception raised while
 * parsing is logged and converted into an empty parse result with status
 * {@code FAILED}.
 *
 * @param content the fetched content to parse
 * @return the parse result for {@code content.getUrl()}; on success it carries
 *         the extracted text (never {@code null}), an empty title and the
 *         collected outlinks
 */
// NOTE(review): this listing contains no closing braces, and some statements
// look lost in extraction (see the notes below) - confirm against the
// upstream source before relying on it.
public ParseResult getParse(Content content) {
    String text = null;
    Vector<Outlink> outlinks = new Vector<>();
    try {
        byte[] raw = content.getContent();
        String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
        // The length check only runs when the header is present; a mismatch
        // means the fetched bytes are incomplete.
        if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + " bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), getConf());
        ExtractText extractor = new ExtractText();
        // TagParser implements SWFTags and drives a SWFTagTypes interface
        TagParser parser = new TagParser(extractor);
        // use this instead to debug the file
        // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
        // SWFReader reads an input file and drives a SWFTags interface
        SWFReader reader = new SWFReader(parser, new InStream(raw));
        // read the input SWF file and pass it through the interface pipeline
        // NOTE(review): `reader` is never used in the visible lines; the call
        // that actually reads the file appears to be missing here - confirm.
        text = extractor.getText();
        String atext = extractor.getActionText();
        // Append the action text, when there is any, after a separator line.
        if (atext != null && atext.length() > 0)
            text += "\n--------\n" + atext;
        // harvest potential outlinks
        String[] links = extractor.getUrls();
        for (int i = 0; i < links.length; i++) {
            // NOTE(review): `out` is not added to `outlinks` in the visible
            // lines - presumably a lost statement; confirm.
            Outlink out = new Outlink(links[i], "");
        // Second pass: look for outlinks inside the extracted text itself.
        Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
        if (olinks != null)
            // NOTE(review): this loop has no visible body - confirm.
            for (int i = 0; i < olinks.length; i++) {
    } catch (Exception e) {
        // run time exception
        LOG.error("Error, runtime exception: ", e);
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    // Normalise a missing text to the empty string before building the result.
    if (text == null)
        text = "";
    Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
    // The title argument is always the empty string here.
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
Also used : Outlink(org.apache.nutch.parse.Outlink) TagParser(com.anotherbigidea.flash.readers.TagParser) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) SWFReader(com.anotherbigidea.flash.readers.SWFReader) ParseData(org.apache.nutch.parse.ParseData) InStream(com.anotherbigidea.io.InStream) ParseImpl(org.apache.nutch.parse.ParseImpl) Vector(java.util.Vector)

Example 32 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TikaParser method getParse.

/**
 * Parses {@code content} with the Tika parser registered for its MIME type,
 * converts the SAX events into a DOM fragment, and derives text, title,
 * outlinks and metadata from it.
 *
 * <p>An empty parse result carrying a failure status is returned when the
 * base URL is malformed, when no Tika parser is available for the MIME type,
 * or when the Tika parser throws. Otherwise the result is passed through
 * {@code htmlParseFilters} before being returned.
 *
 * @param content the fetched content to parse
 * @return the filtered parse result, or an empty parse result on failure
 */
// NOTE(review): this listing contains no closing braces, and a few statements
// look lost in extraction (see the notes below) - confirm against the
// upstream source before relying on it.
public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();
    // Both settings are read from the configuration with defaults "none" and
    // "ArticleExtractor" respectively.
    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);
    Metadata tikamd = new Metadata();
    // DOM fragment that receives the SAX events produced by Tika.
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    DocumentFragment root = doc.createDocumentFragment();
    ContentHandler domHandler;
    // Check whether to use Tika's BoilerplateContentHandler
    if (useBoilerpipe) {
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
        domHandler = (ContentHandler) bpHandler;
    } else {
        DOMBuilder domBuilder = new DOMBuilder(doc, root);
        domHandler = (ContentHandler) domBuilder;
    LinkContentHandler linkContentHandler = new LinkContentHandler();
    ParseContext context = new ParseContext();
    // Feed the same SAX events to the DOM handler and the link collector.
    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
    if (HTMLMapper != null)
        context.set(HtmlMapper.class, HTMLMapper);
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
        parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
    } catch (Exception e) {
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
    HTMLMetaTags metaTags = new HTMLMetaTags();
    // Defaults used when the meta directives forbid indexing or following.
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        // extract title
        // NOTE(review): `sb` is not reset between the text and title
        // extraction in the visible lines - confirm whether a reset was lost.
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<Outlink>();
        URL baseTag = base;
        // A "Content-Location" value in the Tika metadata, when present, is
        // resolved against the base URL and used for link extraction.
        String baseTagHref = tikamd.get("Content-Location");
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                // Invalid value: keep `baseTag` equal to `base`.
                LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links (base URL = {}) ...", baseTag);
        // pre-1233 outlink extraction
        // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        // Get outlinks from Tika
        List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
        utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        // NOTE(review): as listed, this `if` has no statement of its own and
        // would govern the declaration on the next line; the statement it
        // guarded (presumably a skip of the title entry) looks lost - confirm.
        if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
        String[] values = tikamd.getValues(tikaMDName);
        for (String v : values) nutchMetadata.add(tikaMDName, v);
    // Fall back to extracting outlinks from the plain text when none were found.
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    // A meta refresh directive is recorded as status arguments: target URL
    // followed by the refresh time.
    if (metaTags.getRefresh()) {
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        // NOTE(review): the first type argument of Map.Entry is missing in
        // this listing - confirm the key type against the upstream source.
        for (Map.Entry<, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    return filteredParse;
Also used : MalformedURLException(java.net.MalformedURLException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ParseStatus(org.apache.nutch.parse.ParseStatus) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) MalformedURLException(java.net.MalformedURLException) Parser(org.apache.tika.parser.Parser) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) HtmlMapper(org.apache.tika.parser.html.HtmlMapper) ParseContext(org.apache.tika.parser.ParseContext) ParseImpl(org.apache.nutch.parse.ParseImpl) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Map(java.util.Map) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) Link(org.apache.tika.sax.Link)

Example 33 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestFeedParser method testIt.

/**
 * <p>
 * The test method: tests out the following 2 asserts:
 * </p>
 * <ul>
 * <li>There are 2 outlinks read from the sample rss file</li>
 * <li>The 2 outlinks read are in fact the correct outlinks from the sample
 * file</li>
 * </ul>
 */
public void testIt() throws ProtocolException, ParseException {
    // NOTE(review): this listing contains no closing braces and some tokens
    // look lost in extraction (see the notes below) - confirm against the
    // upstream source before relying on it.
    String urlString;
    Protocol protocol;
    Content content;
    Parse parse;
    Configuration conf = NutchConfiguration.create();
    // Each sample file is fetched through its protocol and then parsed with
    // the "parse-tika" plugin.
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());
        // check that there are 2 outlinks:
        // unlike the original parse-rss
        // tika ignores the URL and description of the channel
        ParseData theParseData = parse.getData();
        Outlink[] theOutlinks = theParseData.getOutlinks();
        Assert.assertTrue("There aren't 2 outlinks read!", theOutlinks.length == 2);
        // now check to make sure that those are the two outlinks
        boolean hasLink1 = false, hasLink2 = false;
        for (int j = 0; j < theOutlinks.length; j++) {
            // NOTE(review): both comparisons below use an empty string literal
            // in this listing; the expected URLs were presumably stripped -
            // confirm.
            if (theOutlinks[j].getToUrl().equals("")) {
                hasLink1 = true;
            if (theOutlinks[j].getToUrl().equals("")) {
                hasLink2 = true;
        // The test is meant to fail unless both expected links were seen.
        // NOTE(review): the call that takes the message below is missing from
        // this listing - confirm.
        if (!hasLink1 || !hasLink2) {
  "Outlinks read from sample rss file are not correct!");
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ParseData(org.apache.nutch.parse.ParseData) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) Test(org.junit.Test)

Example 34 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class ZipParser method getParse.

public ParseResult getParse(final Content content) {
    String resultText = null;
    String resultTitle = null;
    Outlink[] outlinks = null;
    List<Outlink> outLinksList = new ArrayList<Outlink>();
    try {
        final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH);
        final int len = Integer.parseInt(contentLen);
        if (LOG.isDebugEnabled()) {
            LOG.debug("ziplen: " + len);
        final byte[] contentInBytes = content.getContent();
        if (contentLen != null && contentInBytes.length != len) {
            return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + contentInBytes.length + " bytes. Parser can't handle incomplete zip file.").getEmptyParseResult(content.getUrl(), getConf());
        ZipTextExtractor extractor = new ZipTextExtractor(getConf());
        // extract text
        resultText = extractor.extractText(new ByteArrayInputStream(contentInBytes), content.getUrl(), outLinksList);
    } catch (Exception e) {
        return new ParseStatus(ParseStatus.FAILED, "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    if (resultText == null) {
        resultText = "";
    if (resultTitle == null) {
        resultTitle = "";
    outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, resultTitle, outlinks, content.getMetadata());
    if (LOG.isTraceEnabled()) {
        LOG.trace("Zip file parsed sucessfully !!");
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));
Also used : Outlink(org.apache.nutch.parse.Outlink) ArrayList(java.util.ArrayList) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)

Example 35 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestIndexingFilters method testNutchDocumentNullIndexingFilter.

/**
 * Test behaviour when NutchDocument is null
 */
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    IndexingFilters filters = new IndexingFilters(conf);
    // The document argument is deliberately null; every other argument is a
    // minimal placeholder instance.
    // NOTE(review): `doc` is never checked and the closing brace is absent in
    // this listing - presumably an assertion on `doc` was lost; confirm.
    NutchDocument doc = filters.filter(null, new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(""), new CrawlDatum(), new Inlinks());
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)


ParseData (org.apache.nutch.parse.ParseData)37 ParseImpl (org.apache.nutch.parse.ParseImpl)29 Text ( ParseStatus (org.apache.nutch.parse.ParseStatus)23 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)22 Outlink (org.apache.nutch.parse.Outlink)22 Inlinks (org.apache.nutch.crawl.Inlinks)19 Metadata (org.apache.nutch.metadata.Metadata)19 Test (org.junit.Test)19 NutchDocument (org.apache.nutch.indexer.NutchDocument)16 Configuration (org.apache.hadoop.conf.Configuration)14 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)14 Parse (org.apache.nutch.parse.Parse)9 URL ( ArrayList (java.util.ArrayList)6 ParseResult (org.apache.nutch.parse.ParseResult)6 ByteArrayInputStream ( IOException ( Inlink (org.apache.nutch.crawl.Inlink)5 Content (org.apache.nutch.protocol.Content)5