Search in sources :

Example 1 with LinkContentHandler

use of org.apache.tika.sax.LinkContentHandler in project tika by apache.

In the class RollbackSoftware, method rollback.

/**
 * Switches to the second-to-last entry found in the deployment area.
 *
 * <p>The absolute path of {@code deployArea} is fed, as UTF-8 text, to a
 * {@code DeploymentAreaParser}; the links it emits are collected, ordered by their text,
 * and the text of the second-to-last link is passed to {@code updateVersion}.
 *
 * <p>NOTE(review): ordering is a plain {@link String#compareTo} on the link text, so it is
 * lexicographic (e.g. "10" sorts before "9") - confirm this matches the version naming scheme.
 *
 * @param deployArea location whose absolute path is handed to the parser
 * @throws IOException if fewer than two links were collected
 * @throws SAXException declared by the signature; raised by the parse call
 * @throws TikaException declared by the signature; raised by the parse call
 */
public void rollback(File deployArea) throws IOException, SAXException, TikaException {
    LinkContentHandler linkCollector = new LinkContentHandler();
    Metadata parseMetadata = new Metadata();
    DeploymentAreaParser areaParser = new DeploymentAreaParser();
    areaParser.parse(IOUtils.toInputStream(deployArea.getAbsolutePath(), UTF_8), linkCollector, parseMetadata);

    List<Link> versions = linkCollector.getLinks();
    if (versions.size() < 2) {
        throw new IOException("Must have installed at least 2 versions!");
    }

    // Order the collected links by their text so the last two entries can be addressed by index.
    Comparator<Link> byText = new Comparator<Link>() {

        @Override
        public int compare(Link left, Link right) {
            return left.getText().compareTo(right.getText());
        }
    };
    Collections.sort(versions, byText);

    // The last entry sorts highest; the one just before it is the rollback target.
    Link previous = versions.get(versions.size() - 2);
    this.updateVersion(previous.getText());
}
Also used : LinkContentHandler(org.apache.tika.sax.LinkContentHandler) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) Link(org.apache.tika.sax.Link)

Example 2 with LinkContentHandler

use of org.apache.tika.sax.LinkContentHandler in project tika by apache.

In the class HtmlParserTest, method testCustomHtmlSchema.

// TIKA-1193
/**
 * Parses the same markup twice - once with the default schema and once with a custom
 * {@code Schema} supplied through the {@code ParseContext} - and checks the text collected
 * for the first link in each case.
 */
@Test
public void testCustomHtmlSchema() throws Exception {
    // A table nested inside an anchor, which the default schema does not allow
    String html = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
    byte[] htmlBytes = html.getBytes(ISO_8859_1);
    Metadata metadata = new Metadata();

    // First pass: default schema, so the anchor is expected to carry no text
    LinkContentHandler defaultLinks = new LinkContentHandler();
    new HtmlParser().parse(new ByteArrayInputStream(htmlBytes), defaultLinks, metadata, new ParseContext());
    assertEquals("", defaultLinks.getLinks().get(0).getText());

    // Second pass: redefine the "a" element so that tables are allowed inside anchors
    // (presumably 65535 is the full membership mask - confirm against the TagSoup Schema API)
    Schema customSchema = new HTMLSchema();
    customSchema.elementType("a", HTMLSchema.M_ANY, 65535, 0);
    ParseContext customContext = new ParseContext();
    customContext.set(Schema.class, customSchema);

    LinkContentHandler customLinks = new LinkContentHandler();
    new HtmlParser().parse(new ByteArrayInputStream(htmlBytes), customLinks, metadata, customContext);
    // Now the table cell text is expected as the anchor text
    assertEquals("\ttext\n\n", customLinks.getLinks().get(0).getText());
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 3 with LinkContentHandler

use of org.apache.tika.sax.LinkContentHandler in project tika by apache.

In the class TIAParsingExample, method testTeeContentHandler.

/**
 * Demonstrates fanning one parse out to two handlers with a {@code TeeContentHandler}.
 *
 * <p>An empty in-memory stream is parsed with an {@code AutoDetectParser}; the SAX events go
 * both to a {@code BodyContentHandler} writing to the named file and to a
 * {@code LinkContentHandler}. The output stream is closed when parsing ends or fails.
 *
 * @param filename path of the file the body handler writes to
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static void testTeeContentHandler(String filename) throws Exception {
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();
    Parser autoParser = new AutoDetectParser();
    LinkContentHandler links = new LinkContentHandler();

    File target = new File(filename);
    try (OutputStream sink = new FileOutputStream(target)) {
        BodyContentHandler bodyWriter = new BodyContentHandler(sink);
        // Every SAX event is delivered to both the body writer and the link collector.
        ContentHandler tee = new TeeContentHandler(bodyWriter, links);
        autoParser.parse(emptyInput, tee, meta, parseContext);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ByteArrayInputStream(java.io.ByteArrayInputStream) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) FileOutputStream(java.io.FileOutputStream) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) File(java.io.File)

Example 4 with LinkContentHandler

use of org.apache.tika.sax.LinkContentHandler in project nutch by apache.

In the class TikaParser, method getParse.

/**
 * Parses the given content with the Tika parser looked up for its content type and turns the
 * outcome into a {@link ParseResult}.
 *
 * <p>The SAX events produced by Tika are sent to two handlers at once: a {@code DOMBuilder}
 * (wrapped in a {@link BoilerpipeContentHandler} when the {@code tika.extractor} setting equals
 * {@code boilerpipe}) and a {@link LinkContentHandler}. Meta directives, text and title are
 * then read from the resulting DOM fragment, while outlinks are derived from the links the
 * link handler collected.
 *
 * <p>An empty parse result is returned early when the base URL of the content is malformed,
 * when no parser is available for the mime type, or when the parser throws.
 *
 * @param content the content to parse; its content type, base URL, URL, raw bytes and metadata
 *                are read
 * @return the parse result after the HTML parse filters have run, or an empty parse result
 *         built from a {@link ParseStatus} when one of the early exits above is taken
 */
@SuppressWarnings("deprecation")
public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();
    // Boilerpipe is opt-in through configuration; the default is "none"
    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        // without a usable base URL nothing is parsed
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        LOG.error(message);
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), getConf());
    }
    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);
    Metadata tikamd = new Metadata();
    // DOM fragment that receives the SAX events emitted by the parser
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    ContentHandler domHandler;
    // Check whether to use Tika's BoilerplateContentHandler
    if (useBoilerpipe) {
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
        bpHandler.setIncludeMarkup(true);
        domHandler = (ContentHandler) bpHandler;
    } else {
        // NOTE(review): upper-case element names and the XHTML default namespace are only
        // configured on this branch, not on the Boilerpipe one - confirm that is intended
        DOMBuilder domBuilder = new DOMBuilder(doc, root);
        domBuilder.setUpperCaseElementNames(upperCaseElementNames);
        domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
        domHandler = (ContentHandler) domBuilder;
    }
    // collect links in parallel with building the DOM: the tee forwards each event to both handlers
    LinkContentHandler linkContentHandler = new LinkContentHandler();
    ParseContext context = new ParseContext();
    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
    // register the custom HTML mapper only when one is set
    if (HTMLMapper != null)
        context.set(HtmlMapper.class, HTMLMapper);
    // pass the content type to the parser through the Tika metadata
    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
    try {
        parser.parse(new ByteArrayInputStream(raw), (ContentHandler) teeContentHandler, tikamd, context);
    } catch (Exception e) {
        // any parser failure is logged and reported as a failed, empty parse result
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage()).getEmptyParseResult(content.getUrl(), getConf());
    }
    HTMLMetaTags metaTags = new HTMLMetaTags();
    // defaults that remain in place when the meta directives forbid indexing or following
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    if (!metaTags.getNoIndex()) {
        // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        // extract text
        utils.getText(sb, root);
        text = sb.toString();
        // reuse the same buffer for the title
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        // extract title
        utils.getTitle(sb, root);
        title = sb.toString().trim();
    }
    if (!metaTags.getNoFollow()) {
        // okay to follow links
        // extract outlinks
        ArrayList<Outlink> l = new ArrayList<Outlink>();
        URL baseTag = base;
        // the "Content-Location" metadata value is treated as the document's <base href>
        String baseTagHref = tikamd.get("Content-Location");
        if (baseTagHref != null) {
            try {
                baseTag = new URL(base, baseTagHref);
            } catch (MalformedURLException e) {
                // keep the content's own base URL when the value cannot be resolved
                LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
            }
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links (base URL = {}) ...", baseTag);
        }
        // pre-1233 outlink extraction
        // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        // Get outlinks from Tika
        List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
        utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }
    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        // the title entry is skipped; the title is passed to ParseData separately below
        if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
            continue;
        String[] values = tikamd.getValues(tikaMDName);
        for (String v : values) nutchMetadata.add(tikaMDName, v);
    }
    // fall back to extracting outlinks from the plain text when none were found above
    // NOTE(review): outlinks is also empty when nofollow is set, so this fallback runs in that
    // case too - confirm that is intended
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }
    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    // a meta refresh is reported as a redirect, with target and delay as status arguments
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) {
        // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
}
Also used : MalformedURLException(java.net.MalformedURLException) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ParseStatus(org.apache.nutch.parse.ParseStatus) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) DocumentFragment(org.w3c.dom.DocumentFragment) HTMLMetaTags(org.apache.nutch.parse.HTMLMetaTags) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) MalformedURLException(java.net.MalformedURLException) Parser(org.apache.tika.parser.Parser) HTMLDocumentImpl(org.apache.html.dom.HTMLDocumentImpl) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) HtmlMapper(org.apache.tika.parser.html.HtmlMapper) ParseContext(org.apache.tika.parser.ParseContext) ParseImpl(org.apache.nutch.parse.ParseImpl) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) Map(java.util.Map) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) Link(org.apache.tika.sax.Link)

Example 5 with LinkContentHandler

use of org.apache.tika.sax.LinkContentHandler in project acs-aem-commons by Adobe-Consulting-Services.

In the class BrokenLinksReport, method collectPaths.

/**
 * Collect references from a JCR property.
 * A property can be one of:
 * <ol>
 *     <li>A string containing a reference, e.g, fileReference=/content/dam/image.png. </li>
 *     <li>An array of strings, e.g, fileReference=[/content/dam/image1.png, /content/dam/image2.png]</li>
 *     <li>An html fragment containing links , e.g,
 *     <pre>
 *       &lt;p&gt;
 *         &lt;a href="/content/site/page.html"&gt;hello&lt;/a&gt;
 *         &lt;img src="/content/dam/image1.png"&gt;
 *       &lt;/p&gt;
 *     </pre>
 *     </li>
 * </ol>
 * Values of any other type, including {@code null}, yield an empty stream. For properties
 * listed in {@code htmlFields}, each string value is parsed as HTML and replaced by the URIs
 * of the links found in it; a value that fails to parse contributes no references.
 *
 * @param property an entry from a ValueMap
 * @param htmlFields  list of properties containing html
 * @return stream containing extracted references
 */
static Stream<String> collectPaths(Map.Entry<String, Object> property, Set<String> htmlFields) {
    Object value = property.getValue();
    Stream<String> stream;
    // String is final, so instanceof matches exactly the same runtime types as comparing
    // classes would, and it additionally tolerates a null value instead of throwing.
    if (value instanceof String[]) {
        stream = Arrays.stream((String[]) value);
    } else if (value instanceof String) {
        stream = Stream.of((String) value);
    } else {
        stream = Stream.empty();
    }
    if (htmlFields.contains(property.getKey())) {
        stream = stream.flatMap(val -> {
            try {
                // parse html and extract links via underlying tagsoup library
                LinkContentHandler linkHandler = new LinkContentHandler();
                HtmlParser parser = new HtmlParser();
                // Charset constant instead of a charset name: no lookup by name and no
                // UnsupportedEncodingException to account for.
                byte[] html = val.getBytes(java.nio.charset.StandardCharsets.UTF_8);
                parser.parse(new ByteArrayInputStream(html), linkHandler, new Metadata(), new ParseContext());
                return linkHandler.getLinks().stream().map(Link::getUri);
            } catch (Exception e) {
                // Best effort: a fragment that cannot be parsed (or a null array element)
                // simply contributes no references.
                return Stream.empty();
            }
        });
    }
    return stream;
}
Also used : Arrays(java.util.Arrays) ResourceResolver(org.apache.sling.api.resource.ResourceResolver) ResourceUtil(org.apache.sling.api.resource.ResourceUtil) ProcessDefinition(com.adobe.acs.commons.mcp.ProcessDefinition) HashSet(java.util.HashSet) Metadata(org.apache.tika.metadata.Metadata) HtmlParser(org.apache.tika.parser.html.HtmlParser) RepositoryException(javax.jcr.RepositoryException) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) FormField(com.adobe.acs.commons.mcp.form.FormField) PersistenceException(org.apache.sling.api.resource.PersistenceException) Link(org.apache.tika.sax.Link) PathfieldComponent(com.adobe.acs.commons.mcp.form.PathfieldComponent) EnumMap(java.util.EnumMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Resource(org.apache.sling.api.resource.Resource) Set(java.util.Set) ActionManager(com.adobe.acs.commons.fam.ActionManager) Collectors(java.util.stream.Collectors) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) Serializable(java.io.Serializable) LoginException(org.apache.sling.api.resource.LoginException) List(java.util.List) GenericReport(com.adobe.acs.commons.mcp.model.GenericReport) Stream(java.util.stream.Stream) TreeFilteringResourceVisitor(com.adobe.acs.commons.util.visitors.TreeFilteringResourceVisitor) ParseContext(org.apache.tika.parser.ParseContext) CheckboxComponent(com.adobe.acs.commons.mcp.form.CheckboxComponent) Pattern(java.util.regex.Pattern) ProcessInstance(com.adobe.acs.commons.mcp.ProcessInstance) HtmlParser(org.apache.tika.parser.html.HtmlParser) ByteArrayInputStream(java.io.ByteArrayInputStream) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) RepositoryException(javax.jcr.RepositoryException) PersistenceException(org.apache.sling.api.resource.PersistenceException) LoginException(org.apache.sling.api.resource.LoginException)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)5 LinkContentHandler (org.apache.tika.sax.LinkContentHandler)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 ParseContext (org.apache.tika.parser.ParseContext)4 Link (org.apache.tika.sax.Link)3 Map (java.util.Map)2 ActionManager (com.adobe.acs.commons.fam.ActionManager)1 ProcessDefinition (com.adobe.acs.commons.mcp.ProcessDefinition)1 ProcessInstance (com.adobe.acs.commons.mcp.ProcessInstance)1 CheckboxComponent (com.adobe.acs.commons.mcp.form.CheckboxComponent)1 FormField (com.adobe.acs.commons.mcp.form.FormField)1 PathfieldComponent (com.adobe.acs.commons.mcp.form.PathfieldComponent)1 GenericReport (com.adobe.acs.commons.mcp.model.GenericReport)1 TreeFilteringResourceVisitor (com.adobe.acs.commons.util.visitors.TreeFilteringResourceVisitor)1 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 OutputStream (java.io.OutputStream)1