Search in sources :

Example 1 with HtmlParser

use of org.apache.tika.parser.html.HtmlParser in project tika by apache.

the class TikaResource method createParser.

@SuppressWarnings("serial")
public static Parser createParser() {
    final Parser parser = new AutoDetectParser(tikaConfig);
    Map<MediaType, Parser> parsers = ((AutoDetectParser) parser).getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    ((AutoDetectParser) parser).setParsers(parsers);
    ((AutoDetectParser) parser).setFallback(new Parser() {

        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            return parser.getSupportedTypes(parseContext);
        }

        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    });
    if (digester != null) {
        return new DigestingParser(parser, digester);
    }
    return parser;
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) Set(java.util.Set) WebApplicationException(javax.ws.rs.WebApplicationException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) DigestingParser(org.apache.tika.parser.DigestingParser) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser)

Example 2 with HtmlParser

use of org.apache.tika.parser.html.HtmlParser in project tika by apache.

the class OutlookExtractor method parse.

public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException e) {
        }
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null)
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
            }
        } catch (ChunkNotFoundException he) {
        }
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment.getAttachData().getValue()), filename, null, null, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
    //You'd think you'd want to call msg.close().
    //Don't do that.  That closes down the file system.
    //If an msg has multiple msg attachments, some of them
    //can reside in the same file system.  After the first
    //child is read, the fs is closed, and the other children
    //get a java.nio.channels.ClosedChannelException
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) MAPIRtfAttribute(org.apache.poi.hmef.attribute.MAPIRtfAttribute) TikaException(org.apache.tika.exception.TikaException) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) Metadata(org.apache.tika.metadata.Metadata) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) ByteChunk(org.apache.poi.hsmf.datatypes.ByteChunk) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Chunk(org.apache.poi.hsmf.datatypes.Chunk) Date(java.util.Date) StringChunk(org.apache.poi.hsmf.datatypes.StringChunk) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) MboxParser(org.apache.tika.parser.mbox.MboxParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) RTFParser(org.apache.tika.parser.rtf.RTFParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseException(java.text.ParseException) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 3 with HtmlParser

use of org.apache.tika.parser.html.HtmlParser in project tika by apache.

the class TIAParsingExample method useHtmlParser.

public static void useHtmlParser() throws Exception {
    InputStream stream = new ByteArrayInputStream(new byte[0]);
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    Parser parser = new HtmlParser();
    parser.parse(stream, handler, metadata, context);
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) ByteArrayInputStream(java.io.ByteArrayInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 4 with HtmlParser

use of org.apache.tika.parser.html.HtmlParser in project tika by apache.

the class ChmParser method parse.

@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ChmExtractor chmExtractor = new ChmExtractor(stream);
    // metadata
    metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
    // content
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, context);
    if (htmlParser == null) {
        htmlParser = new HtmlParser();
    }
    for (DirectoryListingEntry entry : chmExtractor.getChmDirList().getDirectoryListingEntryList()) {
        final String entryName = entry.getName();
        if (entryName.endsWith(".html") || entryName.endsWith(".htm")) {
            //                AttributesImpl attrs = new AttributesImpl();
            //                attrs.addAttribute("", "name", "name", "String", entryName);
            //                xhtml.startElement("", "document", "document", attrs);
            byte[] data = chmExtractor.extractChmEntry(entry);
            parsePage(data, htmlParser, xhtml, context);
        //                xhtml.endElement("", "", "document");
        }
    }
    xhtml.endDocument();
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) ChmExtractor(org.apache.tika.parser.chm.core.ChmExtractor) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) Parser(org.apache.tika.parser.Parser) AbstractParser(org.apache.tika.parser.AbstractParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) DirectoryListingEntry(org.apache.tika.parser.chm.accessor.DirectoryListingEntry)

Example 5 with HtmlParser

use of org.apache.tika.parser.html.HtmlParser in project tika by apache.

the class TIAParsingExample method useCompositeParser.

public static void useCompositeParser() throws Exception {
    InputStream stream = new ByteArrayInputStream(new byte[0]);
    ContentHandler handler = new DefaultHandler();
    ParseContext context = new ParseContext();
    Map<MediaType, Parser> parsersByType = new HashMap<MediaType, Parser>();
    parsersByType.put(MediaType.parse("text/html"), new HtmlParser());
    parsersByType.put(MediaType.parse("application/xml"), new XMLParser());
    CompositeParser parser = new CompositeParser();
    parser.setParsers(parsersByType);
    parser.setFallback(new TXTParser());
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "text/html");
    parser.parse(stream, handler, metadata, context);
}
Also used : HashMap(java.util.HashMap) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) CompositeParser(org.apache.tika.parser.CompositeParser) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) TXTParser(org.apache.tika.parser.txt.TXTParser) MediaType(org.apache.tika.mime.MediaType) XMLParser(org.apache.tika.parser.xml.XMLParser)

Aggregations

HtmlParser (org.apache.tika.parser.html.HtmlParser)8 Metadata (org.apache.tika.metadata.Metadata)7 ParseContext (org.apache.tika.parser.ParseContext)6 Parser (org.apache.tika.parser.Parser)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 InputStream (java.io.InputStream)4 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)3 LinkContentHandler (org.apache.tika.sax.LinkContentHandler)3 FileInputStream (java.io.FileInputStream)2 Map (java.util.Map)2 Set (java.util.Set)2 GZIPInputStream (java.util.zip.GZIPInputStream)2 TikaInputStream (org.apache.tika.io.TikaInputStream)2 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)2 ColumnMetadata (org.talend.dataprep.api.dataset.ColumnMetadata)2 TDPException (org.talend.dataprep.exception.TDPException)2 ContentHandler (org.xml.sax.ContentHandler)2 ActionManager (com.adobe.acs.commons.fam.ActionManager)1 ProcessDefinition (com.adobe.acs.commons.mcp.ProcessDefinition)1 ProcessInstance (com.adobe.acs.commons.mcp.ProcessInstance)1