Search in sources :

Example 11 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class IWorkPackageParser method parse.

/**
 * Reads the given stream as a zip archive and parses every entry whose name
 * is listed in {@code IWORK_CONTENT_ENTRIES}.
 * <p>
 * For each such entry the document type is sniffed from at most the first
 * 4096 buffered bytes, the stream is rewound, and the entry is handed to the
 * SAX parser from the parse context with a type-specific content handler.
 * An {@code ENCRYPTED} entry yields an empty XHTML document, because no
 * content handler is available for it.
 *
 * @param stream   the raw container stream; it is wrapped but never closed here
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata metadata object that receives the detected content type
 * @param context  parse context supplying the SAX parser
 * @throws IOException   if reading from the archive fails
 * @throws SAXException  if SAX event processing fails
 * @throws TikaException if the detected type has no matching handler
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    ZipArchiveInputStream archive = new ZipArchiveInputStream(stream);
    for (ZipArchiveEntry current = archive.getNextZipEntry(); current != null; current = archive.getNextZipEntry()) {
        // Only the known iWork content entries are of interest.
        if (!IWORK_CONTENT_ENTRIES.contains(current.getName())) {
            continue;
        }
        // Buffer the entry so that type detection can peek and then rewind.
        InputStream buffered = new BufferedInputStream(archive, 4096);
        buffered.mark(4096);
        IWORKDocumentType detected = IWORKDocumentType.detectType(buffered);
        buffered.reset();
        if (detected == null) {
            continue;
        }
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        ContentHandler delegate = null;
        switch (detected) {
            case KEYNOTE:
                delegate = new KeynoteContentHandler(xhtml, metadata);
                break;
            case NUMBERS:
                delegate = new NumbersContentHandler(xhtml, metadata);
                break;
            case PAGES:
                delegate = new PagesContentHandler(xhtml, metadata);
                break;
            case ENCRYPTED:
                // We can't do anything for the file right now, so no delegate.
                break;
            default:
                throw new TikaException("Unhandled iWorks file " + detected);
        }
        metadata.add(Metadata.CONTENT_TYPE, detected.getType().toString());
        xhtml.startDocument();
        if (delegate != null) {
            // Shield the stream so the SAX parser cannot close the underlying zip.
            InputStream shielded = new CloseShieldInputStream(buffered);
            context.getSAXParser().parse(shielded, new OfflineContentHandler(delegate));
        }
        xhtml.endDocument();
    }
    // Don't close the zip InputStream (TIKA-1117).
}
Also used : TikaException(org.apache.tika.exception.TikaException) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) BufferedInputStream(java.io.BufferedInputStream) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) InputStream(java.io.InputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) ContentHandler(org.xml.sax.ContentHandler) ZipArchiveEntry(org.apache.commons.compress.archivers.zip.ZipArchiveEntry)

Example 12 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class HtmlParser method parse.

/**
 * Parses an HTML document with TagSoup after auto-detecting its character
 * encoding.
 * <p>
 * The content type stored in the metadata is rewritten to carry the detected
 * charset when it is absent or starts with one of the recognised HTML-like
 * types; any other existing value is left untouched. The detected charset
 * name is always written to {@code Metadata.CONTENT_ENCODING}.
 *
 * @param stream   the document stream; it is shielded, so it is not closed here
 * @param handler  receiver of the resulting SAX events
 * @param metadata metadata to read the declared content type from and update
 * @param context  parse context supplying the optional {@code HtmlMapper} and {@code Schema}
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if SAX event processing fails
 * @throws TikaException if encoding detection fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset detectedCharset = reader.getCharset();
        String declaredType = metadata.get(Metadata.CONTENT_TYPE);

        // Pick the base media type first; the charset parameter is attached afterwards.
        MediaType baseType = null;
        if (declaredType == null || declaredType.startsWith("text/html")) {
            baseType = MediaType.TEXT_HTML;
        } else if (declaredType.startsWith("application/xhtml+xml")) {
            baseType = XHTML;
        } else if (declaredType.startsWith("application/vnd.wap.xhtml+xml")) {
            baseType = WAP_XHTML;
        } else if (declaredType.startsWith("application/x-asp")) {
            baseType = X_ASP;
        }
        if (baseType != null) {
            MediaType typeWithCharset = new MediaType(baseType, detectedCharset);
            metadata.set(Metadata.CONTENT_TYPE, typeWithCharset.toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detectedCharset.name());

        // Get the HTML mapper from the parse context
        HtmlMapper htmlMapper = context.get(HtmlMapper.class, new HtmlParserMapper());

        // Parse the HTML document
        org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
        // Use schema from context or default
        Schema htmlSchema = context.get(Schema.class, HTML_SCHEMA);
        // TIKA-528: Reuse share schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, htmlSchema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        XHTMLDowngradeHandler downgradeHandler = new XHTMLDowngradeHandler(new HtmlHandler(htmlMapper, handler, metadata));
        tagSoup.setContentHandler(downgradeHandler);
        tagSoup.parse(reader.asInputSource());
    }
}
Also used : HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 13 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class SourceCodeParser method parse.

/**
 * Renders a source-code document as highlighted HTML and feeds that HTML to
 * the given handler through TagSoup.
 * <p>
 * Nothing is emitted unless the metadata carries both a content type and a
 * resource name. While reading, each line is passed to {@code parserAuthor}
 * and any non-null result is added as a creator; the total number of lines
 * is stored under the {@code "LoC"} metadata key.
 *
 * @param stream   the source file stream; it is shielded, so it is not closed here
 * @param handler  receiver of the SAX events produced from the highlighted HTML
 * @param metadata metadata to read the content type and resource name from and update
 * @param context  parse context supplying the optional {@code Schema}
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if SAX event processing fails
 * @throws TikaException if encoding detection fails
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (mediaType == null || name == null) {
            // Without both values there is nothing to render.
            return;
        }
        // NOTE(review): if MediaType.parse can return null for a malformed
        // value, the next line throws a NullPointerException - confirm.
        MediaType type = MediaType.parse(mediaType);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        // Look the separator up once instead of on every line, and append the
        // two pieces directly rather than concatenating a temporary String.
        final String lineSeparator = System.getProperty("line.separator");
        StringBuilder out = new StringBuilder();
        int nbLines = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            out.append(line).append(lineSeparator);
            String author = parserAuthor(line);
            if (author != null) {
                metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines++;
        }
        metadata.set("LoC", String.valueOf(nbLines));

        Renderer renderer = getRenderer(type.toString());
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);

        Schema schema = context.get(Schema.class, HTML_SCHEMA);
        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        parser.setContentHandler(handler);
        parser.parse(new InputSource(new StringReader(codeAsHtml)));
    }
}
Also used : InputSource(org.xml.sax.InputSource) HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) Renderer(com.uwyn.jhighlight.renderer.Renderer) StringReader(java.io.StringReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 14 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class Pkcs7Parser method parse.

/**
 * Unwraps PKCS#7 signed data and forwards the signed content to the parser
 * registered in the parse context (or {@code EmptyParser.INSTANCE} when none
 * is registered).
 *
 * @param stream   the PKCS#7 stream; it is shielded, so it is not closed here
 * @param handler  handler passed through to the delegate parser
 * @param metadata metadata passed through to the delegate parser
 * @param context  parse context supplying the delegate {@code Parser}
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the delegate parser reports a SAX failure
 * @throws TikaException if the signature is detached (no signed content),
 *                       the digest provider cannot be created, or the CMS
 *                       structure cannot be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try {
        JcaDigestCalculatorProviderBuilder providerBuilder = new JcaDigestCalculatorProviderBuilder().setProvider("BC");
        DigestCalculatorProvider digests = providerBuilder.build();
        InputStream shielded = new CloseShieldInputStream(stream);
        CMSSignedDataParser signedData = new CMSSignedDataParser(digests, shielded);
        try {
            CMSTypedStream signedContent = signedData.getSignedContent();
            if (signedContent == null) {
                throw new TikaException("cannot parse detached pkcs7 signature (no signed data to parse)");
            }
            try (InputStream payload = signedContent.getContentStream()) {
                Parser embeddedParser = context.get(Parser.class, EmptyParser.INSTANCE);
                embeddedParser.parse(payload, handler, metadata, context);
            }
        } finally {
            // Always release the CMS parser, even when delegation fails.
            signedData.close();
        }
    } catch (OperatorCreationException e) {
        throw new TikaException("Unable to create DigestCalculatorProvider", e);
    } catch (CMSException e) {
        throw new TikaException("Unable to parse pkcs7 signed data", e);
    }
}
Also used : CMSSignedDataParser(org.bouncycastle.cms.CMSSignedDataParser) TikaException(org.apache.tika.exception.TikaException) DigestCalculatorProvider(org.bouncycastle.operator.DigestCalculatorProvider) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) InputStream(java.io.InputStream) JcaDigestCalculatorProviderBuilder(org.bouncycastle.operator.jcajce.JcaDigestCalculatorProviderBuilder) OperatorCreationException(org.bouncycastle.operator.OperatorCreationException) CMSTypedStream(org.bouncycastle.cms.CMSTypedStream) Parser(org.apache.tika.parser.Parser) AbstractParser(org.apache.tika.parser.AbstractParser) EmptyParser(org.apache.tika.parser.EmptyParser) CMSException(org.bouncycastle.cms.CMSException)

Example 15 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class FeedParser method parse.

/**
 * Parses a syndication feed and emits it as XHTML: the feed title as an
 * {@code h1}, the feed description as a {@code p}, and a {@code ul} holding
 * one {@code li} per entry that has a link.
 * <p>
 * The tag-stripped title and description are also stored in the metadata.
 * Entries without a link are skipped.
 *
 * @param stream   the feed stream; it is shielded, so it is not closed here
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata metadata that receives the feed title and description
 * @param context  parse context (not used by this method)
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if SAX event processing fails
 * @throws TikaException if the feed cannot be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // set the encoding?
    try {
        SyndFeedInput feedInput = new SyndFeedInput();
        InputSource source = new InputSource(new CloseShieldInputStream(stream));
        SyndFeed feed = feedInput.build(source);

        String feedTitle = stripTags(feed.getTitleEx());
        String feedDescription = stripTags(feed.getDescriptionEx());
        metadata.set(TikaCoreProperties.TITLE, feedTitle);
        metadata.set(TikaCoreProperties.DESCRIPTION, feedDescription);

        // store the other fields in the metadata
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.element("h1", feedTitle);
        xhtml.element("p", feedDescription);
        xhtml.startElement("ul");
        for (Object item : feed.getEntries()) {
            SyndEntry feedEntry = (SyndEntry) item;
            String href = feedEntry.getLink();
            if (href == null) {
                // Entries without a link produce no list item.
                continue;
            }
            xhtml.startElement("li");
            xhtml.startElement("a", "href", href);
            xhtml.characters(stripTags(feedEntry.getTitleEx()));
            xhtml.endElement("a");
            SyndContent summary = feedEntry.getDescription();
            if (summary != null) {
                xhtml.newline();
                xhtml.characters(stripTags(summary));
            }
            xhtml.endElement("li");
        }
        xhtml.endElement("ul");
        xhtml.endDocument();
    } catch (FeedException e) {
        throw new TikaException("RSS parse error", e);
    }
}
Also used : SyndFeed(com.rometools.rome.feed.synd.SyndFeed) InputSource(org.xml.sax.InputSource) TikaException(org.apache.tika.exception.TikaException) SyndContent(com.rometools.rome.feed.synd.SyndContent) SyndFeedInput(com.rometools.rome.io.SyndFeedInput) SyndEntry(com.rometools.rome.feed.synd.SyndEntry) FeedException(com.rometools.rome.io.FeedException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 29; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13; TikaException (org.apache.tika.exception.TikaException): 12; OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler): 8; InputStream (java.io.InputStream): 7; TikaInputStream (org.apache.tika.io.TikaInputStream): 7; AutoDetectReader (org.apache.tika.detect.AutoDetectReader): 6; MediaType (org.apache.tika.mime.MediaType): 6; EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler): 5; Charset (java.nio.charset.Charset): 4; TikaConfig (org.apache.tika.config.TikaConfig): 4; SAXException (org.xml.sax.SAXException): 4; BufferedInputStream (java.io.BufferedInputStream): 3; EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor): 3; Metadata (org.apache.tika.metadata.Metadata): 3; TaggedContentHandler (org.apache.tika.sax.TaggedContentHandler): 3; InputSource (org.xml.sax.InputSource): 3; IOException (java.io.IOException): 2; SAXParser (javax.xml.parsers.SAXParser): 2; ZipArchiveEntry (org.apache.commons.compress.archivers.zip.ZipArchiveEntry): 2