Search in sources :

Example 16 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class HtmlParser method parse.

/**
 * Parses an HTML document and forwards the resulting SAX events to the given handler.
 *
 * <p>The character encoding is auto-detected first. The detected charset is then attached to the
 * content type recorded in the metadata (only for the HTML-like types recognised below) and is
 * also stored as the content encoding. Finally the document is run through TagSoup.
 *
 * @param stream   the document to parse; it is wrapped in a {@code CloseShieldInputStream}
 *                 before being handed to the reader
 * @param handler  receiver of the SAX events produced for the document
 * @param metadata read for the incoming content type; updated with content type and encoding
 * @param context  source of the optional {@code HtmlMapper} and {@code Schema} overrides
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler or the underlying parser reports a SAX error
 * @throws TikaException declared by the parser contract
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Encoding detection happens while the reader is being constructed.
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset detectedCharset = reader.getCharset();

        // Choose the base media type from whatever content type was supplied by the caller.
        // An absent content type is treated as plain HTML; unrecognised types are left alone.
        String declaredType = metadata.get(Metadata.CONTENT_TYPE);
        MediaType baseType;
        if (declaredType == null || declaredType.startsWith("text/html")) {
            baseType = MediaType.TEXT_HTML;
        } else if (declaredType.startsWith("application/xhtml+xml")) {
            baseType = XHTML;
        } else if (declaredType.startsWith("application/vnd.wap.xhtml+xml")) {
            baseType = WAP_XHTML;
        } else if (declaredType.startsWith("application/x-asp")) {
            baseType = X_ASP;
        } else {
            baseType = null;
        }
        if (baseType != null) {
            MediaType typeWithCharset = new MediaType(baseType, detectedCharset);
            metadata.set(Metadata.CONTENT_TYPE, typeWithCharset.toString());
        }

        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detectedCharset.name());

        // Both the mapper and the schema may be overridden through the parse context.
        HtmlMapper htmlMapper = context.get(HtmlMapper.class, new HtmlParserMapper());
        org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
        Schema htmlSchema = context.get(Schema.class, HTML_SCHEMA);

        // TIKA-528: Reuse the shared schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, htmlSchema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        // Handler chain: TagSoup -> XHTMLDowngradeHandler -> HtmlHandler -> caller's handler.
        HtmlHandler htmlHandler = new HtmlHandler(htmlMapper, handler, metadata);
        tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));
        tagSoup.parse(reader.asInputSource());
    }
}
Also used : HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 17 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class SourceCodeParser method parse.

/**
 * Renders a source-code file as highlighted HTML and records basic metadata about it.
 *
 * <p>The character encoding is auto-detected. Only when both the content type and the resource
 * name are present in the metadata is the source processed: it is read line by line, every
 * author reported by {@code parserAuthor} is added as a creator, the number of lines is stored
 * under {@code "LoC"}, and the highlighted HTML is parsed with TagSoup into {@code handler}.
 * When either value is missing, nothing is written to the handler.
 *
 * @param stream   the source file; it is wrapped in a {@code CloseShieldInputStream} before
 *                 being handed to the reader
 * @param handler  receiver of the SAX events produced from the highlighted HTML
 * @param metadata read for content type and resource name; updated with content type,
 *                 content encoding, creators and {@code "LoC"}
 * @param context  source of the optional {@code Schema} override
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler or the underlying parser reports a SAX error
 * @throws TikaException declared by the parser contract
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (mediaType != null && name != null) {
            MediaType type = MediaType.parse(mediaType);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            // Resolve the separator once; looking it up and concatenating it to each line
            // inside the loop created a throwaway String per line.
            final String lineSeparator = System.getProperty("line.separator");
            StringBuilder out = new StringBuilder();
            String line;
            int nbLines = 0;
            while ((line = reader.readLine()) != null) {
                out.append(line).append(lineSeparator);
                String author = parserAuthor(line);
                if (author != null) {
                    metadata.add(TikaCoreProperties.CREATOR, author);
                }
                nbLines++;
            }
            metadata.set("LoC", String.valueOf(nbLines));

            Renderer renderer = getRenderer(type.toString());
            String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);

            // NOTE(review): the schema may be the shared HTML_SCHEMA, yet the TagSoup
            // ignore-bogons feature is not enabled here - confirm whether that is intentional.
            Schema schema = context.get(Schema.class, HTML_SCHEMA);
            org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
            parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
            parser.setContentHandler(handler);
            parser.parse(new InputSource(new StringReader(codeAsHtml)));
        }
    }
}
Also used : InputSource(org.xml.sax.InputSource) HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) Renderer(com.uwyn.jhighlight.renderer.Renderer) StringReader(java.io.StringReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 18 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class Pkcs7Parser method parse.

/**
 * Unwraps PKCS#7 signed data and passes the enclosed content to the delegate parser.
 *
 * <p>The delegate is the {@code Parser} registered in the parse context, falling back to
 * {@code EmptyParser.INSTANCE}. A signature without enclosed content (a detached signature) is
 * rejected with a {@code TikaException}.
 *
 * @param stream   the PKCS#7 data; it is wrapped in a {@code CloseShieldInputStream} before
 *                 being handed to the CMS parser
 * @param handler  passed through unchanged to the delegate parser
 * @param metadata passed through unchanged to the delegate parser
 * @param context  source of the delegate parser; also passed through to it
 * @throws IOException   if reading or closing the underlying streams fails
 * @throws SAXException  if the delegate parser reports a SAX error
 * @throws TikaException if no signed content is present, if the digest calculator provider
 *                       cannot be created, or if the signed data cannot be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try {
        // Digest calculations are delegated to the provider registered under the name "BC".
        DigestCalculatorProvider digestProvider = new JcaDigestCalculatorProviderBuilder().setProvider("BC").build();
        CMSSignedDataParser signedData = new CMSSignedDataParser(digestProvider, new CloseShieldInputStream(stream));
        try {
            CMSTypedStream signedContent = signedData.getSignedContent();
            if (signedContent == null) {
                throw new TikaException("cannot parse detached pkcs7 signature (no signed data to parse)");
            }
            // The content stream is closed as soon as the delegate has finished with it.
            try (InputStream payload = signedContent.getContentStream()) {
                Parser embeddedParser = context.get(Parser.class, EmptyParser.INSTANCE);
                embeddedParser.parse(payload, handler, metadata, context);
            }
        } finally {
            // Always release the CMS parser, whether or not the delegate succeeded.
            signedData.close();
        }
    } catch (CMSException e) {
        throw new TikaException("Unable to parse pkcs7 signed data", e);
    } catch (OperatorCreationException e) {
        throw new TikaException("Unable to create DigestCalculatorProvider", e);
    }
}
Also used : CMSSignedDataParser(org.bouncycastle.cms.CMSSignedDataParser) TikaException(org.apache.tika.exception.TikaException) DigestCalculatorProvider(org.bouncycastle.operator.DigestCalculatorProvider) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) InputStream(java.io.InputStream) JcaDigestCalculatorProviderBuilder(org.bouncycastle.operator.jcajce.JcaDigestCalculatorProviderBuilder) OperatorCreationException(org.bouncycastle.operator.OperatorCreationException) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) CMSTypedStream(org.bouncycastle.cms.CMSTypedStream) Parser(org.apache.tika.parser.Parser) AbstractParser(org.apache.tika.parser.AbstractParser) CMSSignedDataParser(org.bouncycastle.cms.CMSSignedDataParser) EmptyParser(org.apache.tika.parser.EmptyParser) CMSException(org.bouncycastle.cms.CMSException)

Example 19 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class FeedParser method parse.

/**
 * Parses a syndication feed and emits it as a simple XHTML document.
 *
 * <p>The feed title and description (with tags stripped) are stored in the metadata and written
 * as an {@code h1} and a {@code p} element. Every entry that has a link is then written as an
 * {@code li} containing an anchor with the entry title, followed by the entry description when
 * one exists. Entries without a link are skipped.
 *
 * @param stream   the feed; it is wrapped in a {@code CloseShieldInputStream} before being
 *                 handed to the feed reader
 * @param handler  receiver of the generated XHTML events
 * @param metadata updated with the feed title and description
 * @param context  not used by this method
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler reports a SAX error
 * @throws TikaException if the feed cannot be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // set the encoding?
    try {
        InputSource feedSource = new InputSource(new CloseShieldInputStream(stream));
        SyndFeed syndFeed = new SyndFeedInput().build(feedSource);

        String feedTitle = stripTags(syndFeed.getTitleEx());
        String feedDescription = stripTags(syndFeed.getDescriptionEx());
        metadata.set(TikaCoreProperties.TITLE, feedTitle);
        metadata.set(TikaCoreProperties.DESCRIPTION, feedDescription);

        // store the other fields in the metadata
        XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
        out.startDocument();
        out.element("h1", feedTitle);
        out.element("p", feedDescription);

        out.startElement("ul");
        for (Object item : syndFeed.getEntries()) {
            SyndEntry feedEntry = (SyndEntry) item;
            String href = feedEntry.getLink();
            if (href == null) {
                // Only entries that carry a link are listed.
                continue;
            }
            out.startElement("li");
            out.startElement("a", "href", href);
            out.characters(stripTags(feedEntry.getTitleEx()));
            out.endElement("a");
            SyndContent entryDescription = feedEntry.getDescription();
            if (entryDescription != null) {
                out.newline();
                out.characters(stripTags(entryDescription));
            }
            out.endElement("li");
        }
        out.endElement("ul");
        out.endDocument();
    } catch (FeedException e) {
        throw new TikaException("RSS parse error", e);
    }
}
Also used : SyndFeed(com.rometools.rome.feed.synd.SyndFeed) InputSource(org.xml.sax.InputSource) TikaException(org.apache.tika.exception.TikaException) SyndContent(com.rometools.rome.feed.synd.SyndContent) SyndFeedInput(com.rometools.rome.io.SyndFeedInput) SyndEntry(com.rometools.rome.feed.synd.SyndEntry) FeedException(com.rometools.rome.io.FeedException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 20 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class ImageParser method parse.

/**
 * Extracts image metadata through ImageIO and emits an empty XHTML document.
 *
 * <p>When the metadata carries a content type, the first ImageIO reader registered for that
 * MIME type (if any) is used to read the dimensions and the image metadata of image index 0.
 * A few ImageIO-specific metadata keys are then copied to general Tika properties. The XHTML
 * document is always started and ended, even when no content type is known.
 *
 * @param stream   the image data; it is wrapped in a {@code CloseShieldInputStream} before
 *                 being handed to ImageIO
 * @param handler  receiver of the (empty) XHTML document
 * @param metadata read for the content type; updated with dimensions and image metadata
 * @param context  not used by this method
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler reports a SAX error
 * @throws TikaException if ImageIO fails to read the image, except for the specific GIF
 *                       failure that is deliberately ignored below
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    String type = metadata.get(Metadata.CONTENT_TYPE);
    if (type != null) {
        // If the old BMP type was supplied, fix it up to the main one before asking ImageIO
        // for a reader, so Java is happy.
        if (OLD_BMP_TYPE.toString().equals(type)) {
            type = MAIN_BMP_TYPE.toString();
        }
        try {
            Iterator<ImageReader> iterator = ImageIO.getImageReadersByMIMEType(type);
            if (iterator.hasNext()) {
                ImageReader reader = iterator.next();
                // The image stream is closed first and the reader disposed afterwards,
                // whether or not reading succeeds.
                try (ImageInputStream imageStream = ImageIO.createImageInputStream(new CloseShieldInputStream(stream))) {
                    reader.setInput(imageStream);
                    // Query each dimension once so that both metadata keys for a dimension
                    // are guaranteed to carry the same value.
                    String width = Integer.toString(reader.getWidth(0));
                    metadata.set(Metadata.IMAGE_WIDTH, width);
                    String height = Integer.toString(reader.getHeight(0));
                    metadata.set(Metadata.IMAGE_LENGTH, height);
                    metadata.set("height", height);
                    metadata.set("width", width);
                    loadMetadata(reader.getImageMetadata(0), metadata);
                } finally {
                    reader.dispose();
                }
            }
            // Translate certain Metadata tags from the ImageIO
            //  specific namespace into the general Tika one
            setIfPresent(metadata, "CommentExtensions CommentExtension", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "markerSequence com", TikaCoreProperties.COMMENTS);
            setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
        } catch (IIOException e) {
            // The "Unexpected block type 0!" failure on GIF images is one
            //  which Tika will just ignore. Everything else is reported.
            boolean ignorableGifError = "Unexpected block type 0!".equals(e.getMessage()) && type.equals("image/gif");
            if (!ignorableGifError) {
                throw new TikaException(type + " parse error", e);
            }
        }
    }
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) ImageInputStream(javax.imageio.stream.ImageInputStream) IIOException(javax.imageio.IIOException) ImageReader(javax.imageio.ImageReader) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)28 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)13 TikaException (org.apache.tika.exception.TikaException)12 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)8 InputStream (java.io.InputStream)7 TikaInputStream (org.apache.tika.io.TikaInputStream)7 AutoDetectReader (org.apache.tika.detect.AutoDetectReader)6 MediaType (org.apache.tika.mime.MediaType)6 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 Charset (java.nio.charset.Charset)4 TikaConfig (org.apache.tika.config.TikaConfig)4 SAXException (org.xml.sax.SAXException)4 BufferedInputStream (java.io.BufferedInputStream)3 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)3 Metadata (org.apache.tika.metadata.Metadata)3 TaggedContentHandler (org.apache.tika.sax.TaggedContentHandler)3 InputSource (org.xml.sax.InputSource)3 IOException (java.io.IOException)2 SAXParser (javax.xml.parsers.SAXParser)2 ZipArchiveEntry (org.apache.commons.compress.archivers.zip.ZipArchiveEntry)2