Search in sources :

Example 71 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

The getParsers method of the DefaultParser class.

/**
 * Returns the parser map of the superclass, overlaid with the parsers that the
 * service loader discovers dynamically.
 *
 * @param context parse context handed to the superclass and to each dynamic
 *                parser when asking for its supported types
 * @return the map obtained from {@code super.getParsers(context)}, with one entry
 *         added or replaced per normalized media type of every dynamic parser
 */
@Override
public Map<MediaType, Parser> getParsers(ParseContext context) {
    Map<MediaType, Parser> parsersByType = super.getParsers(context);
    if (loader == null) {
        // Without a service loader there is nothing dynamic to add
        return parsersByType;
    }

    // Dynamic parser services always override the static ones
    MediaTypeRegistry typeRegistry = getMediaTypeRegistry();
    List<Parser> dynamicParsers = loader.loadDynamicServiceProviders(Parser.class);

    // Best parser last: entries put later replace earlier ones for the same type,
    // so after reversing, the parser that was listed first wins
    Collections.reverse(dynamicParsers);
    for (Parser dynamicParser : dynamicParsers) {
        for (MediaType supportedType : dynamicParser.getSupportedTypes(context)) {
            MediaType normalizedType = typeRegistry.normalize(supportedType);
            parsersByType.put(normalizedType, dynamicParser);
        }
    }
    return parsersByType;
}
Also used : MediaType(org.apache.tika.mime.MediaType) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry)

Example 72 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

The parse method of the HtmlParser class.

/**
 * Parses an HTML stream with TagSoup after auto-detecting its character encoding.
 * <p>
 * The content type in {@code metadata} is rewritten to include the detected
 * charset when it is absent or starts with one of the recognised HTML-family
 * types; any other existing content type is left untouched.
 *
 * @param stream   the document stream; it is wrapped in a close shield before reading
 * @param handler  receiver of the SAX events produced by the parse
 * @param metadata document metadata, read for the content type and updated with
 *                 the content type and content encoding
 * @param context  parse context, consulted for an {@code HtmlMapper} and a {@code Schema}
 * @throws IOException   declared by the signature; propagated from the calls made here
 * @throws SAXException  declared by the signature; propagated from the calls made here
 * @throws TikaException declared by the signature; propagated from the calls made here
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset detectedCharset = reader.getCharset();
        String declaredType = metadata.get(Metadata.CONTENT_TYPE);

        // Choose the base type first; null means "leave the declared type alone"
        MediaType baseType;
        if (declaredType == null || declaredType.startsWith("text/html")) {
            baseType = MediaType.TEXT_HTML;
        } else if (declaredType.startsWith("application/xhtml+xml")) {
            baseType = XHTML;
        } else if (declaredType.startsWith("application/vnd.wap.xhtml+xml")) {
            baseType = WAP_XHTML;
        } else if (declaredType.startsWith("application/x-asp")) {
            baseType = X_ASP;
        } else {
            baseType = null;
        }
        if (baseType != null) {
            MediaType typeWithCharset = new MediaType(baseType, detectedCharset);
            metadata.set(Metadata.CONTENT_TYPE, typeWithCharset.toString());
        }
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, detectedCharset.name());

        // Get the HTML mapper from the parse context
        HtmlMapper htmlMapper = context.get(HtmlMapper.class, new HtmlParserMapper());

        // Parse the HTML document
        org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();
        // Use schema from context or default
        Schema htmlSchema = context.get(Schema.class, HTML_SCHEMA);
        // TIKA-528: Reuse share schema to avoid heavy instantiation
        tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, htmlSchema);
        // TIKA-599: Shared schema is thread-safe only if bogons are ignored
        tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

        HtmlHandler htmlHandler = new HtmlHandler(htmlMapper, handler, metadata);
        tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));
        tagSoup.parse(reader.asInputSource());
    }
}
Also used : HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 73 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

The parse method of the SourceCodeParser class.

/**
 * Reads a source-code document, renders it as highlighted HTML and feeds that
 * HTML through TagSoup into {@code handler}.
 * <p>
 * Nothing beyond charset detection happens unless {@code metadata} carries both
 * a content type and a resource name. When it does, the content type is
 * re-serialised, the content encoding is set, every author returned by
 * {@code parserAuthor} is added as a creator, and the number of lines read is
 * stored under the {@code "LoC"} key.
 *
 * @param stream   the document stream; it is wrapped in a close shield before reading
 * @param handler  receiver of the SAX events for the highlighted HTML
 * @param metadata document metadata, read for content type and resource name and
 *                 updated as described above
 * @param context  parse context, consulted for a {@code Schema}
 * @throws IOException   declared by the signature; propagated from the calls made here
 * @throws SAXException  declared by the signature; propagated from the calls made here
 * @throws TikaException declared by the signature; propagated from the calls made here
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (mediaType != null && name != null) {
            MediaType type = MediaType.parse(mediaType);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            // Loop-invariant: resolve the separator once instead of doing a
            // system-property lookup for every line of the document
            String lineSeparator = System.lineSeparator();
            StringBuilder out = new StringBuilder();
            String line;
            int nbLines = 0;
            while ((line = reader.readLine()) != null) {
                // Chained appends avoid building a temporary "line + separator" string
                out.append(line).append(lineSeparator);
                String author = parserAuthor(line);
                if (author != null) {
                    metadata.add(TikaCoreProperties.CREATOR, author);
                }
                nbLines++;
            }
            metadata.set("LoC", String.valueOf(nbLines));

            // Highlight the collected source as HTML
            Renderer renderer = getRenderer(type.toString());
            String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);

            // Use schema from context or default, then parse the generated HTML
            Schema schema = context.get(Schema.class, HTML_SCHEMA);
            org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
            parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
            parser.setContentHandler(handler);
            parser.parse(new InputSource(new StringReader(codeAsHtml)));
        }
    }
}
Also used : InputSource(org.xml.sax.InputSource) HTMLSchema(org.ccil.cowan.tagsoup.HTMLSchema) Schema(org.ccil.cowan.tagsoup.Schema) Charset(java.nio.charset.Charset) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) Renderer(com.uwyn.jhighlight.renderer.Renderer) StringReader(java.io.StringReader) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 74 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

The parse method of the EnviHeaderParser class.

/**
 * Emits the lines of an ENVI header as XHTML paragraphs.
 * <p>
 * The content type in {@code metadata} is set to {@code ENVI_MIME_TYPE} and the
 * content encoding to the name of the auto-detected charset. Each line read
 * from the stream becomes one {@code <p>} element.
 *
 * @param stream   the document stream; it is wrapped in a close shield before reading
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata document metadata, updated with content type and content encoding
 * @param context  parse context, passed on to {@code getEncodingDetector}
 * @throws IOException   declared by the signature; propagated from the calls made here
 * @throws SAXException  declared by the signature; propagated from the calls made here
 * @throws TikaException declared by the signature; propagated from the calls made here
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);
    // The following code was taken from the TXTParser
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // text contents of the xhtml
        String line;
        while ((line = reader.readLine()) != null) {
            xhtml.startElement("p");
            xhtml.characters(line);
            xhtml.endElement("p");
        }
        xhtml.endDocument();
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) Charset(java.nio.charset.Charset) MediaType(org.apache.tika.mime.MediaType) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 75 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

The addParser method of the TikaConfigSerializer class.

/**
 * Appends a {@code <parser>} element for {@code parser} to {@code rootElement},
 * with {@code <mime>} and {@code <mime-exclude>} children.
 * <p>
 * With a decorator, {@code <mime>} lists the types the decorator supports but
 * the parser does not, and {@code <mime-exclude>} lists the types the parser
 * supports but the decorator does not. Without a decorator, {@code <mime>}
 * lists all of the parser's types in {@code Mode.STATIC_FULL}; in any other
 * mode no type children are written.
 *
 * @param mode        serialisation mode; only consulted when there is no decorator
 * @param rootElement element that receives the new {@code <parser>} child
 * @param doc         document used to create elements and text nodes
 * @param parser      parser whose class name and supported types are written
 * @param decorator   decorator applied to the parser, or {@code null} for none
 * @return the newly created {@code <parser>} element
 * @throws Exception declared by the signature; propagated from the calls made here
 */
private static Element addParser(Mode mode, Element rootElement, Document doc, Parser parser, ParserDecorator decorator) throws Exception {
    ParseContext context = new ParseContext();
    Set<MediaType> includedTypes = new TreeSet<>();
    Set<MediaType> excludedTypes = new TreeSet<>();

    if (decorator != null) {
        Set<MediaType> decoratedTypes = new TreeSet<>(decorator.getSupportedTypes(context));
        includedTypes.addAll(decoratedTypes);
        for (MediaType parserType : parser.getSupportedTypes(context)) {
            // The parser handles this type natively, so it is never an addition
            includedTypes.remove(parserType);
            if (!decoratedTypes.contains(parserType)) {
                // The decorator no longer offers a type the parser supports
                excludedTypes.add(parserType);
            }
        }
    } else if (mode == Mode.STATIC_FULL) {
        includedTypes.addAll(parser.getSupportedTypes(context));
    }

    Element parserElement = doc.createElement("parser");
    parserElement.setAttribute("class", parser.getClass().getCanonicalName());
    rootElement.appendChild(parserElement);

    appendMediaTypeChildren(doc, parserElement, "mime", includedTypes);
    appendMediaTypeChildren(doc, parserElement, "mime-exclude", excludedTypes);
    return parserElement;
}

/** Appends one {@code tagName} child holding the type's string form for each media type. */
private static void appendMediaTypeChildren(Document doc, Element parent, String tagName, Set<MediaType> types) {
    for (MediaType type : types) {
        Element child = doc.createElement(tagName);
        child.appendChild(doc.createTextNode(type.toString()));
        parent.appendChild(child);
    }
}
Also used : TreeSet(java.util.TreeSet) Element(org.w3c.dom.Element) ParseContext(org.apache.tika.parser.ParseContext) MediaType(org.apache.tika.mime.MediaType)

Aggregations

MediaType (org.apache.tika.mime.MediaType): 88, Test (org.junit.Test): 28, Metadata (org.apache.tika.metadata.Metadata): 27, InputStream (java.io.InputStream): 23, TikaInputStream (org.apache.tika.io.TikaInputStream): 17, Parser (org.apache.tika.parser.Parser): 17, ParseContext (org.apache.tika.parser.ParseContext): 16, IOException (java.io.IOException): 15, TikaException (org.apache.tika.exception.TikaException): 13, CompositeParser (org.apache.tika.parser.CompositeParser): 13, ContentHandler (org.xml.sax.ContentHandler): 13, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 12, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 12, TikaTest (org.apache.tika.TikaTest): 10, Detector (org.apache.tika.detect.Detector): 10, HashSet (java.util.HashSet): 8, ByteArrayInputStream (java.io.ByteArrayInputStream): 7, TikaConfig (org.apache.tika.config.TikaConfig): 7, MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry): 7, ArrayList (java.util.ArrayList): 6