Search in sources :

Example 11 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class ZipContainerDetector method detect.

/**
 * Detects the media type of a stream by peeking at up to 1024 leading bytes.
 *
 * <p>The bytes are first matched against archive formats. A zip archive that
 * arrived as a {@code TikaInputStream} is handed to {@code detectZipFormat};
 * any other recognised archive type is returned directly; otherwise the same
 * bytes are matched against compressor formats.
 *
 * @param input    the stream to inspect, may be {@code null}
 * @param metadata not read by this method
 * @return the detected type, or {@link MediaType#OCTET_STREAM} when
 *         {@code input} is {@code null}
 * @throws IOException if the stream cannot be read
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Without a stream there is nothing to sniff
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }

    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream stream = TikaInputStream.get(input, resources);

        // 1024 bytes is enough for all known formats
        byte[] header = new byte[1024];
        int headerLength = stream.peek(header);
        MediaType archiveType = detectArchiveFormat(header, headerLength);

        // Zip containers are only examined further when the caller supplied a TikaInputStream
        boolean zipOnTikaStream = PackageParser.isZipArchive(archiveType)
                && TikaInputStream.isTikaInputStream(input);
        if (zipOnTikaStream) {
            return detectZipFormat(stream);
        }

        // Not a known archive: fall back to the compressor signatures
        if (archiveType.equals(MediaType.OCTET_STREAM)) {
            return detectCompressorFormat(header, headerLength);
        }
        return archiveType;
    } finally {
        try {
            resources.dispose();
        } catch (TikaException ignored) {
            // cleanup is best-effort; a failure here must not mask the detection result
        }
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) MediaType(org.apache.tika.mime.MediaType)

Example 12 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class AutoDetectParserTest method testOggFlacAudio.

/**
 * Verifies that the Ogg Audio parsers (Vorbis, Opus, Flac etc) can be
 * instantiated, are registered with the composite parser, and produce the
 * expected content type, metadata and text for each sample file.
 */
@SuppressWarnings("deprecation")
@Test
public void testOggFlacAudio() throws Exception {
    // Four sample files carrying similar test data; sampleFiles[i] pairs with expectedTypes[i]
    String[] sampleFiles = { "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus" };
    MediaType[] expectedTypes = {
        MediaType.parse(OGG_VORBIS),
        MediaType.parse(FLAC_NATIVE),
        MediaType.parse(OGG_FLAC),
        MediaType.parse(OGG_OPUS)
    };

    // Each parser class must load and report a non-null set of supported types
    VorbisParser vorbis = new VorbisParser();
    assertNotNull("Parser not found for " + expectedTypes[0], vorbis.getSupportedTypes(new ParseContext()));
    FlacParser flac = new FlacParser();
    assertNotNull("Parser not found for " + expectedTypes[1], flac.getSupportedTypes(new ParseContext()));
    assertNotNull("Parser not found for " + expectedTypes[2], flac.getSupportedTypes(new ParseContext()));
    OpusParser opus = new OpusParser();
    assertNotNull("Parser not found for " + expectedTypes[3], opus.getSupportedTypes(new ParseContext()));

    // The configured composite parser must have an entry for every expected type
    CompositeParser composite = (CompositeParser) tika.getParser();
    for (MediaType expected : expectedTypes) {
        assertNotNull("Parser not found for " + expected, composite.getParsers().get(expected));
    }

    // Parse every sample through auto-detection and inspect the outcome
    for (int idx = 0; idx < sampleFiles.length; idx++) {
        String sample = sampleFiles[idx];
        String resourcePath = "/test-documents/" + sample;
        try (InputStream stream = AutoDetectParserTest.class.getResourceAsStream(resourcePath)) {
            if (stream == null) {
                fail("Could not find test file " + sample);
            }
            Metadata parsedMetadata = new Metadata();
            ContentHandler bodyHandler = new BodyContentHandler();
            AutoDetectParser autoParser = new AutoDetectParser(tika);
            autoParser.parse(stream, bodyHandler, parsedMetadata);

            assertEquals("Incorrect content type for " + sample,
                    expectedTypes[idx].toString(), parsedMetadata.get(Metadata.CONTENT_TYPE));

            // Common metadata, old style keys
            assertEquals("Test Artist", parsedMetadata.get(Metadata.AUTHOR));
            assertEquals("Test Title", parsedMetadata.get(Metadata.TITLE));
            // Common metadata, new style keys
            assertEquals("Test Artist", parsedMetadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Test Title", parsedMetadata.get(TikaCoreProperties.TITLE));

            // XMPDM metadata; the album is not checked for the Opus sample
            boolean isOpus = sample.endsWith(".opus");
            if (!isOpus) {
                assertEquals("Test Album", parsedMetadata.get(XMPDM.ALBUM));
            }
            assertEquals("Test Artist", parsedMetadata.get(XMPDM.ARTIST));
            assertEquals("Stereo", parsedMetadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
            assertEquals("44100", parsedMetadata.get(XMPDM.AUDIO_SAMPLE_RATE));

            // The extracted text should mention both the title and the artist
            String extractedText = bodyHandler.toString();
            assertTrue(extractedText.contains("Test Title"));
            assertTrue(extractedText.contains("Test Artist"));
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) VorbisParser(org.gagravarr.tika.VorbisParser) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ContentHandler(org.xml.sax.ContentHandler) MediaType(org.apache.tika.mime.MediaType) FlacParser(org.gagravarr.tika.FlacParser) OpusParser(org.gagravarr.tika.OpusParser) Test(org.junit.Test)

Example 13 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class TikaParserConfigTest method testMimeExcludeInclude.

/**
 * Loads the TIKA-1558 blacklist configuration and checks that the resulting
 * composite parser holds two decorated parsers: a DefaultParser whose
 * decorator hides PDF and JPEG, and an EmptyParser whose decorator claims PDF.
 */
@Test
public void testMimeExcludeInclude() throws Exception {
    MediaType pdf = MediaType.application("pdf");
    MediaType jpeg = MediaType.image("jpeg");

    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());
    Parser configured = config.getParser();

    // The configured parser is a composite of exactly two component parsers
    assertEquals(CompositeParser.class, configured.getClass());
    CompositeParser composite = (CompositeParser) configured;
    assertEquals(2, composite.getAllComponentParsers().size());

    // Both components are wrapped in a decorator
    assertTrue(composite.getAllComponentParsers().get(0) instanceof ParserDecorator);
    assertTrue(composite.getAllComponentParsers().get(1) instanceof ParserDecorator);
    ParserDecorator defaultDecorator = (ParserDecorator) composite.getAllComponentParsers().get(0);
    ParserDecorator emptyDecorator = (ParserDecorator) composite.getAllComponentParsers().get(1);

    // First component: a DefaultParser whose decorator excludes PDF and JPEG,
    // while the wrapped parser itself still supports both
    assertEquals(DefaultParser.class, defaultDecorator.getWrappedParser().getClass());
    assertNotContained(pdf, defaultDecorator.getSupportedTypes(context));
    assertContains(pdf, defaultDecorator.getWrappedParser().getSupportedTypes(context));
    assertNotContained(jpeg, defaultDecorator.getSupportedTypes(context));
    assertContains(jpeg, defaultDecorator.getWrappedParser().getSupportedTypes(context));

    // Second component: an EmptyParser whose decorator advertises only PDF,
    // which the wrapped parser itself does not list
    assertEquals(EmptyParser.class, emptyDecorator.getWrappedParser().getClass());
    assertEquals(1, emptyDecorator.getSupportedTypes(context).size());
    assertContains(pdf, emptyDecorator.getSupportedTypes(context));
    assertNotContained(pdf, emptyDecorator.getWrappedParser().getSupportedTypes(context));
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Example 14 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class Icu4jEncodingDetector method detect.

/**
 * Detects the character encoding of a stream with the ICU4J charset detector.
 *
 * <p>A declared encoding is taken from the {@code Content-Encoding} metadata
 * entry or, when that is absent, from the {@code charset} parameter of the
 * {@code Content-Type} entry. If it survives {@code CharsetUtils.clean} it is
 * passed to the detector as the declared encoding.
 *
 * @param input    the stream to analyse, may be {@code null}
 * @param metadata source of the declared encoding and content type
 * @return the first detector match that resolves to a {@link Charset}, or
 *         {@code null} when {@code input} is {@code null} or no match resolves
 * @throws IOException if the stream cannot be read
 */
public Charset detect(InputStream input, Metadata metadata) throws IOException {
    if (input == null) {
        return null;
    }

    CharsetDetector icuDetector = new CharsetDetector();
    String declaredCharset = metadata.get(Metadata.CONTENT_ENCODING);
    String declaredType = metadata.get(Metadata.CONTENT_TYPE);

    // TIKA-341: Use charset in content-type
    if (declaredCharset == null && declaredType != null) {
        MediaType parsedType = MediaType.parse(declaredType);
        if (parsedType != null) {
            declaredCharset = parsedType.getParameters().get("charset");
        }
    }

    // TODO: log a warning when a declared charset is present but cannot be cleaned?
    String usableCharset = (declaredCharset != null) ? CharsetUtils.clean(declaredCharset) : null;
    if (usableCharset != null) {
        icuDetector.setDeclaredEncoding(usableCharset);
    }

    // TIKA-341 without enabling input filtering (stripping of tags)
    // short HTML tests don't work well
    icuDetector.enableInputFilter(true);
    icuDetector.setText(input);

    // Return the first candidate whose name resolves; skip the ones that do not
    for (CharsetMatch candidate : icuDetector.detectAll()) {
        try {
            return CharsetUtils.forName(candidate.getName());
        } catch (Exception ignored) {
            // unresolvable charset name, try the next candidate
        }
    }
    return null;
}
Also used : MediaType(org.apache.tika.mime.MediaType) IOException(java.io.IOException)

Example 15 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class MediaTypeExample method describeMediaType.

/**
 * Parses the media type string {@code "text/plain; charset=UTF-8"} and prints
 * its type, its subtype and each of its parameters to standard output.
 */
public static void describeMediaType() {
    MediaType type = MediaType.parse("text/plain; charset=UTF-8");
    System.out.println("type:    " + type.getType());
    System.out.println("subtype: " + type.getSubtype());
    Map<String, String> parameters = type.getParameters();
    System.out.println("parameters:");
    // Iterate the entries directly rather than looking each key up again
    for (Map.Entry<String, String> parameter : parameters.entrySet()) {
        System.out.println("  " + parameter.getKey() + "=" + parameter.getValue());
    }
}
Also used : MediaType(org.apache.tika.mime.MediaType)

Aggregations

MediaType (org.apache.tika.mime.MediaType)88 Test (org.junit.Test)28 Metadata (org.apache.tika.metadata.Metadata)27 InputStream (java.io.InputStream)23 TikaInputStream (org.apache.tika.io.TikaInputStream)17 Parser (org.apache.tika.parser.Parser)17 ParseContext (org.apache.tika.parser.ParseContext)16 IOException (java.io.IOException)15 TikaException (org.apache.tika.exception.TikaException)13 CompositeParser (org.apache.tika.parser.CompositeParser)13 ContentHandler (org.xml.sax.ContentHandler)13 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)12 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)12 TikaTest (org.apache.tika.TikaTest)10 Detector (org.apache.tika.detect.Detector)10 HashSet (java.util.HashSet)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 TikaConfig (org.apache.tika.config.TikaConfig)7 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)7 ArrayList (java.util.ArrayList)6