Search in sources :

Example 31 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class MagicDetectorTest method testDetectRegExOptions.

/**
 * Exercises a regex-based magic for HTML 4.01 doctypes. The pattern's
 * alternations list only the all-upper-case and all-lower-case keyword
 * spellings, so the first two samples are expected to be detected as
 * text/html while the mixed-case sample is expected to fall back to
 * octet-stream.
 */
@Test
public void testDetectRegExOptions() throws Exception {
    String regex = "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) "
            + "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}"
            + "(?:HTML|html) 4\\.01";

    // The three samples differ only in the casing of the DOCTYPE
    // declaration; the rest of the document is identical.
    String sharedTail = "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
            + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
            + "<BODY><P>Hello world!</BODY></HTML>";
    String upperCaseDoc = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"" + sharedTail;
    String lowerCaseDoc = "<!DOCTYPE html PUBLIC \"-//W3C//dtd html 4.01//EN\"" + sharedTail;
    String mixedCaseDoc = "<!DoCtYpE hTmL pUbLiC \"-//W3C//dTd HtMl 4.01//EN\"" + sharedTail;

    MediaType htmlType = new MediaType("text", "html");
    // NOTE(review): the fourth argument (true) is presumably the
    // "pattern is a regex" flag -- confirm against MagicDetector.
    Detector regexDetector =
            new MagicDetector(htmlType, regex.getBytes(US_ASCII), null, true, 0, 0);

    assertDetect(regexDetector, htmlType, upperCaseDoc);
    assertDetect(regexDetector, htmlType, lowerCaseDoc);
    // Mixed casing matches neither alternative of the pattern.
    assertDetect(regexDetector, MediaType.OCTET_STREAM, mixedCaseDoc);
}
Also used : MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test)

Example 32 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class MagicDetectorTest method testDetectStreamReadProblems.

/**
 * Verifies that the magic is still detected when the input stream does
 * not hand back the whole buffer in a single read.
 */
@Test
public void testDetectStreamReadProblems() throws Exception {
    byte[] magicBytes = "abcdefghijklmnopqrstuvwxyz0123456789".getBytes(US_ASCII);
    MediaType expectedType = new MediaType("application", "test");
    Detector magicDetector = new MagicDetector(expectedType, magicBytes, null, false, 0, 0);

    // RestrictiveInputStream deliberately prevents InputStream.read(...)
    // from reading the entire buffer in one go
    InputStream partialReads = new RestrictiveInputStream(magicBytes);
    MediaType detectedType = magicDetector.detect(partialReads, new Metadata());

    assertEquals(expectedType, detectedType);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test)

Example 33 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class MagicDetectorTest method testDetectString.

/**
 * Checks the string-flavoured magic types built by
 * {@code MagicDetector.parse}: "string", "unicodeLE", "unicodeBE" and
 * "stringignorecase", each searched within the "0:20" offset range.
 */
@Test
public void testDetectString() throws Exception {
    MediaType testMT = new MediaType("application", "test");
    String sample = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
    byte[] asciiSample = sample.getBytes(US_ASCII);

    // Regular string matching: at the very start, then at an inner offset
    Detector atStart = MagicDetector.parse(testMT, "string", "0:20", "abcd", null);
    assertDetect(atStart, testMT, asciiSample);
    Detector atOffset = MagicDetector.parse(testMT, "string", "0:20", "cdEFGh", null);
    assertDetect(atOffset, testMT, asciiSample);

    // UTF-16 strings, little endian first and then big endian
    Detector utf16Le = MagicDetector.parse(testMT, "unicodeLE", "0:20", "cdEFGh", null);
    assertDetect(utf16Le, testMT, sample.getBytes(UTF_16LE));
    Detector utf16Be = MagicDetector.parse(testMT, "unicodeBE", "0:20", "cdEFGh", null);
    assertDetect(utf16Be, testMT, sample.getBytes(UTF_16BE));

    // Case-insensitive matching: the magic's casing differs from the sample's
    Detector anyCase =
            MagicDetector.parse(testMT, "stringignorecase", "0:20", "BcDeFgHiJKlm", null);
    assertDetect(anyCase, testMT, asciiSample);
}
Also used : MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test)

Example 34 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class MockParser method getSupportedTypes.

/**
 * Reports the one media type this parser declares support for, built
 * via {@code MediaType.application("mock+xml")}.
 *
 * @param context parse context; not consulted by this implementation
 * @return a newly created, mutable set holding that single type
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    Set<MediaType> supported = new HashSet<>();
    supported.add(MediaType.application("mock+xml"));
    return supported;
}
Also used : MediaType(org.apache.tika.mime.MediaType) HashSet(java.util.HashSet)

Example 35 with MediaType

use of org.apache.tika.mime.MediaType in project tika by apache.

the class PackageParser method updateMediaType.

/**
 * Writes the media type detected for the archive stream into the
 * metadata's content type, unless a more specific type is already there.
 *
 * <p>Nothing is written when the detected type is octet-stream. Otherwise
 * the detected type replaces the current content type when that value is
 * missing, cannot be parsed, or is not a specialization of the detected
 * type according to the registry.
 *
 * @param ais archive stream whose media type is looked up via
 *            {@code getMediaType}
 * @param mediaTypeRegistry registry used for the specialization check
 * @param metadata metadata whose content type is read and possibly updated
 */
private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaTypeRegistry, Metadata metadata) {
    MediaType detected = getMediaType(ais);
    if (MediaType.OCTET_STREAM.equals(detected) == false && detected.equals(MediaType.OCTET_STREAM)) {
        // Detection produced nothing useful; leave the metadata untouched.
        return;
    }

    // See whether the user or an earlier step has already supplied a content type.
    String declaredRaw = metadata.get(CONTENT_TYPE);
    MediaType declared = (declaredRaw == null) ? null : MediaType.parse(declaredRaw);

    // Keep the incoming value only when it parsed and specializes the
    // detected type; in every other case the detected type wins.
    boolean keepDeclared = declared != null
            && mediaTypeRegistry.isSpecializationOf(declared, detected);
    if (!keepDeclared) {
        metadata.set(CONTENT_TYPE, detected.toString());
    }
}
Also used : MediaType(org.apache.tika.mime.MediaType)

Aggregations

MediaType (org.apache.tika.mime.MediaType) 88; Test (org.junit.Test) 28; Metadata (org.apache.tika.metadata.Metadata) 27; InputStream (java.io.InputStream) 23; TikaInputStream (org.apache.tika.io.TikaInputStream) 17; Parser (org.apache.tika.parser.Parser) 17; ParseContext (org.apache.tika.parser.ParseContext) 16; IOException (java.io.IOException) 15; TikaException (org.apache.tika.exception.TikaException) 13; CompositeParser (org.apache.tika.parser.CompositeParser) 13; ContentHandler (org.xml.sax.ContentHandler) 13; AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 12; BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 12; TikaTest (org.apache.tika.TikaTest) 10; Detector (org.apache.tika.detect.Detector) 10; HashSet (java.util.HashSet) 8; ByteArrayInputStream (java.io.ByteArrayInputStream) 7; TikaConfig (org.apache.tika.config.TikaConfig) 7; MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry) 7; ArrayList (java.util.ArrayList) 6