use of org.apache.tika.mime.MediaType in project tika by apache.
the class MagicDetectorTest method testDetectRegExOptions.
/**
 * Exercises a MagicDetector built from a regular-expression pattern for the
 * HTML 4.01 doctype declaration.
 *
 * The pattern lists the all-upper-case and all-lower-case spellings of each
 * keyword explicitly, so the first two documents are expected to be detected
 * as text/html while the mixed-case document is expected to fall through to
 * application/octet-stream.
 */
@Test
public void testDetectRegExOptions() throws Exception {
    String doctypeRegex = "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) "
            + "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}"
            + "(?:HTML|html) 4\\.01";

    // Everything after the doctype's public identifier is the same in all three documents.
    String remainder = "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
            + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
            + "<BODY><P>Hello world!</BODY></HTML>";

    String upperCaseDoc = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"" + remainder;
    String lowerCaseDoc = "<!DOCTYPE html PUBLIC \"-//W3C//dtd html 4.01//EN\"" + remainder;
    String mixedCaseDoc = "<!DoCtYpE hTmL pUbLiC \"-//W3C//dTd HtMl 4.01//EN\"" + remainder;

    MediaType htmlType = new MediaType("text", "html");
    byte[] regexBytes = doctypeRegex.getBytes(US_ASCII);
    Detector regexDetector = new MagicDetector(htmlType, regexBytes, null, true, 0, 0);

    assertDetect(regexDetector, htmlType, upperCaseDoc);
    assertDetect(regexDetector, htmlType, lowerCaseDoc);
    // Mixed case matches none of the spelled-out alternatives.
    assertDetect(regexDetector, MediaType.OCTET_STREAM, mixedCaseDoc);
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class MagicDetectorTest method testDetectStreamReadProblems.
/**
 * Checks that detection still succeeds when the input stream hands back its
 * content piecemeal rather than filling the read buffer in a single call.
 *
 * The same byte sequence serves as both the magic pattern and the stream
 * content.
 */
@Test
public void testDetectStreamReadProblems() throws Exception {
    byte[] magic = "abcdefghijklmnopqrstuvwxyz0123456789".getBytes(US_ASCII);
    MediaType expectedType = new MediaType("application", "test");
    Detector magicDetector = new MagicDetector(expectedType, magic, null, false, 0, 0);

    // RestrictiveInputStream deliberately keeps InputStream.read(...) from
    // delivering the whole buffer in one go.
    InputStream partialReads = new RestrictiveInputStream(magic);
    MediaType detectedType = magicDetector.detect(partialReads, new Metadata());

    assertEquals(expectedType, detectedType);
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class MagicDetectorTest method testDetectString.
/**
 * Exercises detectors created through MagicDetector.parse for the "string",
 * "unicodeLE", "unicodeBE" and "stringignorecase" magic types, each searching
 * the offset range "0:20" of the same sample text.
 */
@Test
public void testDetectString() throws Exception {
    String sample = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
    MediaType expectedType = new MediaType("application", "test");
    byte[] asciiBytes = sample.getBytes(US_ASCII);

    // Plain string matching: once at the very start, once a few bytes in.
    Detector prefixDetector = MagicDetector.parse(expectedType, "string", "0:20", "abcd", null);
    assertDetect(prefixDetector, expectedType, asciiBytes);

    Detector innerDetector = MagicDetector.parse(expectedType, "string", "0:20", "cdEFGh", null);
    assertDetect(innerDetector, expectedType, asciiBytes);

    // UTF-16 matching, little endian first and then big endian.
    Detector littleEndianDetector =
            MagicDetector.parse(expectedType, "unicodeLE", "0:20", "cdEFGh", null);
    assertDetect(littleEndianDetector, expectedType, sample.getBytes(UTF_16LE));

    Detector bigEndianDetector =
            MagicDetector.parse(expectedType, "unicodeBE", "0:20", "cdEFGh", null);
    assertDetect(bigEndianDetector, expectedType, sample.getBytes(UTF_16BE));

    // Case-insensitive matching: the pattern's casing differs from the sample's.
    Detector ignoreCaseDetector =
            MagicDetector.parse(expectedType, "stringignorecase", "0:20", "BcDeFgHiJKlm", null);
    assertDetect(ignoreCaseDetector, expectedType, asciiBytes);
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class MockParser method getSupportedTypes.
/**
 * Reports the media types this parser handles.
 *
 * @param context the parse context; not consulted by this implementation
 * @return a newly created set whose only element is application/mock+xml
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    Set<MediaType> supported = new HashSet<MediaType>();
    supported.add(MediaType.application("mock+xml"));
    return supported;
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class PackageParser method updateMediaType.
/**
 * Writes the media type detected for the archive stream into the metadata's
 * content type, unless the metadata already carries a type that the registry
 * reports as a specialization of the detected one.
 *
 * Nothing is written when the detected type is application/octet-stream.
 *
 * @param ais               the archive stream whose media type is looked up
 * @param mediaTypeRegistry the registry consulted for the specialization check
 * @param metadata          the metadata whose content type is read and possibly replaced
 */
private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaTypeRegistry, Metadata metadata) {
    MediaType detected = getMediaType(ais);
    if (!detected.equals(MediaType.OCTET_STREAM)) {
        // A content type may already have been supplied by the user or an earlier step.
        String declared = metadata.get(CONTENT_TYPE);
        MediaType declaredType = (declared == null) ? null : MediaType.parse(declared);

        // Keep the existing value only when it parsed and is a specialization
        // of the detected type; in every other case the detected type wins.
        boolean keepDeclared = declaredType != null
                && mediaTypeRegistry.isSpecializationOf(declaredType, detected);
        if (!keepDeclared) {
            metadata.set(CONTENT_TYPE, detected.toString());
        }
    }
}
Aggregations