use of org.apache.tika.mime.MediaType in project tika by apache.
the class ZipContainerDetector method detect.
/**
 * Determines the media type of a stream by peeking at its leading bytes.
 *
 * <p>The bytes are first checked with {@code detectArchiveFormat}. If that
 * reports a zip archive and {@code TikaInputStream.isTikaInputStream(input)}
 * holds, the result of {@code detectZipFormat} is returned. Any other
 * archive type that is not {@code application/octet-stream} is returned as
 * is; otherwise the answer comes from {@code detectCompressorFormat}.
 *
 * @param input    the stream to inspect; may be {@code null}
 * @param metadata document metadata (not consulted by this method)
 * @return the detected type, or {@link MediaType#OCTET_STREAM} when
 *         {@code input} is {@code null}
 * @throws IOException if the stream cannot be read
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Nothing to look at without a stream
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }

    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(input, tmp);

        // A 1024 byte header is enough for all known formats
        byte[] header = new byte[1024];
        int headerLength = tis.peek(header);

        MediaType archiveType = detectArchiveFormat(header, headerLength);

        // The zip-specific detection is only attempted when both checks pass
        boolean inspectZip = PackageParser.isZipArchive(archiveType)
                && TikaInputStream.isTikaInputStream(input);
        if (inspectZip) {
            return detectZipFormat(tis);
        }

        // No archive format recognised: fall back to the compressor formats
        if (archiveType.equals(MediaType.OCTET_STREAM)) {
            return detectCompressorFormat(header, headerLength);
        }
        return archiveType;
    } finally {
        try {
            tmp.dispose();
        } catch (TikaException e) {
            // ignore: a cleanup failure must not mask the detection result
        }
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class AutoDetectParserTest method testOggFlacAudio.
/**
 * Verifies that the Ogg family of audio parsers (Vorbis, Flac, Opus) can be
 * instantiated, are registered with the composite parser, and produce the
 * expected content type, metadata and text for each sample file.
 */
@SuppressWarnings("deprecation")
@Test
public void testOggFlacAudio() throws Exception {
    // The four sample files carry similar tag data; the two arrays are parallel
    String[] testFiles = { "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus" };
    MediaType[] mediaTypes = { MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE), MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS) };

    // Each parser class must load and report a set of supported types
    VorbisParser vorbis = new VorbisParser();
    assertNotNull("Parser not found for " + mediaTypes[0], vorbis.getSupportedTypes(new ParseContext()));
    FlacParser flac = new FlacParser();
    assertNotNull("Parser not found for " + mediaTypes[1], flac.getSupportedTypes(new ParseContext()));
    assertNotNull("Parser not found for " + mediaTypes[2], flac.getSupportedTypes(new ParseContext()));
    OpusParser opus = new OpusParser();
    assertNotNull("Parser not found for " + mediaTypes[3], opus.getSupportedTypes(new ParseContext()));

    // The composite parser must have an entry for every expected type
    CompositeParser composite = (CompositeParser) tika.getParser();
    for (MediaType expectedType : mediaTypes) {
        assertNotNull("Parser not found for " + expectedType, composite.getParsers().get(expectedType));
    }

    // Parse every sample file and verify what comes out
    int index = 0;
    for (String file : testFiles) {
        MediaType expectedType = mediaTypes[index++];
        try (InputStream input = AutoDetectParserTest.class.getResourceAsStream("/test-documents/" + file)) {
            if (input == null) {
                fail("Could not find test file " + file);
            }
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser(tika).parse(input, handler, metadata);

            assertEquals("Incorrect content type for " + file, expectedType.toString(), metadata.get(Metadata.CONTENT_TYPE));

            // Common metadata, old style keys
            assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
            assertEquals("Test Title", metadata.get(Metadata.TITLE));

            // Common metadata, new style keys
            assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));

            // XMPDM metadata; the album is not checked for the Opus sample
            boolean isOpus = file.endsWith(".opus");
            if (!isOpus) {
                assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
            }
            assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
            assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
            assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));

            // The extracted text should mention the title and the artist
            String text = handler.toString();
            assertTrue(text.contains("Test Title"));
            assertTrue(text.contains("Test Artist"));
        }
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaParserConfigTest method testMimeExcludeInclude.
/**
 * Loads the TIKA-1558 blacklist configuration and checks the resulting
 * parser layout: a composite of two decorated parsers, where the first
 * wraps a DefaultParser with PDF and JPEG excluded and the second wraps an
 * EmptyParser that is advertised for PDF only.
 */
@Test
public void testMimeExcludeInclude() throws Exception {
    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());
    Parser configuredParser = config.getParser();

    MediaType pdfType = MediaType.application("pdf");
    MediaType jpegType = MediaType.image("jpeg");

    // The top level parser is a composite holding exactly two parsers
    assertEquals(CompositeParser.class, configuredParser.getClass());
    CompositeParser composite = (CompositeParser) configuredParser;
    assertEquals(2, composite.getAllComponentParsers().size());

    // Each of the two is wrapped in a decorator
    assertTrue(composite.getAllComponentParsers().get(0) instanceof ParserDecorator);
    assertTrue(composite.getAllComponentParsers().get(1) instanceof ParserDecorator);
    ParserDecorator defaultDecorator = (ParserDecorator) composite.getAllComponentParsers().get(0);
    ParserDecorator emptyDecorator = (ParserDecorator) composite.getAllComponentParsers().get(1);

    // First entry: DefaultParser, with PDF and JPEG hidden by the decorator
    // while the wrapped parser itself still reports both
    assertEquals(DefaultParser.class, defaultDecorator.getWrappedParser().getClass());
    assertNotContained(pdfType, defaultDecorator.getSupportedTypes(context));
    assertContains(pdfType, defaultDecorator.getWrappedParser().getSupportedTypes(context));
    assertNotContained(jpegType, defaultDecorator.getSupportedTypes(context));
    assertContains(jpegType, defaultDecorator.getWrappedParser().getSupportedTypes(context));

    // Second entry: EmptyParser, exposed for PDF only through the decorator
    assertEquals(EmptyParser.class, emptyDecorator.getWrappedParser().getClass());
    assertEquals(1, emptyDecorator.getSupportedTypes(context).size());
    assertContains(pdfType, emptyDecorator.getSupportedTypes(context));
    assertNotContained(pdfType, emptyDecorator.getWrappedParser().getSupportedTypes(context));
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class Icu4jEncodingDetector method detect.
/**
 * Detects the character encoding of a stream with the ICU4J based
 * {@code CharsetDetector}.
 *
 * <p>A declared encoding is passed to the detector when one can be found:
 * the {@code Content-Encoding} metadata value is used first, falling back
 * to the {@code charset} parameter of the {@code Content-Type} value. The
 * first detector match whose name resolves through
 * {@code CharsetUtils.forName} is returned.
 *
 * @param input    the stream to analyse; may be {@code null}
 * @param metadata source of the declared encoding and content type
 * @return the detected charset, or {@code null} if {@code input} is
 *         {@code null} or no match could be resolved
 * @throws IOException if the stream cannot be read
 */
public Charset detect(InputStream input, Metadata metadata) throws IOException {
    if (input == null) {
        return null;
    }

    CharsetDetector icuDetector = new CharsetDetector();

    String declaredCharset = metadata.get(Metadata.CONTENT_ENCODING);
    String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    if (declaredCharset == null && declaredType != null) {
        // TIKA-341: Use charset in content-type
        MediaType parsedType = MediaType.parse(declaredType);
        if (parsedType != null) {
            declaredCharset = parsedType.getParameters().get("charset");
        }
    }

    // TODO: log a warning when a declared charset cannot be cleaned?
    String cleanedCharset = (declaredCharset == null) ? null : CharsetUtils.clean(declaredCharset);
    if (cleanedCharset != null) {
        icuDetector.setDeclaredEncoding(cleanedCharset);
    }

    // TIKA-341 without enabling input filtering (stripping of tags)
    // short HTML tests don't work well
    icuDetector.enableInputFilter(true);
    icuDetector.setText(input);

    CharsetMatch[] candidates = icuDetector.detectAll();
    for (CharsetMatch candidate : candidates) {
        try {
            return CharsetUtils.forName(candidate.getName());
        } catch (Exception e) {
            // ignore: this name could not be resolved, try the next match
        }
    }
    return null;
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class MediaTypeExample method describeMediaType.
/**
 * Parses the media type string {@code "text/plain; charset=UTF-8"} and
 * prints its type, subtype and each of its parameters to standard output.
 */
public static void describeMediaType() {
    MediaType parsed = MediaType.parse("text/plain; charset=UTF-8");

    System.out.println("type: " + parsed.getType());
    System.out.println("subtype: " + parsed.getSubtype());

    Map<String, String> parameters = parsed.getParameters();
    System.out.println("parameters:");
    // Walk the entries directly rather than looking each key up again
    for (Map.Entry<String, String> parameter : parameters.entrySet()) {
        System.out.println(" " + parameter.getKey() + "=" + parameter.getValue());
    }
}
Aggregations