use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaCLI method compareFileMagic.
/**
 * Compares our mime types registry with the File(1) tool's
 * directory of (uncompiled) Magic entries.
 * (Well, those with mimetypes anyway)
 * <p>
 * Every regular file directly inside the directory is scanned for
 * <code>!:mime</code> lines (including commented-out <code>#!:mime</code>
 * lines). Each mime type found is then looked up in the default Tika
 * configuration, and a report is printed to standard out listing the types
 * Tika does not know, and the types for which neither the type itself, its
 * direct children, nor any of its ancestors carry magic.
 *
 * @param magicDir Path to the magic directory
 * @throws IllegalArgumentException if the directory does not contain the
 *         expected marker files, or its contents cannot be listed
 * @throws Exception if one of the magic files cannot be read
 */
private void compareFileMagic(String magicDir) throws Exception {
    Set<String> tikaLacking = new TreeSet<String>();
    Set<String> tikaNoMagic = new TreeSet<String>();

    // Sanity check: a real magic source directory holds these well known files
    File dir = new File(magicDir);
    boolean looksPlausible = new File(dir, "elf").exists()
            && new File(dir, "mime").exists()
            && new File(dir, "vorbis").exists();
    if (!looksPlausible) {
        throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries");
    }

    // listFiles() returns null (rather than throwing) on an I/O error
    File[] magicFiles = dir.listFiles();
    if (magicFiles == null) {
        throw new IllegalArgumentException(magicDir + " could not be listed");
    }

    // Find all the mimetypes in the directory
    final String activePrefix = "!:mime";
    final String commentedPrefix = "#!:mime";
    Set<String> fileMimes = new HashSet<String>();
    for (File mf : magicFiles) {
        if (!mf.isFile()) {
            continue;
        }
        // try-with-resources so the reader is closed even if a read fails
        try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8))) {
            String line;
            while ((line = r.readLine()) != null) {
                String mime = null;
                if (line.startsWith(activePrefix)) {
                    mime = line.substring(activePrefix.length()).trim();
                } else if (line.startsWith(commentedPrefix)) {
                    mime = line.substring(commentedPrefix.length()).trim();
                }
                // A bare directive with no type after it carries no information
                if (mime != null && !mime.isEmpty()) {
                    fileMimes.add(mime);
                }
            }
        }
    }

    // See how those compare to the Tika ones
    TikaConfig config = TikaConfig.getDefaultConfig();
    MimeTypes mimeTypes = config.getMimeRepository();
    MediaTypeRegistry registry = config.getMediaTypeRegistry();
    for (String mime : fileMimes) {
        try {
            final MimeType type = mimeTypes.getRegisteredMimeType(mime);
            if (type == null) {
                // Tika doesn't know about this one
                tikaLacking.add(mime);
                continue;
            }

            // Tika knows about this one!
            // Does Tika have magic for it?
            boolean hasMagic = type.hasMagic();

            // How about the children?
            if (!hasMagic) {
                for (MediaType child : registry.getChildTypes(type.getType())) {
                    MimeType childType = mimeTypes.getRegisteredMimeType(child.toString());
                    if (childType != null && childType.hasMagic()) {
                        hasMagic = true;
                        break;
                    }
                }
            }

            // How about the parents? Walk up one level per iteration, always
            // asking for the supertype of the type we are currently looking at.
            MimeType parentType = type;
            while (parentType != null && !hasMagic) {
                if (parentType.hasMagic()) {
                    // Has magic, fine
                    hasMagic = true;
                } else {
                    // Check the parent next
                    MediaType parent = registry.getSupertype(parentType.getType());
                    boolean topLevel = parent == null
                            || MediaType.APPLICATION_XML.equals(parent)
                            || MediaType.TEXT_PLAIN.equals(parent)
                            || MediaType.OCTET_STREAM.equals(parent);
                    if (topLevel) {
                        // Stop checking parents if we hit a top level type
                        parentType = null;
                    } else {
                        parentType = mimeTypes.getRegisteredMimeType(parent.toString());
                    }
                }
            }

            if (!hasMagic) {
                tikaNoMagic.add(mime);
            }
        } catch (MimeTypeException ignored) {
            // Broken entry in the file magic directory
            // Deliberately skipped: a malformed name cannot be compared
        }
    }

    // Check how many tika knows about
    int tikaTypes = 0;
    int tikaAliases = 0;
    for (MediaType type : registry.getTypes()) {
        tikaTypes++;
        tikaAliases += registry.getAliases(type).size();
    }

    // Report
    System.out.println("Tika knows about " + tikaTypes + " unique mime types");
    System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases");
    System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
    System.out.println();
    System.out.println("The following mime types are known to File but not Tika:");
    for (String mime : tikaLacking) {
        System.out.println(" " + mime);
    }
    System.out.println();
    System.out.println("The following mime types from File have no Tika magic (but their children might):");
    for (String mime : tikaNoMagic) {
        System.out.println(" " + mime);
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaCLI method displayParser.
/**
 * Prints one parser, and recursively any parsers nested inside a composite,
 * to standard out.
 * <p>
 * A decorated parser is unwrapped first. In plain mode the decoration is
 * appended to the printed class name; in apt mode the class name is rendered
 * as an apt link into the API docs, composite parsers themselves are not
 * printed, and their children are printed without extra indentation.
 *
 * @param p the parser to display
 * @param includeMimeTypes whether to list the supported types of
 *        non-composite parsers beneath their name
 * @param apt whether to emit apt markup rather than plain text
 * @param i the indentation level handed to <code>indent</code>
 */
private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) {
    // Unwrap a decorator, remembering how the parser was wrapped
    Parser target = p;
    String wrapperNote = null;
    if (target instanceof ParserDecorator) {
        ParserDecorator decorator = (ParserDecorator) target;
        wrapperNote = " (Wrapped by " + decorator.getDecorationName() + ")";
        target = decorator.getWrappedParser();
    }

    final boolean composite = target instanceof CompositeParser;
    final String bullet = apt ? "* " : "";
    final String className = target.getClass().getName();

    // Build the label shown for this parser
    String label = className;
    if (apt) {
        int afterLastDot = className.lastIndexOf(".") + 1;
        String packagePart = className.substring(0, afterLastDot);
        String simplePart = className.substring(afterLastDot);
        label = packagePart + "{{{./api/" + className.replace(".", "/") + "}" + simplePart + "}}";
    } else if (wrapperNote != null) {
        label = className + wrapperNote;
    }

    // Don't display Composite parsers in the apt output.
    if (!apt || !composite) {
        String suffix = composite ? " (Composite Parser):" : "";
        System.out.println(indent(i) + bullet + label + suffix);
        if (apt) {
            System.out.println();
        }
        if (includeMimeTypes && !composite) {
            for (MediaType supported : target.getSupportedTypes(context)) {
                System.out.println(indent(i + 3) + bullet + supported);
                if (apt) {
                    System.out.println();
                }
            }
        }
    }

    if (!composite) {
        return;
    }

    // Don't indent for Composites in apt.
    final int childIndent = apt ? i : i + 3;
    Parser[] children = sortParsers(invertMediaTypeMap(((CompositeParser) target).getParsers()));
    for (Parser child : children) {
        displayParser(child, includeMimeTypes, apt, childIndent);
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class OOXMLParserTest method testExcelXLSB.
/**
 * Checks that an XLSB workbook is detected as the binary macro-enabled
 * Excel type, that the OOXML parser rather than the Office parser claims
 * that type, and that auto-detect parsing extracts the expected text.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String resource = "/test-documents/testEXCEL.xlsb";

    Detector typeDetector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();

    Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Should be detected correctly
    MediaType detected;
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resource)) {
        detected = typeDetector.detect(stream, metadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // OfficeParser won't handle it
    boolean claimedByOffice = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, claimedByOffice);

    // OOXMLParser will (soon) handle it
    boolean claimedByOoxml = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertTrue(claimedByOoxml);

    // AutoDetectParser doesn't break on it
    String extracted;
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(resource)) {
        ContentHandler body = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        autoParser.parse(stream, body, metadata, parseContext);
        extracted = body.toString();
    }
    assertContains("This is an example spreadsheet", extracted);
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class ParserDecoratorTest method withFallback.
/**
 * Testing one proposed implementation for TIKA-1509.
 * <p>
 * A fallback parser built over a failing parser followed by a working one
 * must advertise the types it was given and produce the working parser's
 * output; built over an empty-but-successful parser first, it must produce
 * no output at all.
 */
@Test
public void withFallback() throws Exception {
    final byte[] payload = new byte[] { 0, 1, 2, 3, 4 };

    Set<MediaType> octetOnly = Collections.singleton(MediaType.OCTET_STREAM);
    Set<MediaType> octetAndText = new HashSet<MediaType>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
    ParseContext context = new ParseContext();

    ErrorParser failing = new ErrorParser();
    DummyParser working = new DummyParser(octetOnly, new HashMap<String, String>(), "Fell back!");
    EmptyParser silent = new EmptyParser();

    // Create a combination which will fail first
    @SuppressWarnings("deprecation") Parser fallback = ParserDecorator.withFallbacks(Arrays.asList(failing, working), octetAndText);

    // Will claim to support the types given, not those on the child parsers
    Set<MediaType> advertised = fallback.getSupportedTypes(context);
    assertEquals(2, advertised.size());
    assertEquals(advertised.toString(), true, advertised.contains(MediaType.TEXT_PLAIN));
    assertEquals(advertised.toString(), true, advertised.contains(MediaType.OCTET_STREAM));

    // Parsing will make it to the second one
    Metadata firstMetadata = new Metadata();
    BodyContentHandler firstHandler = new BodyContentHandler();
    fallback.parse(new ByteArrayInputStream(payload), firstHandler, firstMetadata, context);
    assertEquals("Fell back!", firstHandler.toString());

    // With a parser that will work with no output, will get nothing
    fallback = ParserDecorator.withFallbacks(Arrays.asList(silent, working), octetAndText);
    Metadata secondMetadata = new Metadata();
    BodyContentHandler secondHandler = new BodyContentHandler();
    fallback.parse(new ByteArrayInputStream(payload), secondHandler, secondMetadata, context);
    assertEquals("", secondHandler.toString());
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class MagicDetectorTest method testDetectNull.
/**
 * A magic detector handed a null stream must report the generic
 * octet-stream type rather than its own configured type.
 */
@Test
public void testDetectNull() throws Exception {
    byte[] htmlMagic = "<html".getBytes(US_ASCII);
    Detector magicDetector = new MagicDetector(new MediaType("text", "html"), htmlMagic);

    MediaType detected = magicDetector.detect(null, new Metadata());

    assertEquals(MediaType.OCTET_STREAM, detected);
}
Aggregations