use of org.apache.tika.detect.DefaultDetector in project tika by apache.
the class TikaDetectorConfigTest method testDetectorExcludeFromDefault.
@Test
public void testDetectorExcludeFromDefault() throws Exception {
    // Load the configuration under test; both a parser and a detector must be present
    TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    // The configured detector is a composite wrapping exactly two children
    CompositeDetector composite = (CompositeDetector) config.getDetector();
    assertEquals(2, composite.getDetectors().size());

    // Child order matters: DefaultDetector first, EmptyDetector second
    assertEquals(DefaultDetector.class, composite.getDetectors().get(0).getClass());
    assertEquals(EmptyDetector.class, composite.getDetectors().get(1).getClass());

    // The DefaultDetector as built from the config file
    DefaultDetector configured = (DefaultDetector) composite.getDetectors().get(0);

    // A freshly constructed DefaultDetector, for comparison
    DefaultDetector fresh = new DefaultDetector(config.getMimeRepository());

    // The fresh one offers the Zip and POIFS detectors...
    assertDetectors(fresh, true, true);

    // ...whereas the configured one does not, since the config excludes them
    assertDetectors(configured, false, false);
}
use of org.apache.tika.detect.DefaultDetector in project tika by apache.
the class TikaConfigSerializer method addDetectors.
/**
 * Appends the detector section of {@code config} to {@code rootElement}.
 *
 * <ul>
 * <li>In {@code Mode.MINIMAL} with a {@code DefaultDetector}, only an XML comment
 * containing an example is written and no {@code <detectors>} element is added.</li>
 * <li>In {@code Mode.CURRENT} with a {@code DefaultDetector}, or whenever the detector is
 * not a {@code CompositeDetector}, a single {@code <detector>} element naming the
 * detector's own class is written.</li>
 * <li>Otherwise one {@code <detector>} element is written per child of the composite.</li>
 * </ul>
 *
 * @param mode        serialization mode selecting how much detail is written
 * @param rootElement element that receives the comment or the {@code <detectors>} element
 * @param doc         document used to create the new nodes
 * @param config      configuration whose detector is serialized
 * @throws Exception declared by the original signature; nothing in this body throws a
 *                   checked exception directly
 */
private static void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
    Detector detector = config.getDetector();
    if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
        // Don't output anything, all using defaults. The hint left behind must itself be
        // a well-formed snippet that names a real detector class, so that it can be
        // copied straight into a config file.
        Node detComment = doc.createComment("for example: <detectors><detector class=\"org.apache.tika.mime.MimeTypes\"/></detectors>");
        rootElement.appendChild(detComment);
        return;
    }
    Element detectorsElement = doc.createElement("detectors");
    // Named explicitly so the && / || precedence of the original condition is obvious
    boolean currentDefault = mode == Mode.CURRENT && detector instanceof DefaultDetector;
    if (currentDefault || !(detector instanceof CompositeDetector)) {
        // Emit the detector itself rather than expanding it
        // NOTE(review): getCanonicalName() is null for anonymous/local classes and uses '.'
        // for nested classes - confirm that is acceptable for the class attribute
        Element detectorElement = doc.createElement("detector");
        detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
        detectorsElement.appendChild(detectorElement);
    } else {
        // Expand the composite into one element per child detector
        List<Detector> children = ((CompositeDetector) detector).getDetectors();
        for (Detector d : children) {
            Element detectorElement = doc.createElement("detector");
            detectorElement.setAttribute("class", d.getClass().getCanonicalName());
            detectorsElement.appendChild(detectorElement);
        }
    }
    rootElement.appendChild(detectorsElement);
}
use of org.apache.tika.detect.DefaultDetector in project tika by apache.
the class ExcelParserTest method testExcel95.
/**
 * Excel 5 and Excel 95 are legacy formats with only basic support: checks that both
 * are detected as Excel, that the expected parser claims the type, and that the text
 * and metadata extracted from each sample file are as expected.
 */
@Test
public void testExcel95() throws Exception {
    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();
    MediaType detected;

    // Detection: Excel 5
    Metadata excel5Hint = new Metadata();
    excel5Hint.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        detected = detector.detect(stream, excel5Hint);
        assertEquals("application/vnd.ms-excel", detected.toString());
    }

    // Detection: Excel 95
    Metadata excel95Hint = new Metadata();
    excel95Hint.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        detected = detector.detect(stream, excel95Hint);
        assertEquals("application/vnd.ms-excel", detected.toString());
    }

    // The detected type is claimed by OfficeParser...
    boolean officeSupports = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(true, officeSupports);
    // ...but not by OOXMLParser
    boolean ooxmlSupports = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, ooxmlSupports);

    // Parsing: Excel 5
    Metadata excel5Meta = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        ContentHandler body = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        parser.parse(stream, body, excel5Meta, parseContext);
        String text = body.toString();

        // Sheet names
        assertContains("Feuil1", text);
        assertContains("Feuil3", text);
        // Cell text
        assertContains("Sample Excel", text);
        assertContains("Number", text);
        // Cell numbers
        assertContains("15", text);
        assertContains("225", text);
        // Document metadata is populated as well
        assertEquals("Simple Excel document", excel5Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", excel5Meta.get(TikaCoreProperties.CREATOR));
    }

    // Parsing: Excel 95
    Metadata excel95Meta = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        ContentHandler body = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        parser.parse(stream, body, excel95Meta, parseContext);
        String text = body.toString();

        // Only a sheet name to look for: the file holds no actual text or numbers
        assertContains("Foglio1", text);
        // Document metadata: no title, but a last author
        assertEquals(null, excel95Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", excel95Meta.get(Office.LAST_AUTHOR));
    }
}
use of org.apache.tika.detect.DefaultDetector in project ddf by codice.
the class MimeTypeMapperImpl method guessMimeType.
/**
 * Resolves a mime type for the given content by asking each registered
 * {@code MimeTypeResolver}, in the order produced by {@code sortResolvers}, for the mime
 * type mapped to the file extension.
 *
 * <p>When {@code fileExtension} is null or empty, the stream content is first run through
 * a Tika {@code DefaultDetector} and the extension is derived from the detected media
 * type. If that detection fails, the failure is logged and resolution continues with the
 * extension as given.
 *
 * <p>For the XML extension, the root namespace of the document is read once and a resolver
 * is only consulted when it declares a schema equal to that namespace.
 *
 * @param is            content to inspect; it is read when the extension is missing or is
 *                      the XML extension
 * @param fileExtension extension to resolve, may be null or empty
 * @return the first non-empty mime type returned by a resolver, or {@code null} if no
 *         resolver produced one
 * @throws MimeTypeResolutionException if a resolver throws while being queried
 */
@Override
public String guessMimeType(InputStream is, String fileExtension) throws MimeTypeResolutionException {
    LOGGER.trace("ENTERING: guessMimeType()");
    String mimeType = null;
    LOGGER.debug("Looping through{} MimeTypeResolvers", mimeTypeResolvers.size());
    // This is to force the TikaMimeTypeResolver to be called
    // after the CustomMimeTypeResolvers to prevent Tika default mapping
    // from being used when a CustomMimeTypeResolver may be more appropriate.
    List<MimeTypeResolver> sortedResolvers = sortResolvers(mimeTypeResolvers);
    if (StringUtils.isEmpty(fileExtension)) {
        // No extension supplied: buffer the content so it can be read twice, once for
        // detection here and once more by the code below.
        try (TemporaryFileBackedOutputStream tfbos = new TemporaryFileBackedOutputStream()) {
            IOUtils.copy(is, tfbos);
            try (InputStream inputStream = tfbos.asByteSource().openStream()) {
                Detector detector = new DefaultDetector();
                MediaType mediaType = detector.detect(inputStream, new Metadata());
                fileExtension = getFileExtensionForMimeType(mediaType.toString()).replace(".", "");
            } finally {
                // The original stream has been consumed by the copy; replace it with a
                // fresh one over the buffered content.
                // NOTE(review): tfbos is closed when the enclosing try-with-resources
                // exits, and this replacement stream is never closed here - confirm it
                // remains readable afterwards and who is responsible for closing it.
                is = tfbos.asByteSource().openStream();
            }
        } catch (Exception e) {
            // Best effort only: keep going with whatever extension we have, but record
            // the cause so the failure can be diagnosed.
            LOGGER.debug("Failed to guess mimeType for file without extension.", e);
        }
    }
    // Constant-first comparison: fileExtension is still null here when the caller passed
    // null and content detection failed, and must not cause a NullPointerException.
    boolean isXml = XML_FILE_EXTENSION.equals(fileExtension);
    // If file has XML extension, then read root element namespace once so
    // each MimeTypeResolver does not have to open the stream and read the namespace
    String namespace = null;
    if (isXml) {
        try {
            namespace = XMLUtils.getRootNamespace(IOUtils.toString(is));
        } catch (IOException ioe) {
            LOGGER.debug("Could not read namespace from input stream.", ioe);
        }
        LOGGER.debug("namespace = {}", namespace);
    }
    // Once a mime type is found for the given file extension, exit the loop.
    for (MimeTypeResolver resolver : sortedResolvers) {
        LOGGER.debug("Calling MimeTypeResolver {}", resolver.getName());
        try {
            if (isXml) {
                // For XML, only accept a resolver whose declared schema equals the
                // document's root namespace; resolvers without a matching schema are
                // skipped.
                if (namespace != null && resolver.hasSchema() && namespace.equals(resolver.getSchema())) {
                    mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
                }
            } else {
                mimeType = resolver.getMimeTypeForFileExtension(fileExtension);
            }
        } catch (Exception e) {
            LOGGER.debug("Error resolving mime type for file extension: {}", fileExtension);
            throw new MimeTypeResolutionException(e);
        }
        if (StringUtils.isNotEmpty(mimeType)) {
            // Two placeholders for two arguments, so the resolver name is actually logged
            LOGGER.debug("mimeType [{}] retrieved from MimeTypeResolver: {}", mimeType, resolver.getName());
            break;
        }
    }
    LOGGER.debug("mimeType = {}, file extension = [{}]", mimeType, fileExtension);
    LOGGER.trace("EXITING: guessMimeType()");
    return mimeType;
}
Aggregations