use of org.apache.tika.detect.Detector in project tika by apache.
the class TikaConfigSerializer method addDetectors.
/**
 * Appends the detector configuration held by {@code config} to {@code rootElement}.
 * <ul>
 * <li>In {@code Mode.MINIMAL} with a {@code DefaultDetector}, no {@code <detectors>}
 * element is written; only an XML comment showing an example is appended.</li>
 * <li>In {@code Mode.CURRENT} with a {@code DefaultDetector}, or whenever the detector
 * is not a {@code CompositeDetector}, a single {@code <detector>} entry naming the
 * detector's own class is written.</li>
 * <li>Otherwise one {@code <detector>} entry is written per child of the composite,
 * in the order returned by {@code getDetectors()}.</li>
 * </ul>
 *
 * @param mode        serialisation mode that selects how much detail is written
 * @param rootElement element that receives the comment or the {@code <detectors>} element
 * @param doc         document used to create the new nodes
 * @param config      configuration whose detector is serialised
 * @throws Exception declared for callers; nothing in this method throws it explicitly
 */
private static void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
    Detector detector = config.getDetector();
    if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
        // Everything is on defaults: write no <detectors> element, only an illustrative
        // comment. The example must be well-formed XML and name a class that exists.
        Node detComment = doc.createComment("for example: <detectors><detector class=\"org.apache.tika.mime.MimeTypes\"/></detectors>");
        rootElement.appendChild(detComment);
        return;
    }
    Element detectorsElement = doc.createElement("detectors");
    // Parenthesised to make the intended grouping of && and || explicit.
    boolean writeDetectorItself = (mode == Mode.CURRENT && detector instanceof DefaultDetector)
            || !(detector instanceof CompositeDetector);
    if (writeDetectorItself) {
        detectorsElement.appendChild(detectorElementFor(doc, detector));
    } else {
        List<Detector> children = ((CompositeDetector) detector).getDetectors();
        for (Detector child : children) {
            detectorsElement.appendChild(detectorElementFor(doc, child));
        }
    }
    rootElement.appendChild(detectorsElement);
}

/**
 * Builds a {@code <detector>} element whose {@code class} attribute names the runtime
 * class of {@code detector}.
 */
private static Element detectorElementFor(Document doc, Detector detector) {
    Class<?> type = detector.getClass();
    String className = type.getCanonicalName();
    if (className == null) {
        // Anonymous and local classes have no canonical name; fall back to the binary
        // name so the attribute is never written from a null value.
        className = type.getName();
    }
    Element detectorElement = doc.createElement("detector");
    detectorElement.setAttribute("class", className);
    return detectorElement;
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class AdvancedTypeDetector method detectWithCustomDetector.
/**
 * Detects the media type for {@code name} using a composite of a custom detector and
 * the detector created by {@code MimeTypesFactory} from the bundled mime-types resource.
 * The custom detector returns the type found under the {@code my-custom-type-override}
 * metadata key when present, and {@code MediaType.OCTET_STREAM} otherwise.
 *
 * @param name value handed to {@code Tika.detect(String)}
 * @return the result of {@code Tika.detect(name)}
 * @throws Exception if creating the detector or running detection fails
 */
public static String detectWithCustomDetector(String name) throws Exception {
    Detector mimeTypes = MimeTypesFactory.create("/org/apache/tika/mime/tika-mimetypes.xml");
    Detector overrideDetector = new Detector() {
        private static final long serialVersionUID = -5420638839201540749L;

        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            String override = metadata.get("my-custom-type-override");
            return override == null ? MediaType.OCTET_STREAM : MediaType.parse(override);
        }
    };
    // The custom detector is listed first, the mime-types detector second.
    Detector combined = new CompositeDetector(overrideDetector, mimeTypes);
    return new Tika(combined).detect(name);
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class TikaDetectorConfigTest method assertDetectors.
/**
 * Verifies which container detectors are present among the children of {@code detector}.
 * Fails as soon as a {@code ZipContainerDetector} or {@code POIFSContainerDetector} is
 * found that should not be there, and afterwards asserts that every expected one was seen.
 *
 * @param detector        composite whose child detectors are inspected
 * @param shouldHavePOIFS whether a {@code POIFSContainerDetector} is expected
 * @param shouldHaveZip   whether a {@code ZipContainerDetector} is expected
 */
private void assertDetectors(CompositeDetector detector, boolean shouldHavePOIFS, boolean shouldHaveZip) {
    boolean zipSeen = false;
    boolean poifsSeen = false;
    for (Detector child : detector.getDetectors()) {
        if (child instanceof ZipContainerDetector) {
            if (!shouldHaveZip) {
                fail("Shouldn't have the ZipContainerDetector from config");
            }
            zipSeen = true;
        }
        if (child instanceof POIFSContainerDetector) {
            if (!shouldHavePOIFS) {
                fail("Shouldn't have the POIFSContainerDetector from config");
            }
            poifsSeen = true;
        }
    }
    if (shouldHavePOIFS) {
        assertTrue("Should have the POIFSContainerDetector", poifsSeen);
    }
    if (shouldHaveZip) {
        assertTrue("Should have the ZipContainerDetector", zipSeen);
    }
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class ExcelParserTest method testExcel95.
/**
 * Excel 5 and Excel 95 are older formats that only get basic support. This test checks
 * that both sample files are detected as {@code application/vnd.ms-excel}, that the
 * detected type is supported by {@code OfficeParser} but not by {@code OOXMLParser},
 * and that parsing each file yields the expected sheet names, text and metadata.
 */
@Test
public void testExcel95() throws Exception {
    final String excel5Resource = "/test-documents/testEXCEL_5.xls";
    final String excel95Resource = "/test-documents/testEXCEL_95.xls";

    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();

    // Detection: Excel 5
    Metadata excel5DetectMeta = new Metadata();
    excel5DetectMeta.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel5Resource)) {
        MediaType excel5Type = detector.detect(stream, excel5DetectMeta);
        assertEquals("application/vnd.ms-excel", excel5Type.toString());
    }

    // Detection: Excel 95 (this type is reused for the parser-support checks below)
    MediaType excel95Type;
    Metadata excel95DetectMeta = new Metadata();
    excel95DetectMeta.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel95Resource)) {
        excel95Type = detector.detect(stream, excel95DetectMeta);
        assertEquals("application/vnd.ms-excel", excel95Type.toString());
    }

    // OfficeParser lists the type as supported
    boolean officeSupports = new OfficeParser().getSupportedTypes(new ParseContext()).contains(excel95Type);
    assertEquals(true, officeSupports);
    // OOXMLParser does not
    boolean ooxmlSupports = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(excel95Type);
    assertEquals(false, ooxmlSupports);

    // Parsing: Excel 5
    Metadata excel5Meta = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel5Resource)) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        parser.parse(stream, bodyHandler, excel5Meta, parseContext);
        String text = bodyHandler.toString();

        // Sheet names
        assertContains("Feuil1", text);
        assertContains("Feuil3", text);
        // Text
        assertContains("Sample Excel", text);
        assertContains("Number", text);
        // Numbers
        assertContains("15", text);
        assertContains("225", text);
        // Metadata
        assertEquals("Simple Excel document", excel5Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", excel5Meta.get(TikaCoreProperties.CREATOR));
    }

    // Parsing: Excel 95
    Metadata excel95Meta = new Metadata();
    try (InputStream stream = ExcelParserTest.class.getResourceAsStream(excel95Resource)) {
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        parser.parse(stream, bodyHandler, excel95Meta, parseContext);
        String text = bodyHandler.toString();

        // Sheet name; the file contains no other text or numbers to check
        assertContains("Foglio1", text);
        // Metadata
        assertEquals(null, excel95Meta.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", excel95Meta.get(Office.LAST_AUTHOR));
    }
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class TikaDetectors method detectorAsMap.
/**
 * Fills {@code details} with a description of detector {@code d}: its class name under
 * {@code "name"}, whether it is a {@code CompositeDetector} under {@code "composite"},
 * and, for composites, a list of recursively built child descriptions under
 * {@code "children"}.
 *
 * @param d       detector to describe
 * @param details map that receives the description entries
 */
private void detectorAsMap(Detector d, Map<String, Object> details) {
    details.put("name", d.getClass().getName());

    final boolean composite = d instanceof CompositeDetector;
    details.put("composite", composite);
    if (!composite) {
        return;
    }

    List<Map<String, Object>> children = new ArrayList<>();
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        Map<String, Object> childDetails = new HashMap<>();
        detectorAsMap(child, childDetails);
        children.add(childDetails);
    }
    details.put("children", children);
}
Aggregations