use of org.apache.tika.detect.DefaultDetector in project tika by apache.
the class RFC822ParserTest method testExtractAttachments.
/**
 * Parses the sample email with a PNG attachment twice: once with a plain
 * parse context, and once with a recording {@link EmbeddedDocumentExtractor}
 * registered in the context. Both passes check the sender and title metadata;
 * the second pass also checks the detected type and extracted text of the
 * embedded parts handed to the extractor.
 */
@Test
public void testExtractAttachments() throws Exception {
    final String emailResource = "test-documents/testEmailWithPNGAtt.eml";
    ContentHandler mailHandler = new BodyContentHandler();
    Metadata mailMetadata = new Metadata();
    Parser mailParser = new RFC822Parser();
    ParseContext parseContext = new ParseContext();

    // First pass: no embedded document extractor has been set on the context yet
    try (InputStream in = getStream(emailResource)) {
        mailParser.parse(in, mailHandler, mailMetadata, parseContext);
    }
    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", mailMetadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", mailMetadata.get(TikaCoreProperties.TITLE));

    // Second pass: detect and parse every embedded part, recording what was seen
    final Detector typeDetector = new DefaultDetector();
    final Parser attachmentParser = new AutoDetectParser();
    final List<MediaType> attachmentTypes = new ArrayList<>();
    final List<String> attachmentText = new ArrayList<>();
    EmbeddedDocumentExtractor recorder = new EmbeddedDocumentExtractor() {
        @Override
        public boolean shouldParseEmbedded(Metadata embeddedMetadata) {
            // Accept every embedded part
            return true;
        }

        @Override
        public void parseEmbedded(InputStream embedded, ContentHandler ignoredHandler, Metadata embeddedMetadata, boolean outputHtml) throws SAXException, IOException {
            // Record the detected type first, then the text produced by a full parse
            attachmentTypes.add(typeDetector.detect(embedded, embeddedMetadata));
            ContentHandler attachmentHandler = new BodyContentHandler();
            try {
                attachmentParser.parse(embedded, attachmentHandler, embeddedMetadata, new ParseContext());
            } catch (TikaException e) {
                // TikaException is not in this method's throws clause, so wrap it unchecked
                throw new RuntimeException(e);
            }
            attachmentText.add(attachmentHandler.toString());
        }
    };
    parseContext.set(EmbeddedDocumentExtractor.class, recorder);

    // The same handler and metadata objects are reused for the second parse
    try (InputStream in = getStream(emailResource)) {
        mailParser.parse(in, mailHandler, mailMetadata, parseContext);
    }
    // Check we got the metadata
    assertEquals("Tika Test <XXXX@apache.org>", mailMetadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test Attachment Email", mailMetadata.get(TikaCoreProperties.TITLE));

    // Check attachments: two parts expected, plain text first and the PNG second
    assertEquals(2, attachmentTypes.size());
    assertEquals(2, attachmentText.size());
    assertEquals("text/plain", attachmentTypes.get(0).toString());
    assertEquals("image/png", attachmentTypes.get(1).toString());
    assertEquals("This email has a PNG attachment included in it\n\n", attachmentText.get(0));
}
use of org.apache.tika.detect.DefaultDetector in project tika by apache.
the class OOXMLParserTest method testExcelXLSB.
/**
 * Checks handling of the sample XLSB workbook: the detector reports the
 * binary macro-enabled Excel type, {@link OfficeParser} does not list that
 * type as supported while {@link OOXMLParser} does, and
 * {@link AutoDetectParser} extracts the expected text from the file.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String workbookResource = "/test-documents/testEXCEL.xlsb";
    Detector typeDetector = new DefaultDetector();
    Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Should be detected correctly
    MediaType detected;
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(workbookResource)) {
        detected = typeDetector.detect(in, metadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // OfficeParser won't handle it
    boolean officeSupportsIt = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, officeSupportsIt);

    // OOXMLParser will (soon) handle it
    boolean ooxmlSupportsIt = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertTrue(ooxmlSupportsIt);

    // AutoDetectParser doesn't break on it
    AutoDetectParser autoParser = new AutoDetectParser();
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(workbookResource)) {
        // -1 is the write limit handed to BodyContentHandler
        ContentHandler textHandler = new BodyContentHandler(-1);
        ParseContext usContext = new ParseContext();
        usContext.set(Locale.class, Locale.US);
        // The metadata instance used for detection is reused for the parse
        autoParser.parse(in, textHandler, metadata, usContext);
        assertContains("This is an example spreadsheet", textHandler.toString());
    }
}
use of org.apache.tika.detect.DefaultDetector in project tika by apache.
the class Activator method start.
/**
 * Registers a {@link DefaultDetector} and a {@link DefaultParser} as services
 * on the given bundle context, under the {@link Detector} and {@link Parser}
 * class names respectively. Both are built with this Activator's class loader.
 * The returned registrations are kept in {@code detectorService} and
 * {@code parserService}.
 *
 * @param context bundle context on which the two services are registered
 * @throws Exception declared by the overridden method
 */
@Override
public void start(BundleContext context) throws Exception {
    final ClassLoader activatorLoader = Activator.class.getClassLoader();

    final Detector detector = new DefaultDetector(activatorLoader);
    detectorService = context.registerService(Detector.class.getName(), detector, new Properties());

    final Parser parser = new DefaultParser(activatorLoader);
    parserService = context.registerService(Parser.class.getName(), parser, new Properties());
}
use of org.apache.tika.detect.DefaultDetector in project tika by apache.
the class BundleIT method testBundleDetectors.
/**
 * Compares the detector class names exposed by the OSGi-registered
 * {@link DefaultDetector} with those of a {@link DefaultDetector} created
 * directly, after flattening one level of nested DefaultDetector children in
 * the latter, and expects the two sets to be equal.
 */
@Test
public void testBundleDetectors() throws Exception {
    // For some reason, the detector created by OSGi has a flat list of
    // detectors, whereas the detector created by the traditional service
    // loading method has children: DefaultDetector, MimeTypes.
    // The service-loaded DefaultDetector is flattened below to get equivalence.
    // Detection behavior should all be the same.

    // Collect the detector class names found within OSGi
    ServiceReference<Detector> reference = bc.getServiceReference(Detector.class);
    DefaultDetector osgiDetector = (DefaultDetector) bc.getService(reference);
    Set<String> osgiNames = new HashSet<>();
    for (Detector registered : osgiDetector.getDetectors()) {
        osgiNames.add(registered.getClass().getName());
    }

    // Check we did get a few, just in case...
    assertTrue("Should have several Detector names, found " + osgiNames.size(), osgiNames.size() > 3);

    // Collect the names from the traditional service loading mechanism,
    // expanding any nested DefaultDetector into its direct children
    DefaultDetector serviceLoaded = new DefaultDetector();
    Set<String> serviceLoadedNames = new HashSet<>();
    for (Detector topLevel : serviceLoaded.getDetectors()) {
        if (!(topLevel instanceof DefaultDetector)) {
            serviceLoadedNames.add(topLevel.getClass().getName());
            continue;
        }
        for (Detector nested : ((DefaultDetector) topLevel).getDetectors()) {
            serviceLoadedNames.add(nested.getClass().getName());
        }
    }

    assertEquals(osgiNames, serviceLoadedNames);
}
use of org.apache.tika.detect.DefaultDetector in project uPortal by Jasig.
the class JaxbPortalDataHandlerService method getMediaType.
/**
 * Detects the media type of the given stream with a {@link DefaultDetector},
 * passing the file name to the detector as the resource-name metadata entry.
 *
 * @param inputStream stream whose content is inspected; {@code reset()} is
 *        called on it before this method returns
 * @param fileName name stored under {@code Metadata.RESOURCE_NAME_KEY} and
 *        used in the log messages
 * @return the detected media type, or {@code null} when detection throws an
 *         {@link IOException} (the failure is logged at warn level)
 * @throws IOException if thrown outside the guarded detection, e.g. by the
 *         {@code reset()} call in the finally block
 */
protected MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException {
    // Detection reads through a wrapper; presumably CloseShieldInputStream keeps
    // the closeQuietly() below from closing the caller's stream - confirm
    final TikaInputStream detectionStream = TikaInputStream.get(new CloseShieldInputStream(inputStream));
    try {
        final Metadata hints = new Metadata();
        hints.set(Metadata.RESOURCE_NAME_KEY, fileName);

        final Detector sniffer = new DefaultDetector();
        final MediaType detected = sniffer.detect(detectionStream, hints);
        logger.debug("Determined '{}' for '{}'", detected, fileName);
        return detected;
    } catch (IOException e) {
        // The message says "assuming XML", but this method itself only returns null
        logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
        return null;
    } finally {
        IOUtils.closeQuietly(detectionStream);
        // Reset the buffered stream to make up for anything read by the detector.
        // NOTE(review): reset() needs a mark set earlier on inputStream; no mark()
        // call is visible in this method - confirm that the caller sets one.
        inputStream.reset();
    }
}
Aggregations