use of org.apache.tika.detect.Detector in project tika by apache.
the class ForkParserIntegrationTest method testParserHandlingOfNonSerializable.
/**
 * If we supply a non serializable object on the ParseContext,
 * check we get a helpful exception back.
 *
 * <p>An anonymous {@link Detector} is placed on the context, and the parse
 * call is expected to fail with a {@link TikaException} whose cause is a
 * {@link NotSerializableException} and whose message names the ParseContext
 * as the thing that could not be serialized.
 *
 * @throws Exception if the parser fails in any way other than the expected one
 */
@Test
public void testParserHandlingOfNonSerializable() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    ParseContext context = new ParseContext();
    context.set(Detector.class, new Detector() {
        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            return MediaType.OCTET_STREAM;
        }
    });
    // try-with-resources so the test document stream is released even though
    // the parse call is expected to throw; the stream is closed before the
    // catch and finally clauses run.
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler output = new BodyContentHandler();
        parser.parse(stream, output, new Metadata(), context);
        fail("Should have blown up with a non serializable ParseContext");
    } catch (TikaException e) {
        // Check the right details
        assertNotNull(e.getCause());
        assertEquals(NotSerializableException.class, e.getCause().getClass());
        assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
    } finally {
        parser.close();
    }
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class TikaResource method fillMetadata.
/**
 * Copies request-level hints from the HTTP headers into the parse metadata,
 * the parser's detector and the parse context.
 *
 * <ul>
 *   <li>A file name found by {@code detectFilename} is stored as the resource name.</li>
 *   <li>A Content-Type header is recorded in the metadata and a wrapping detector is
 *       installed, unless its subtype is {@code xml} or it equals
 *       {@code application/octet-stream}, in which case it is ignored.</li>
 *   <li>A {@code Password} header is exposed through a {@link PasswordProvider}.</li>
 * </ul>
 *
 * @param parser      parser whose detector is replaced when a usable content type is supplied
 * @param metadata    metadata to populate
 * @param context     parse context that receives the password provider
 * @param httpHeaders request headers to read from
 */
@SuppressWarnings("serial")
public static void fillMetadata(Parser parser, Metadata metadata, ParseContext context, MultivaluedMap<String, String> httpHeaders) {
    String resourceName = detectFilename(httpHeaders);
    if (resourceName != null) {
        metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
    }

    String rawContentType = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
    javax.ws.rs.core.MediaType declaredType = null;
    if (rawContentType != null) {
        declaredType = javax.ws.rs.core.MediaType.valueOf(rawContentType);
    }
    // Declared types with an "xml" subtype, or plain application/octet-stream,
    // are dropped so that they are not used as a detection hint.
    if (declaredType != null
            && ("xml".equals(declaredType.getSubtype())
                    || declaredType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE))) {
        declaredType = null;
    }

    if (declaredType != null) {
        metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, declaredType.toString());
        final Detector fallback = getDetector(parser);
        setDetector(parser, new Detector() {
            @Override
            public MediaType detect(InputStream stream, Metadata md) throws IOException {
                //make sure never to return null -- TIKA-1845
                String hinted = md.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
                //parse can return null if the hint is not a valid mime type
                MediaType parsed = (hinted == null) ? null : MediaType.parse(hinted);
                return (parsed != null) ? parsed : fallback.detect(stream, md);
            }
        });
    }

    final String password = httpHeaders.getFirst("Password");
    if (password == null) {
        return;
    }
    context.set(PasswordProvider.class, new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return password;
        }
    });
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class TikaCLI method displayDetector.
/**
 * Prints the class name of a detector at the given indentation and, for a
 * {@link CompositeDetector}, recursively prints each of its child detectors
 * two levels deeper.
 *
 * @param d detector to print
 * @param i indentation level passed to {@code indent}
 */
private void displayDetector(Detector d, int i) {
    String className = d.getClass().getName();
    String line = indent(i) + className;

    if (!(d instanceof CompositeDetector)) {
        System.out.println(line);
        return;
    }

    System.out.println(line + " (Composite Detector):");
    for (Detector child : ((CompositeDetector) d).getDetectors()) {
        displayDetector(child, i + 2);
    }
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class OOXMLParserTest method testExcelXLSB.
/**
 * Checks detection and parsing of the XLSB test workbook: the detector must
 * report the binary macro-enabled sheet type, OfficeParser must not claim
 * that type, OOXMLParser must claim it, and AutoDetectParser must extract
 * the expected text.
 */
@Test
public void testExcelXLSB() throws Exception {
    final String resource = "/test-documents/testEXCEL.xlsb";

    Detector detector = new DefaultDetector();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Should be detected correctly
    MediaType detected;
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resource)) {
        detected = detector.detect(in, metadata);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", detected.toString());
    }

    // OfficeParser won't handle it
    boolean officeSupports = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, officeSupports);

    // OOXMLParser will (soon) handle it
    boolean ooxmlSupports = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertTrue(ooxmlSupports);

    // AutoDetectParser doesn't break on it
    try (InputStream in = ExcelParserTest.class.getResourceAsStream(resource)) {
        ContentHandler body = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        autoParser.parse(in, body, metadata, parseContext);
        assertContains("This is an example spreadsheet", body.toString());
    }
}
use of org.apache.tika.detect.Detector in project tika by apache.
the class BundleIT method testBundleDetectors.
/**
 * Compares the detector class names exposed through the OSGi service with
 * those found by the traditional service-loading DefaultDetector.
 */
@Test
public void testBundleDetectors() throws Exception {
    // The detector created by OSGi has a flat list of detectors, whereas the
    // one created by the traditional service loading method has children:
    // DefaultDetector, MimeTypes. The service-loaded DefaultDetector is
    // therefore flattened one level to get equivalence.
    // Detection behavior should all be the same.

    // Collect the detector class names found within OSGi
    ServiceReference<Detector> ref = bc.getServiceReference(Detector.class);
    DefaultDetector osgiDetector = (DefaultDetector) bc.getService(ref);
    Set<String> osgiNames = new HashSet<>();
    for (Detector each : osgiDetector.getDetectors()) {
        osgiNames.add(each.getClass().getName());
    }

    // Check we did get a few, just in case...
    int osgiCount = osgiNames.size();
    assertTrue("Should have several Detector names, found " + osgiCount, osgiCount > 3);

    // Collect the names from the traditional service loading mechanism,
    // expanding any nested DefaultDetector into its children
    Set<String> loadedNames = new HashSet<>();
    for (Detector top : new DefaultDetector().getDetectors()) {
        if (!(top instanceof DefaultDetector)) {
            loadedNames.add(top.getClass().getName());
            continue;
        }
        for (Detector nested : ((DefaultDetector) top).getDetectors()) {
            loadedNames.add(nested.getClass().getName());
        }
    }

    assertEquals(osgiNames, loadedNames);
}
Aggregations