use of org.apache.tika.mime.MediaType in project tika by apache.
the class AbstractPOIFSExtractor method handleEmbeddedOfficeDoc.
/**
 * Handle an office document that's embedded at the POIFS level.
 * <p>
 * If the directory holds a "Package" entry, that entry is treated as an OOXML
 * zip container: its type is detected and the stream is passed on to
 * {@code handleEmbeddedResource}. Otherwise the directory is treated as regular
 * OLE2: OLE10Native records are unwrapped, CompObj "CONTENTS"/"Contents" streams
 * are read and type-detected, and any other type is passed on as an open
 * container to the embedded document parser.
 * <p>
 * Failures while unwrapping or detecting are recorded on the parent metadata
 * and end processing of this embedded document; they are not rethrown.
 *
 * @param dir          the POIFS directory holding the embedded document
 * @param resourceName name to use for the embedded resource; when {@code null}
 *                     the directory's own name is used
 * @param xhtml        handler that receives the embedded document's output
 * @throws IOException   if reading the "Package" entry or closing a stream fails
 * @throws SAXException  if the content handler reports an error
 * @throws TikaException if handling the embedded document fails
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");
        try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = null;
            try {
                //if there's a stream error while detecting...
                type = detector.detect(stream, new Metadata());
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
            handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
            return;
        }
    }
    // It's regular OLE2:
    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    if (dir.getStorageClsid() != null) {
        metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
    }
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;
    String rName = (resourceName == null) ? dir.getName() : resourceName;
    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                }
                if (ole.getCommand() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
                }
                if (ole.getFileName() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
                }
                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record, skip it.
                // Deliberately ignored: "embedded" stays null, so the directory
                // is handed on below as an open container instead.
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                //TODO: figure out if the equivalent of OLE 1.0's
                //getCommand() and getFileName() exist for OLE 2.0 to populate
                //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case spelling of the entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }
                byte[] contents;
                // Close the POIFS stream once the bytes have been copied out
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    contents = new byte[contentsEntry.getSize()];
                    inp.readFully(contents);
                }
                embedded = TikaInputStream.get(contents);
                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }
                // Record what we can do about it.
                // Use the full "type/subtype" string: MediaType.getType() is only
                // the primary type (e.g. "application").
                metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
        }
        // Should we parse it?
        if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } catch (IOException e) {
        // Record on the parent, like every other failure path in this method
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
    } finally {
        if (embedded != null) {
            embedded.close();
        }
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class HSLFExtractor method handleSlideEmbeddedResources.
/**
 * Processes every OLE shape on the given slide as an embedded resource.
 * <p>
 * For each OLE shape that yields object data, an empty
 * {@code <div class="embedded" id="...">} marker is written to the handler and
 * the object's data stream is then handed on, either as an embedded office
 * document (CompObj) or as a generic embedded resource. Failures to read the
 * shapes or an object's data are recorded on the parent metadata and skipped.
 *
 * @param slide the slide whose shapes are inspected
 * @param xhtml handler that receives the marker elements and embedded output
 */
private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
    List<HSLFShape> slideShapes;
    try {
        slideShapes = slide.getShapes();
    } catch (NullPointerException npe) {
        // Sometimes HSLF hits problems
        // Please open POI bugs for any you come across!
        EmbeddedDocumentUtil.recordEmbeddedStreamException(npe, parentMetadata);
        return;
    }

    for (HSLFShape candidate : slideShapes) {
        if (!(candidate instanceof OLEShape)) {
            continue;
        }
        OLEShape ole = (OLEShape) candidate;

        HSLFObjectData objectData;
        try {
            objectData = ole.getObjectData();
        } catch (NullPointerException npe) {
            /* getObjectData throws NPE some times. */
            EmbeddedDocumentUtil.recordEmbeddedStreamException(npe, parentMetadata);
            continue;
        }
        if (objectData == null) {
            continue;
        }

        String objectId = Integer.toString(ole.getObjectID());

        // Embedded Object: add a <div class="embedded" id="X"/> so the
        // consumer can see where in the main text each embedded document
        // occurred:
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", objectId);
        xhtml.startElement("div", divAttributes);
        xhtml.endElement("div");

        InputStream rawData;
        try {
            rawData = objectData.getData();
        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            continue;
        }

        try (TikaInputStream tis = TikaInputStream.get(rawData)) {
            // Excel charts are identified by ProgID; everything else is detected
            String detectedType = "Excel.Chart.8".equals(ole.getProgID())
                    ? "application/vnd.ms-excel"
                    : getTikaConfig().getDetector().detect(tis, new Metadata()).toString();

            if (detectedType.equals("application/x-tika-msoffice-embedded; format=comp_obj")) {
                // Shield the outer stream so closing the file system leaves it open
                try (NPOIFSFileSystem fileSystem = new NPOIFSFileSystem(new CloseShieldInputStream(tis))) {
                    handleEmbeddedOfficeDoc(fileSystem.getRoot(), objectId, xhtml);
                }
            } else {
                handleEmbeddedResource(tis, objectId, objectId, detectedType, xhtml, false);
            }
        } catch (IOException ioe) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(ioe, parentMetadata);
        }
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaParsers method renderParser.
/**
 * Appends a textual description of a parser to the buffer, one parser per line.
 * <p>
 * The line holds the indent, the parser's class name and, for decorated
 * parsers, a "(Decorated Parser ...)" suffix. Composite parsers are followed by
 * their child parsers, rendered recursively with one extra space of indent.
 * Other parsers are optionally followed by one "Supports:" line per supported
 * media type.
 *
 * @param p             details of the parser to render
 * @param withMimeTypes whether to list supported media types for non-composite parsers
 * @param text          buffer the description is appended to
 * @param indent        prefix written before the parser's class name
 */
private void renderParser(ParserDetails p, boolean withMimeTypes, StringBuffer text, String indent) {
    text.append(indent).append(p.className);

    if (p.isDecorated) {
        text.append(" (Decorated Parser");
        if (p.decoratedBy != null) {
            text.append(" ").append(p.decoratedBy);
        }
        text.append(")");
    }

    String childIndent = indent + " ";

    if (p.isComposite) {
        text.append(" (Composite Parser):\n");
        for (Parser child : p.childParsers) {
            renderParser(new ParserDetails(child), withMimeTypes, text, childIndent);
        }
        return;
    }

    text.append("\n");
    if (!withMimeTypes) {
        return;
    }
    for (MediaType supported : p.supportedTypes) {
        text.append(childIndent)
            .append("Supports: ")
            .append(supported.toString())
            .append("\n");
    }
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class RTFParserTest method testEmbeddedLinkedDocument.
//TIKA-1010 test linked embedded doc
@Test
public void testEmbeddedLinkedDocument() throws Exception {
    // First pass: tracker is constructed with image/emf and image/wmf as skip types
    // (presumably these are left out of the tracked filenames; see TrackingHandler)
    Set<MediaType> typesToSkip = new HashSet<>();
    typesToSkip.add(MediaType.parse("image/emf"));
    typesToSkip.add(MediaType.parse("image/wmf"));
    TrackingHandler handler = new TrackingHandler(typesToSkip);
    try (TikaInputStream input = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor extractor = new ParserContainerExtractor();
        assertEquals(true, extractor.isSupported(input));
        extractor.extract(input, extractor, handler);
    }
    //should gracefully skip link and not throw NPE, IOEx, etc
    assertEquals(0, handler.filenames.size());

    // Second pass: same document, tracker without skip types
    handler = new TrackingHandler();
    try (TikaInputStream input = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor extractor = new ParserContainerExtractor();
        assertEquals(true, extractor.isSupported(input));
        extractor.extract(input, extractor, handler);
    }
    // extraction must again complete without throwing; two filenames are tracked
    assertEquals(2, handler.filenames.size());
}
use of org.apache.tika.mime.MediaType in project tika by apache.
the class TikaMimeTypes method getMediaTypes.
/**
 * Builds one {@code MediaTypeDetails} entry for every media type in the
 * configured registry.
 * <p>
 * Each entry carries the type, its aliases, its supertype (omitted when the
 * supertype is absent or is {@code application/octet-stream}) and the class
 * name of the parser registered for the type, if any. When the registered
 * parser is itself a {@code CompositeParser}, the parser it maps the type to is
 * reported instead; if that nested lookup finds nothing, the composite parser's
 * own class name is reported.
 *
 * @return the details for all registered media types, in registry iteration order
 */
protected List<MediaTypeDetails> getMediaTypes() {
    MediaTypeRegistry registry = TikaResource.getConfig().getMediaTypeRegistry();
    Map<MediaType, Parser> parsers = ((CompositeParser) TikaResource.getConfig().getParser()).getParsers();
    List<MediaTypeDetails> types = new ArrayList<TikaMimeTypes.MediaTypeDetails>(registry.getTypes().size());
    for (MediaType type : registry.getTypes()) {
        MediaTypeDetails details = new MediaTypeDetails();
        details.type = type;
        details.aliases = registry.getAliases(type).toArray(new MediaType[0]);
        MediaType supertype = registry.getSupertype(type);
        if (supertype != null && !MediaType.OCTET_STREAM.equals(supertype)) {
            details.supertype = supertype;
        }
        Parser p = parsers.get(type);
        if (p != null) {
            if (p instanceof CompositeParser) {
                // Look one level down for the parser that actually handles the type.
                // The nested map may not contain the type; keep the composite
                // parser in that case rather than dereferencing null.
                Parser nested = ((CompositeParser) p).getParsers().get(type);
                if (nested != null) {
                    p = nested;
                }
            }
            details.parser = p.getClass().getName();
        }
        types.add(details);
    }
    return types;
}
Aggregations