use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class OutlookPSTParser method parseMailAttachments.
/**
 * Writes every attachment of a PST message to the XHTML output and, when the
 * extractor accepts it, hands the attachment content to the embedded extractor.
 *
 * <p>For each attachment a {@code <p>} element holding the file name is emitted,
 * followed by a {@code <div class="embedded" id="<filename>">} element that wraps
 * whatever the embedded extractor writes.
 *
 * @param xhtml handler that receives the generated XHTML
 * @param email message whose attachments are processed
 * @param embeddedExtractor decides whether an attachment is parsed, and parses it
 * @throws TikaException wrapping any exception raised while handling an attachment
 */
private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor) throws TikaException {
    int numberOfAttachments = email.getNumberOfAttachments();
    for (int i = 0; i < numberOfAttachments; i++) {
        try {
            PSTAttachment attach = email.getAttachment(i);
            // Get the filename; both long and short filenames can be used for attachments.
            // Fall back to the short name when the long one is missing or empty.
            String filename = attach.getLongFilename();
            if (filename == null || filename.isEmpty()) {
                filename = attach.getFilename();
            }
            xhtml.element("p", filename);

            Metadata attachMeta = new Metadata();
            attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
            attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);

            AttributesImpl attributes = new AttributesImpl();
            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
            attributes.addAttribute("", "id", "id", "CDATA", filename);
            xhtml.startElement("div", attributes);
            if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
                // Temporary resources created while reading the attachment are tied to
                // tmp and released as soon as this attachment has been handled.
                TemporaryResources tmp = new TemporaryResources();
                try {
                    TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
                    embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
                } finally {
                    tmp.dispose();
                }
            }
            xhtml.endElement("div");
        } catch (Exception e) {
            // The method only declares TikaException, so everything is wrapped here.
            throw new TikaException("Unable to unpack document stream", e);
        }
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class OutlookPSTParser method parse.
/**
 * Parses an Outlook PST stream: records the content type, content length and an
 * {@code isValid} flag on the metadata and, when the underlying file descriptor is
 * valid, walks the root folder of the PST file.
 *
 * <p>The PST library is opened by file path, so the stream is accessed through
 * {@link TikaInputStream#getFile()}. Temporary resources created for that purpose
 * are registered with a {@link TemporaryResources} instance and disposed before
 * this method returns.
 *
 * @param stream the PST document
 * @param handler receives the XHTML output
 * @param metadata document metadata, updated in place
 * @param context parse context used to look up the embedded document extractor
 * @throws IOException declared by the parser contract
 * @throws SAXException if starting or ending the XHTML document fails
 * @throws TikaException wrapping any exception raised while reading the PST file,
 *         or if the temporary resources cannot be disposed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Use the delegate parser to parse the contained document
    EmbeddedDocumentExtractor embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // Tie temp files created by getFile() to tmp so they can be cleaned up below.
    TemporaryResources tmp = new TemporaryResources();
    TikaInputStream in = TikaInputStream.get(stream, tmp);
    PSTFile pstFile = null;
    try {
        pstFile = new PSTFile(in.getFile().getPath());
        metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
        boolean isValid = pstFile.getFileHandle().getFD().valid();
        metadata.set("isValid", valueOf(isValid));
        if (isValid) {
            parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
        }
    } catch (Exception e) {
        throw new TikaException(e.getMessage(), e);
    } finally {
        try {
            if (pstFile != null && pstFile.getFileHandle() != null) {
                try {
                    pstFile.getFileHandle().close();
                } catch (IOException ignored) {
                    // Best effort: a failure to close the handle must not hide the parse result.
                }
            }
        } finally {
            // Dispose only after the file handle is closed so the temp file is no longer in use.
            tmp.dispose();
        }
    }
    xhtml.endDocument();
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class AbstractPOIFSExtractor method handleEmbeddedOfficeDoc.
/**
 * Handle an office document that's embedded at the POIFS level.
 *
 * <p>A directory with a {@code Package} entry is treated as OOXML and passed to
 * {@code handleEmbeddedResource} with the detected media type. Anything else is
 * treated as OLE2: the document type is detected from the directory, OLE 1.0
 * native records and {@code CONTENTS}/{@code Contents} entries are unwrapped into
 * an in-memory stream, and the result is handed to the embedded document parser
 * if it accepts the metadata.
 *
 * @param dir directory entry of the embedded document
 * @param resourceName name to use for the resource; when {@code null} the
 *        directory name is used instead
 * @param xhtml handler that receives the output of the embedded parse
 * @throws IOException if reading the OOXML package entry or closing the embedded stream fails
 * @throws SAXException if writing to {@code xhtml} fails
 * @throws TikaException if the embedded parse fails
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");
        try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = null;
            try {
                //if there's a stream error while detecting...
                type = detector.detect(stream, new Metadata());
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
            handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
            return;
        }
    }

    // It's regular OLE2:
    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    if (dir.getStorageClsid() != null) {
        metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
    }
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;
    String rName = (resourceName == null) ? dir.getName() : resourceName;
    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                }
                if (ole.getCommand() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
                }
                if (ole.getFileName() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
                }
                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record, skip it.
                // embedded stays null, so the container-only stream below is used.
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                //TODO: figure out if the equivalent of OLE 1.0's
                //getCommand() and getFileName() exist for OLE 2.0 to populate
                //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case spelling of the entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }
                byte[] contents;
                // Close the document stream once its bytes have been copied into memory
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    contents = new byte[contentsEntry.getSize()];
                    inp.readFully(contents);
                }
                embedded = TikaInputStream.get(contents);
                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }
                // Record what we can do about it.
                // Use the full "type/subtype" string: MediaType.getType() is only the
                // primary type (e.g. "application"), which is not a usable content type.
                metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
        }

        // Should we parse it?
        if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } catch (IOException e) {
        // NOTE(review): this records on the embedded document's metadata, whereas the
        // other handlers in this method record on parentMetadata - confirm this is intended.
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
    } finally {
        if (embedded != null) {
            embedded.close();
        }
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class WordExtractor method handlePictureCharacterRun.
/**
 * Outputs an {@code <img>} tag for the picture attached to a character run and,
 * the first time a given picture is seen, passes its content on as an embedded
 * resource.
 *
 * @param cr the character run the picture belongs to
 * @param picture the picture to output; nothing is done when {@code null}
 * @param pictures source used to number pictures and track which were already output
 * @param xhtml handler that receives the {@code <img>} tag and embedded output
 * @throws SAXException if writing to {@code xhtml} fails
 * @throws IOException if handling or closing the picture stream fails
 * @throws TikaException if handling the embedded resource fails
 */
private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    if (!isRendered(cr) || picture == null) {
        // Nothing to output: the run is not rendered, or there is no picture for it
        return;
    }

    // Which one is it?
    String extension = picture.suggestFileExtension();
    int pictureNumber = pictures.pictureNumber(picture);

    // Make up a name for the picture
    // There isn't one in the file, but we need to be able to reference
    // the picture from the img tag and the embedded resource
    String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : "");

    // Grab the mime type for the picture
    String mimeType = picture.getMimeType();

    // Output the img tag
    AttributesImpl attr = new AttributesImpl();
    attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
    attr.addAttribute("", "alt", "alt", "CDATA", filename);
    xhtml.startElement("img", attr);
    xhtml.endElement("img");

    // (Only expose each individual image once)
    if (!pictures.hasOutput(picture)) {
        // Close the stream deterministically so anything it holds is released
        // even when handling the resource throws.
        try (TikaInputStream stream = TikaInputStream.get(picture.getContent())) {
            handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
        }
        pictures.recordOutput(picture);
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class HSLFExtractor method handleSlideEmbeddedResources.
/**
 * Walks the shapes of a slide and, for every OLE shape that carries object data,
 * emits an empty {@code <div class="embedded" id="<objectId>">} marker and then
 * hands the object's data on for embedded handling.
 *
 * <p>Failures while obtaining shapes, object data or the data stream are recorded
 * on the parent metadata and the affected slide or shape is skipped.
 *
 * @param slide slide whose shapes are inspected
 * @param xhtml handler that receives the marker elements and embedded output
 * @throws TikaException if handling an embedded resource fails
 * @throws SAXException if writing to {@code xhtml} fails
 * @throws IOException declared for the embedded handlers; IOExceptions raised while
 *         processing an object's stream are recorded rather than propagated
 */
private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
    List<HSLFShape> slideShapes;
    try {
        slideShapes = slide.getShapes();
    } catch (NullPointerException npe) {
        // Sometimes HSLF hits problems
        // Please open POI bugs for any you come across!
        EmbeddedDocumentUtil.recordEmbeddedStreamException(npe, parentMetadata);
        return;
    }

    for (HSLFShape candidate : slideShapes) {
        // Only OLE shapes can carry an embedded object
        if (!(candidate instanceof OLEShape)) {
            continue;
        }
        OLEShape ole = (OLEShape) candidate;

        HSLFObjectData objectData;
        try {
            objectData = ole.getObjectData();
        } catch (NullPointerException npe) {
            /* getObjectData throws NPE some times. */
            EmbeddedDocumentUtil.recordEmbeddedStreamException(npe, parentMetadata);
            continue;
        }
        if (objectData == null) {
            continue;
        }

        String objectId = Integer.toString(ole.getObjectID());

        // Embedded Object: add a <div
        // class="embedded" id="X"/> so consumer can see where
        // in the main text each embedded document
        // occurred:
        AttributesImpl divAttrs = new AttributesImpl();
        divAttrs.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttrs.addAttribute("", "id", "id", "CDATA", objectId);
        xhtml.startElement("div", divAttrs);
        xhtml.endElement("div");

        InputStream rawData;
        try {
            rawData = objectData.getData();
        } catch (Exception ex) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(ex, parentMetadata);
            continue;
        }

        try (TikaInputStream tis = TikaInputStream.get(rawData)) {
            // Excel charts are recognised by their ProgID; everything else is detected
            String detectedType;
            if ("Excel.Chart.8".equals(ole.getProgID())) {
                detectedType = "application/vnd.ms-excel";
            } else {
                MediaType detected = getTikaConfig().getDetector().detect(tis, new Metadata());
                detectedType = detected.toString();
            }

            boolean isCompObj = detectedType.equals("application/x-tika-msoffice-embedded; format=comp_obj");
            if (!isCompObj) {
                handleEmbeddedResource(tis, objectId, objectId, detectedType, xhtml, false);
            } else {
                // Shield the stream so closing the file system does not close tis early
                try (NPOIFSFileSystem fileSystem = new NPOIFSFileSystem(new CloseShieldInputStream(tis))) {
                    handleEmbeddedOfficeDoc(fileSystem.getRoot(), objectId, xhtml);
                }
            }
        } catch (IOException ioe) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(ioe, parentMetadata);
        }
    }
}
Aggregations