use of org.apache.tika.exception.TikaException in project tika by apache.
the class MatParser method parse.
/**
 * Parses a MATLAB .mat file: records details from the file header as metadata
 * and writes every variable (plus the fields of structure variables) as XHTML.
 *
 * <p>Metadata keys written: content type, and, when present in the header
 * description, {@code createdOn}, {@code platform} and {@code fileType};
 * {@code endian} is always written.
 *
 * @param stream   the .mat document; wrapped in a TikaInputStream so it can be read as a file
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata metadata object that is populated from the file header
 * @param context  parse context; not used by this implementation
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException if reading the file fails with an {@link IOException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //Set MIME type as Matlab
    metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
    // Only own (and later dispose) temporary resources when the caller did not
    // already hand us a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    try {
        // Use TIS so we can spool a temp file for parsing.
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        //Extract information from header file
        //input .mat file
        MatFileReader mfr = new MatFileReader(tis.getFile());
        //.mat header information
        MatFileHeader hdr = mfr.getMatFileHeader();
        // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"
        // Break header information into its parts
        String[] parts = hdr.getDescription().split(",");
        // A description with fewer than three comma-separated fields must not
        // abort the parse, so every index is guarded by a length check.
        if (parts.length > 2 && parts[2].contains("Created")) {
            int lastIndex1 = parts[2].lastIndexOf("Created on:");
            String dateCreated = parts[2].substring(lastIndex1 + "Created on:".length()).trim();
            metadata.set("createdOn", dateCreated);
        }
        if (parts.length > 1 && parts[1].contains("Platform")) {
            int lastIndex2 = parts[1].lastIndexOf("Platform:");
            String platform = parts[1].substring(lastIndex2 + "Platform:".length()).trim();
            metadata.set("platform", platform);
        }
        if (parts.length > 0 && parts[0].contains("MATLAB")) {
            metadata.set("fileType", parts[0]);
        }
        // Get endian indicator from header file: the raw indicator bytes decoded as text
        String endianCode = new String(hdr.getEndianIndicator(), UTF_8);
        metadata.set("endian", endianCode);
        //Text output
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        //Loop through each variable
        for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
            String varName = entry.getKey();
            MLArray varData = entry.getValue();
            xhtml.element("p", varName + ":" + String.valueOf(varData));
            // If the variable is a structure, extract variable info from structure
            if (varData.isStruct()) {
                MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
                xhtml.startElement("ul");
                xhtml.newline();
                for (MLArray element : mlStructure.getAllFields()) {
                    xhtml.startElement("li");
                    xhtml.characters(String.valueOf(element));
                    // If there is an embedded structure, extract variable info.
                    // Only one level of nesting is expanded here.
                    if (element.isStruct()) {
                        xhtml.startElement("ul");
                        // Should this actually be a recursive call?
                        xhtml.element("li", element.contentToString());
                        xhtml.endElement("ul");
                    }
                    xhtml.endElement("li");
                }
                xhtml.endElement("ul");
            }
        }
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("Error parsing Matlab file with MatParser", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class OutlookExtractor method parse.
/**
 * Extracts an Outlook message: copies subject, headers, recipients and dates
 * into {@code metadata}, then writes the subject, the From/To/Cc/Bcc details,
 * the message body and every attachment to {@code xhtml}.
 *
 * <p>For the body, the HTML chunk is preferred, then the compressed RTF chunk,
 * then the plain text chunk; only the first one that is available is written.
 *
 * @param xhtml    handler that receives the generated XHTML
 * @param metadata metadata object populated from the message
 * @throws TikaException if a {@code ChunkNotFoundException} escapes even though
 *                       the message was told to return null on missing chunks
 * @throws SAXException  if the content handler rejects an event
 * @throws IOException   if reading the body or an attachment fails
 */
public void parse(XHTMLContentHandler xhtml, Metadata metadata) throws TikaException, SAXException, IOException {
    try {
        msg.setReturnNullOnMissingChunk(true);
        try {
            metadata.set(Office.MAPI_MESSAGE_CLASS, getMessageClass(msg.getMessageClass()));
        } catch (ChunkNotFoundException ignored) {
            // Best effort: without the chunk the message class is simply not recorded.
        }
        // If the message has strings stored in a 7-bit encoding rather than
        // as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }
        // Start with the metadata
        String subject = msg.getSubject();
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());
        String from = msg.getDisplayFrom();
        handleFromTo(headers, metadata);
        metadata.set(TikaCoreProperties.TITLE, subject);
        // TODO: Move to description in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, msg.getConversationTopic());
        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException ignored) {
            // Best effort: no recipient chunk means no recipient addresses to add.
        }
        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            // Otherwise fall back to the first header whose key starts with "date:"
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();
                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MboxParser.parseDate(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (ParseException e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        break;
                    }
                }
            }
        }
        xhtml.element("h1", subject);
        // Output the from and to details in text, as you
        // often want them in text form for searching
        xhtml.startElement("dl");
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: the "Recipients" entry is omitted when the chunk is missing.
        }
        xhtml.endElement("dl");
        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }
        boolean doneBody = false;
        xhtml.startElement("div", "class", "message-body");
        if (htmlChunk != null) {
            // The HTML body may be stored either as bytes or as a string chunk
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                // Reuse a parser already configured in the context, if there is one
                Parser htmlParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new HtmlParser();
                }
                htmlParser.parse(new ByteArrayInputStream(data), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
                doneBody = true;
            }
        }
        if (rtfChunk != null && !doneBody) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf = new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue());
            Parser rtfParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
            if (rtfParser == null) {
                rtfParser = new RTFParser();
            }
            rtfParser.parse(new ByteArrayInputStream(rtf.getData()), new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext);
            doneBody = true;
        }
        if (textChunk != null && !doneBody) {
            xhtml.element("p", ((StringChunk) textChunk).getValue());
        }
        xhtml.endElement("div");
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            xhtml.startElement("div", "class", "attachment-entry");
            // Prefer the long file name; fall back to the short one
            String filename = null;
            if (attachment.getAttachLongFileName() != null) {
                filename = attachment.getAttachLongFileName().getValue();
            } else if (attachment.getAttachFileName() != null) {
                filename = attachment.getAttachFileName().getValue();
            }
            if (filename != null && filename.length() > 0) {
                xhtml.element("h1", filename);
            }
            if (attachment.getAttachData() != null) {
                // try-with-resources guarantees the attachment stream is released
                // even when handling the embedded resource throws.
                try (TikaInputStream attachmentStream = TikaInputStream.get(attachment.getAttachData().getValue())) {
                    handleEmbeddedResource(attachmentStream, filename, null, null, xhtml, true);
                }
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), xhtml);
            }
            xhtml.endElement("div");
        }
    } catch (ChunkNotFoundException e) {
        throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
    } finally {
        //You'd think you'd want to call msg.close().
        //Don't do that. That closes down the file system.
        //If an msg has multiple msg attachments, some of them
        //can reside in the same file system. After the first
        //child is read, the fs is closed, and the other children
        //get a java.nio.channels.ClosedChannelException
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class HSLFExtractor method handleSlideEmbeddedPictures.
/**
 * Hands every picture stored in the slideshow to the embedded-resource handler.
 *
 * <p>EMF, WMF and DIB pictures get a fixed media type; every other picture type
 * uses the content type reported by the picture itself. A picture whose data
 * cannot be read is recorded against the parent metadata and skipped.
 *
 * @param slideshow the slideshow whose picture data is processed
 * @param xhtml     handler passed on to the embedded-resource handling
 */
private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
    for (HSLFPictureData picture : slideshow.getPictureData()) {
        // Work out the media type first
        String contentType;
        switch (picture.getType()) {
            case EMF:
                contentType = "image/emf";
                break;
            case WMF:
                contentType = "image/wmf";
                break;
            case DIB:
                contentType = "image/bmp";
                break;
            default:
                contentType = picture.getContentType();
                break;
        }

        // Reading the raw bytes may fail for a broken picture; note it and move on
        byte[] bytes;
        try {
            bytes = picture.getData();
        } catch (Exception e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            continue;
        }

        try (TikaInputStream pictureStream = TikaInputStream.get(bytes)) {
            handleEmbeddedResource(pictureStream, null, null, contentType, xhtml, false);
        }
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class SXSLFPowerPointExtractorDecorator method handleSlidePart.
/**
 * Writes one slide: the slide's own content inside a "slide-content" div,
 * followed by its related layout, notes, notes-master and comments parts.
 *
 * <p>A {@code TikaException} raised while parsing the slide content is recorded
 * as a warning in the metadata rather than rethrown.
 *
 * @param slidePart the package part holding the slide
 * @param xhtml     handler that receives the generated XHTML
 */
private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException {
    final Map<String, String> relationships = loadLinkedRelationships(slidePart, false, metadata);
    // Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);

    // The slide body itself
    xhtml.startElement("div", "class", "slide-content");
    try (InputStream slideStream = slidePart.getInputStream()) {
        context.getSAXParser().parse(
                new CloseShieldInputStream(slideStream),
                new OfflineContentHandler(new EmbeddedContentHandler(newSlideTextHandler(xhtml, relationships))));
    } catch (TikaException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    xhtml.endElement("div");

    // Parts hanging off the slide, each with a fresh text handler
    handleBasicRelatedParts(
            XSLFRelation.SLIDE_LAYOUT.getRelation(),
            "slide-master-content",
            slidePart,
            new PlaceHolderSkipper(newSlideTextHandler(xhtml, relationships)));
    handleBasicRelatedParts(
            XSLFRelation.NOTES.getRelation(),
            "slide-notes",
            slidePart,
            newSlideTextHandler(xhtml, relationships));
    handleBasicRelatedParts(
            XSLFRelation.NOTES_MASTER.getRelation(),
            "slide-notes-master",
            slidePart,
            newSlideTextHandler(xhtml, relationships));
    handleBasicRelatedParts(
            XSLFRelation.COMMENTS.getRelation(),
            null,
            slidePart,
            new XSLFCommentsHandler(xhtml));
}

/** Builds a new text handler that writes to {@code xhtml} using the given linked relationships. */
private OOXMLWordAndPowerPointTextHandler newSlideTextHandler(XHTMLContentHandler xhtml, Map<String, String> relationships) {
    return new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), relationships);
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class SXSLFPowerPointExtractorDecorator method handleBasicRelatedParts.
/**
 * This should handle the comments, master, notes, etc.
 *
 * <p>Looks up the parts related to {@code parentPart} by relationship type and,
 * when at least one exists, parses each of them into {@code contentHandler}
 * inside a single wrapping {@code div}. Format, I/O and Tika errors for an
 * individual part are recorded as warnings in the metadata and do not stop
 * the remaining parts from being processed.
 *
 * @param contentType     relationship type used to look up the related parts
 * @param xhtmlClassLabel value for the wrapping div's {@code class} attribute;
 *                        may be {@code null}, in which case no class attribute is written
 * @param parentPart      part whose related parts are processed
 * @param contentHandler  handler that receives the wrapping div and the parsed content
 * @throws SAXException if the handler rejects an event or a related part fails to parse
 */
private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel, PackagePart parentPart, ContentHandler contentHandler) throws SAXException {
    PackageRelationshipCollection relatedPartPRC = null;
    try {
        relatedPartPRC = parentPart.getRelationshipsByType(contentType);
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    if (relatedPartPRC != null && relatedPartPRC.size() > 0) {
        AttributesImpl attributes = new AttributesImpl();
        // Callers may pass a null label; a null attribute value is not a valid
        // SAX attribute, so the class attribute is only added when a label exists.
        if (xhtmlClassLabel != null) {
            attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
        }
        contentHandler.startElement("", "div", "div", attributes);
        for (int i = 0; i < relatedPartPRC.size(); i++) {
            PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i);
            try {
                PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
                try (InputStream stream = relatedPartPart.getInputStream()) {
                    context.getSAXParser().parse(stream, new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
                } catch (IOException | TikaException e) {
                    metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
                }
            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
        }
        contentHandler.endElement("", "div", "div");
    }
}
Aggregations