use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class OutlookPSTParser method parse.
/**
 * Parses an Outlook PST file and emits its folder contents as XHTML.
 * <p>
 * The content type, the length of the PST file handle and an "isValid" flag
 * are recorded in the metadata; the root folder is only walked when the
 * underlying file descriptor reports itself as valid.
 *
 * @param stream   the PST input, accessed through a TikaInputStream as a file path
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata updated with CONTENT_TYPE, CONTENT_LENGTH and "isValid"
 * @param context  used to look up the embedded document extractor
 * @throws TikaException wrapping any exception raised while reading the PST file
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Embedded items are handed to the extractor configured in the context
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    TikaInputStream tis = TikaInputStream.get(stream);
    PSTFile pst = null;
    try {
        // PSTFile is opened by path, so the stream is resolved to a file first
        String path = tis.getFile().getPath();
        pst = new PSTFile(path);
        metadata.set(Metadata.CONTENT_LENGTH, valueOf(pst.getFileHandle().length()));

        boolean valid = pst.getFileHandle().getFD().valid();
        metadata.set("isValid", valueOf(valid));
        if (valid) {
            parseFolder(xhtml, pst.getRootFolder(), extractor);
        }
    } catch (Exception e) {
        throw new TikaException(e.getMessage(), e);
    } finally {
        // Always release the file handle; a failure while closing is deliberately ignored
        if (pst != null) {
            if (pst.getFileHandle() != null) {
                try {
                    pst.getFileHandle().close();
                } catch (IOException ignored) {
                    //swallow closing exception
                }
            }
        }
    }
    xhtml.endDocument();
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class EMFParser method parse.
/**
 * Converts the records of an EMF stream into XHTML.
 * <p>
 * Text records that share the same Y coordinate are accumulated into one
 * paragraph; a change in Y flushes the accumulated text as a {@code <p>}
 * element. A single space is inserted when the X coordinate advances by more
 * than a fixed threshold. Public comment records holding multi-format or
 * Windows-metafile payloads are passed on to {@code handleMultiFormats} /
 * {@code handleWMF} together with the embedded document extractor.
 *
 * @param stream   the EMF input
 * @param handler  receiver of the generated XHTML SAX events
 * @param metadata passed to the XHTML content handler
 * @param context  used to look up the embedded document extractor on demand
 * @throws TikaException wrapping any runtime exception raised while reading records
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Resolved lazily: only needed once an embedded comment record shows up
    EmbeddedDocumentExtractor extractor = null;
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        HemfExtractor records = new HemfExtractor(stream);
        long prevY = -1;
        long prevX = -1;
        //derive this from the font or frame/bounds information
        long maxGapX = 1000;
        StringBuilder line = new StringBuilder();

        for (HemfRecord record : records) {
            if (record.getRecordType() == HemfRecordType.comment) {
                AbstractHemfComment comment = ((HemfCommentRecord) record).getComment();
                boolean multiFormats = comment instanceof HemfCommentPublic.MultiFormats;
                if (multiFormats || comment instanceof HemfCommentPublic.WindowsMetafile) {
                    if (extractor == null) {
                        extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                    }
                    if (multiFormats) {
                        handleMultiFormats((HemfCommentPublic.MultiFormats) comment, xhtml, extractor);
                    } else {
                        handleWMF((HemfCommentPublic.WindowsMetafile) comment, xhtml, extractor);
                    }
                }
                continue;
            }
            if (!record.getRecordType().equals(HemfRecordType.exttextoutw)) {
                // Only comment and text-out records are of interest
                continue;
            }

            HemfText.ExtTextOutW textRecord = (HemfText.ExtTextOutW) record;
            // A different Y coordinate ends the current line: flush it as a paragraph
            if (prevY > -1 && prevY != textRecord.getY()) {
                xhtml.startElement("p");
                xhtml.characters(line.toString());
                xhtml.endElement("p");
                line.setLength(0);
                prevX = -1;
            }
            // A large horizontal jump on the same line is rendered as a space
            if (prevX > -1 && textRecord.getX() - prevX > maxGapX) {
                line.append(" ");
            }
            line.append(textRecord.getText());
            prevY = textRecord.getY();
            prevX = textRecord.getX();
        }

        // Flush whatever text is still pending after the last record
        if (line.length() > 0) {
            xhtml.startElement("p");
            xhtml.characters(line.toString());
            xhtml.endElement("p");
        }
    } catch (RecordFormatException e) {
        //POI's hemfparser can throw these for "parse exceptions"
        throw new TikaException(e.getMessage(), e);
    } catch (RuntimeException e) {
        //convert Runtime to RecordFormatExceptions
        throw new TikaException(e.getMessage(), e);
    }
    xhtml.endDocument();
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class TNEFParser method parse.
/**
 * Extracts the subject, the compressed-RTF message body and all attachments
 * from a TNEF (HMEF) message stream, passing the body and each attachment to
 * {@code handleEmbedded}.
 *
 * @param stream   the message input
 * @param handler  passed through to {@code handleEmbedded}
 * @param metadata receives the message subject when one is present
 * @param context  used to look up the embedded document extractor
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Body and attachments are processed recursively through this extractor
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

    // POI does the actual decoding of the message
    HMEFMessage message = new HMEFMessage(stream);

    String subject = message.getSubject();
    if (subject != null && !subject.isEmpty()) {
        // TODO: Move to title in Tika 2.0
        metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, subject);
    }

    // The message body, when stored as compressed RTF, is treated as an embedded document
    MAPIAttribute bodyAttribute = message.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
    if (bodyAttribute instanceof MAPIRtfAttribute) {
        handleEmbedded("message.rtf", "application/rtf", ((MAPIRtfAttribute) bodyAttribute).getData(), extractor, handler);
    }

    for (Attachment attachment : message.getAttachments()) {
        // Name preference: long filename, then short filename, then "unknown" + extension
        String label = attachment.getLongFilename();
        boolean unnamed = label == null || label.isEmpty();
        if (unnamed) {
            label = attachment.getFilename();
            unnamed = label == null || label.isEmpty();
        }
        if (unnamed) {
            String extension = attachment.getExtension();
            if (extension != null) {
                label = "unknown" + extension;
            }
        }
        handleEmbedded(label, null, attachment.getContents(), extractor, handler);
    }
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class TSDParser method parseTSDContent.
/**
 * Passes the content carried by a time-stamped-data stream to the embedded
 * document extractor, provided the extractor agrees to parse it for the
 * given metadata.
 * <p>
 * This is best-effort: any exception raised while reading or parsing the
 * content is logged and not propagated. The CMS parser is always handed to
 * {@code closeCMSParser}, whether or not it was successfully created.
 *
 * @param stream   the time-stamped-data input
 * @param handler  receiver of the embedded document's SAX events
 * @param metadata consulted by the extractor and passed on with the content
 * @param context  used to look up the embedded document extractor
 */
private void parseTSDContent(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) {
    EmbeddedDocumentExtractor edx = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    if (!edx.shouldParseEmbedded(metadata)) {
        return;
    }

    CMSTimeStampedDataParser cmsTimeStampedDataParser = null;
    try {
        cmsTimeStampedDataParser = new CMSTimeStampedDataParser(stream);
        try (InputStream is = TikaInputStream.get(cmsTimeStampedDataParser.getContent())) {
            edx.parseEmbedded(is, handler, metadata, false);
        }
    } catch (Exception ex) {
        // Deliberately not rethrown, but the exception itself is passed as the
        // trailing argument so the stack trace and cause are logged as well,
        // rather than only the (possibly null) message.
        LOG.error("Error in TSDParser.parseTSDContent {}", ex.getMessage(), ex);
    } finally {
        this.closeCMSParser(cmsTimeStampedDataParser);
    }
}
use of org.apache.tika.extractor.EmbeddedDocumentExtractor in project tika by apache.
the class MockParser method getEmbeddedDocumentExtractor.
/**
 * Returns the embedded document extractor registered in the context, or a new
 * {@code ParsingEmbeddedDocumentExtractor} when none is registered.
 * <p>
 * In the fallback case, a {@code MockParser} is registered in the context as
 * the {@code Parser} if the context does not already hold one.
 *
 * @param context the parse context to consult (and possibly update)
 * @return the extractor to use for embedded documents
 */
protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
    EmbeddedDocumentExtractor configured = context.get(EmbeddedDocumentExtractor.class);
    if (configured != null) {
        return configured;
    }
    // Register a parser before building the fallback extractor from the context
    if (context.get(Parser.class) == null) {
        context.set(Parser.class, new MockParser());
    }
    return new ParsingEmbeddedDocumentExtractor(context);
}
Aggregations