use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class MboxParser method parse.
/**
 * Splits an mbox stream into its individual messages and passes each one to the
 * embedded document extractor as a {@code message/rfc822} document.
 * <p>
 * The stream is decoded as windows-1252. A line starting with
 * {@code MBOX_RECORD_DIVIDER} opens a new message; everything up to the next
 * divider line, the end of the stream, or the point where the buffered message
 * reaches {@code MAIL_MAX_SIZE} bytes belongs to that message. Lines starting with
 * a space or a tab are treated as continuations and are folded into the line that
 * precedes them before being handed to {@code saveHeaderInMetadata}.
 * <p>
 * An empty stream produces an empty document. A divider line that is the last
 * line of the stream does not produce a message.
 * <p>
 * NOTE(review): closing the reader also closes {@code stream}; confirm that
 * callers expect this.
 *
 * @param stream   the mbox content
 * @param handler  receives the XHTML SAX events
 * @param metadata receives the content type and content encoding of the mbox
 * @param context  used to look up the embedded document extractor
 * @throws IOException   if the stream cannot be read
 * @throws TikaException declared by the parser contract
 * @throws SAXException  if the handler rejects an event
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    String charsetName = "windows-1252";
    metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
    metadata.set(Metadata.CONTENT_ENCODING, charsetName);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    InputStreamReader isr = new InputStreamReader(stream, charsetName);
    try (BufferedReader reader = new BufferedReader(isr)) {
        String curLine = reader.readLine();
        int mailItem = 0;
        // Test for null before touching curLine: on an empty stream the very first
        // readLine() already returns null.
        while (curLine != null && !Thread.currentThread().isInterrupted()) {
            if (!curLine.startsWith(MBOX_RECORD_DIVIDER)) {
                // Not inside a message (or skipping the remainder of an oversized one).
                curLine = reader.readLine();
                continue;
            }
            Metadata mailMetadata = new Metadata();
            // Declared as LinkedList because the tail has to be removed when folding.
            LinkedList<String> multiline = new LinkedList<>();
            mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
            mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
            curLine = reader.readLine();
            if (curLine == null) {
                break;
            }
            ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
            do {
                boolean continuation = curLine.startsWith(" ") || curLine.startsWith("\t");
                if (continuation && !multiline.isEmpty()) {
                    // Fold into the most recently collected line, not the oldest one.
                    multiline.add(multiline.removeLast() + " " + curLine.trim());
                } else {
                    // Either a fresh line, or a continuation with nothing to attach to.
                    multiline.add(curLine);
                }
                message.write(curLine.getBytes(charsetName));
                message.write(0x0A);
                curLine = reader.readLine();
            } while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
            for (String item : multiline) {
                saveHeaderInMetadata(mailMetadata, item);
            }
            ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
            // Drop the growable buffer before the (possibly long) embedded parse.
            message = null;
            if (extractor.shouldParseEmbedded(mailMetadata)) {
                extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
            }
            if (tracking) {
                getTrackingMetadata().put(mailItem++, mailMetadata);
            }
        }
    }
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class OutlookPSTParser method parse.
/**
 * Opens the stream as an Outlook PST file and walks it from the root folder via
 * {@code parseFolder}, emitting XHTML to the given handler.
 * <p>
 * The content type, the length of the underlying file and an {@code "isValid"}
 * flag (whether the file descriptor is valid) are recorded in {@code metadata}.
 * Folders are only walked when the descriptor is valid. Any exception raised
 * while the PST is open is rethrown wrapped in a {@link TikaException}.
 *
 * @param stream   the PST content
 * @param handler  receives the XHTML SAX events
 * @param metadata receives content type, content length and the validity flag
 * @param context  used to look up the embedded document extractor
 * @throws IOException   declared by the parser contract
 * @throws SAXException  if starting or ending the document fails
 * @throws TikaException if anything fails while the PST file is being read
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Embedded items are delegated to whatever extractor the context provides.
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    TikaInputStream tis = TikaInputStream.get(stream);
    PSTFile pst = null;
    try {
        // PSTFile is opened by path, so the stream has to be available as a file.
        pst = new PSTFile(tis.getFile().getPath());
        metadata.set(Metadata.CONTENT_LENGTH, valueOf(pst.getFileHandle().length()));

        boolean descriptorValid = pst.getFileHandle().getFD().valid();
        metadata.set("isValid", valueOf(descriptorValid));
        if (descriptorValid) {
            parseFolder(xhtml, pst.getRootFolder(), extractor);
        }
    } catch (Exception e) {
        throw new TikaException(e.getMessage(), e);
    } finally {
        boolean hasOpenHandle = pst != null && pst.getFileHandle() != null;
        if (hasOpenHandle) {
            try {
                pst.getFileHandle().close();
            } catch (IOException ignored) {
                // Best effort: a failure while closing must not mask the real outcome.
            }
        }
    }

    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class EMFParser method parse.
/**
 * Extracts text and embedded content from an EMF stream.
 * <p>
 * Public comment records of the multi-format and Windows-metafile kinds are
 * passed to {@code handleMultiFormats} / {@code handleWMF}. Text from
 * {@code exttextoutw} records is accumulated and written as one {@code <p>}
 * element each time the Y coordinate changes, plus once more at the end for any
 * remaining text. A single space is inserted between two runs whose X coordinates
 * are more than 1000 units apart.
 *
 * @param stream   the EMF content
 * @param handler  receives the XHTML SAX events
 * @param metadata passed on to the XHTML handler
 * @param context  used to look up the embedded document extractor when needed
 * @throws IOException   declared by the parser contract
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if a runtime exception is raised while reading records
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // Looked up on first use only; plain-text EMFs never need it.
    EmbeddedDocumentExtractor embeddedExtractor = null;
    try {
        HemfExtractor records = new HemfExtractor(stream);
        // Horizontal gap above which two text runs get a separating space.
        // Ideally derived from the font or frame/bounds information.
        final long maxGapX = 1000;
        long prevX = -1;
        long prevY = -1;
        StringBuilder line = new StringBuilder();

        for (HemfRecord rec : records) {
            if (rec.getRecordType() == HemfRecordType.comment) {
                AbstractHemfComment comment = ((HemfCommentRecord) rec).getComment();
                boolean isMultiFormats = comment instanceof HemfCommentPublic.MultiFormats;
                boolean isWmf = !isMultiFormats && comment instanceof HemfCommentPublic.WindowsMetafile;
                if (isMultiFormats || isWmf) {
                    if (embeddedExtractor == null) {
                        embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                    }
                    if (isMultiFormats) {
                        handleMultiFormats((HemfCommentPublic.MultiFormats) comment, xhtml, embeddedExtractor);
                    } else {
                        handleWMF((HemfCommentPublic.WindowsMetafile) comment, xhtml, embeddedExtractor);
                    }
                }
                continue;
            }
            if (!rec.getRecordType().equals(HemfRecordType.exttextoutw)) {
                continue;
            }

            HemfText.ExtTextOutW textRecord = (HemfText.ExtTextOutW) rec;
            long y = textRecord.getY();
            long x = textRecord.getX();

            // A different Y means a new line: flush what has been gathered so far.
            boolean lineChanged = prevY > -1 && prevY != y;
            if (lineChanged) {
                xhtml.startElement("p");
                xhtml.characters(line.toString());
                xhtml.endElement("p");
                line.setLength(0);
                prevX = -1;
            }
            if (prevX > -1 && x - prevX > maxGapX) {
                line.append(" ");
            }
            line.append(textRecord.getText());
            prevY = y;
            prevX = x;
        }

        // Flush the final line, if any.
        if (line.length() > 0) {
            xhtml.startElement("p");
            xhtml.characters(line.toString());
            xhtml.endElement("p");
        }
    } catch (RecordFormatException e) {
        // Thrown by POI's HEMF reader for malformed input.
        throw new TikaException(e.getMessage(), e);
    } catch (RuntimeException e) {
        // Any other runtime failure is reported as a TikaException as well.
        throw new TikaException(e.getMessage(), e);
    }
    xhtml.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class AbstractOOXMLExtractor method getXHTML.
/**
 * Writes the complete XHTML document for this extractor: the main content
 * produced by {@code buildXHTML}, followed by the embedded parts and then the
 * thumbnail.
 * <p>
 * The main content is written through an {@link XHTMLContentHandler} wrapped
 * around {@code handler}; the embedded parts and the thumbnail are handed the
 * original {@code handler} directly.
 *
 * @param handler  receives the SAX events
 * @param metadata passed to the XHTML wrapper and to the embedded-parts handling
 * @param context  the parse context (not read by this method)
 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler, Metadata, ParseContext)
 */
public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException, XmlException, IOException, TikaException {
    XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);

    document.startDocument();
    // 1. main document content
    buildXHTML(document);
    // 2. embedded parts
    handleEmbeddedParts(handler, metadata);
    // 3. thumbnail
    handleThumbnail(handler);
    document.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class WMFParser method parse.
/**
 * Extracts the text of a WMF picture, writing the text of every
 * {@code textOut} / {@code extTextOut} record as its own {@code <p>} element.
 * <p>
 * Text is decoded with the charset of the most recent {@code createFontIndirect}
 * record seen so far, falling back to CP-1252 when no font record has been seen
 * or the font does not specify a charset.
 *
 * @param stream   the WMF content
 * @param handler  receives the XHTML SAX events
 * @param metadata passed on to the XHTML handler
 * @param context  the parse context (not read by this method)
 * @throws IOException   if the picture cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if a runtime exception or assertion error is raised while
 *                       reading records
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        HwmfPicture picture = new HwmfPicture(stream);
        // Must live outside the loop: the font is selected by one record and the
        // text arrives in later records, so the charset has to survive iterations.
        Charset charset = LocaleUtil.CHARSET_1252;
        for (HwmfRecord record : picture.getRecords()) {
            //This fix should be done within POI
            if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
                HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont();
                charset = (font.getCharSet() == null || font.getCharSet().getCharset() == null) ? LocaleUtil.CHARSET_1252 : font.getCharSet().getCharset();
            }
            if (record.getRecordType().equals(HwmfRecordType.extTextOut)) {
                HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record;
                xhtml.startElement("p");
                xhtml.characters(textOut.getText(charset));
                xhtml.endElement("p");
            } else if (record.getRecordType().equals(HwmfRecordType.textOut)) {
                HwmfText.WmfTextOut textOut = (HwmfText.WmfTextOut) record;
                xhtml.startElement("p");
                xhtml.characters(textOut.getText(charset));
                xhtml.endElement("p");
            }
        }
    } catch (RecordFormatException e) {
        //POI's hwmfparser can throw these for "parse exceptions"
        throw new TikaException(e.getMessage(), e);
    } catch (RuntimeException e) {
        //report any other runtime failure as a TikaException
        throw new TikaException(e.getMessage(), e);
    } catch (AssertionError e) {
        //POI's hwmfparser can throw these for parse exceptions
        throw new TikaException(e.getMessage(), e);
    }
    xhtml.endDocument();
}
Aggregations