use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class MailContentHandler method body.
/**
 * Handles one body part of the message by handing it to the embedded
 * document extractor, if the extractor wants it.
 *
 * @param body descriptor supplying the part's MIME type and charset
 * @param is   stream over the part's content
 * @throws MimeException if the embedded parse raises a SAXException
 * @throws IOException   if reading the part fails
 */
public void body(BodyDescriptor body, InputStream is) throws MimeException, IOException {
    // The sub part gets its own metadata object so that recording its
    // mime type and encoding does not damage the main metadata
    Metadata partMetadata = new Metadata();
    partMetadata.set(Metadata.CONTENT_TYPE, body.getMimeType());
    partMetadata.set(Metadata.CONTENT_ENCODING, body.getCharset());
    try {
        if (!extractor.shouldParseEmbedded(partMetadata)) {
            return;
        }
        // The James provided InputStream misses many features we might
        // want (eg mark/reset), so wrap it before passing it on
        TikaInputStream wrapped = TikaInputStream.get(is);
        extractor.parseEmbedded(wrapped, handler, partMetadata, false);
    } catch (SAXException saxFailure) {
        throw new MimeException(saxFailure);
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class RFC822Parser method parse.
/**
 * Parses an RFC 822 email message with mime4j, routing the parse events
 * through a MailContentHandler that writes to the given handler and metadata.
 *
 * @param stream   the raw message
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata of the message being parsed
 * @param context  parse context; may carry a MimeConfig overriding the defaults
 * @throws IOException   if the failure originated in the input stream itself
 * @throws SAXException  if one was wrapped inside a MimeException
 * @throws TikaException for any other parse failure
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Default mime4j configuration, used unless the context supplies one
    MimeConfig defaults = new MimeConfig();
    defaults.setMaxLineLen(100000);
    // max length of any individual header
    defaults.setMaxHeaderLen(100000);
    MimeConfig config = context.get(MimeConfig.class, defaults);

    MimeStreamParser parser = new MimeStreamParser(config);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    parser.setContentHandler(new MailContentHandler(xhtml, metadata, context, config.isStrictParsing()));
    parser.setContentDecoding(true);

    TikaInputStream tagged = TikaInputStream.get(stream);
    try {
        parser.parse(tagged);
    } catch (IOException ioFailure) {
        // Let the stream rethrow the exception if it was the source of it
        tagged.throwIfCauseOf(ioFailure);
        throw new TikaException("Failed to parse an email message", ioFailure);
    } catch (MimeException mimeFailure) {
        // Unwrap the exception in case it was not thrown by mime4j
        Throwable wrapped = mimeFailure.getCause();
        if (wrapped instanceof TikaException) {
            throw (TikaException) wrapped;
        }
        if (wrapped instanceof SAXException) {
            throw (SAXException) wrapped;
        }
        throw new TikaException("Failed to parse an email message", mimeFailure);
    }
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class MatParser method parse.
/**
 * Parses a MATLAB .mat file: records header information in the metadata
 * and writes each variable (and the fields of structure variables) as XHTML.
 *
 * <p>The header description is expected to look like
 * {@code "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"}.
 * Parts that are missing or do not carry the expected marker are skipped
 * rather than causing an index error.
 *
 * @param stream   the .mat file content
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata to populate (content type, createdOn, platform,
 *                 fileType, endian)
 * @param context  parse context (not used by this method)
 * @throws IOException   declared by the interface; IOExceptions raised while
 *                       parsing are rethrown wrapped in a TikaException
 * @throws SAXException  if writing the XHTML output fails
 * @throws TikaException if the file cannot be read or parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    //Set MIME type as Matlab
    metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
    // Only create (and later dispose of) temporary resources when the
    // caller did not already hand us a TikaInputStream
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    try {
        // Use TIS so we can spool a temp file for parsing.
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        //Extract information from header file
        //input .mat file
        MatFileReader mfr = new MatFileReader(tis.getFile());
        //.mat header information
        MatFileHeader hdr = mfr.getMatFileHeader();
        // Break header information into its comma separated parts.
        // split() may return fewer than three parts (or none at all),
        // so every access below is bounds-checked.
        String[] parts = hdr.getDescription().split(",");
        String dateCreated = headerValue(parts, 2, "Created on:");
        if (dateCreated != null) {
            metadata.set("createdOn", dateCreated);
        }
        String platform = headerValue(parts, 1, "Platform:");
        if (platform != null) {
            metadata.set("platform", platform);
        }
        if (parts.length > 0 && parts[0].contains("MATLAB")) {
            metadata.set("fileType", parts[0]);
        }
        // Get endian indicator from header file
        // Retrieve endian bytes and convert to string
        String endianBytes = new String(hdr.getEndianIndicator(), UTF_8);
        // Convert bytes to characters to string
        String endianCode = String.valueOf(endianBytes.toCharArray());
        metadata.set("endian", endianCode);
        //Text output
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.newline();
        //Loop through each variable
        for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
            String varName = entry.getKey();
            MLArray varData = entry.getValue();
            xhtml.element("p", varName + ":" + String.valueOf(varData));
            // If the variable is a structure, extract variable info from structure
            if (varData.isStruct()) {
                MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
                xhtml.startElement("ul");
                xhtml.newline();
                for (MLArray element : mlStructure.getAllFields()) {
                    xhtml.startElement("li");
                    xhtml.characters(String.valueOf(element));
                    // If there is an embedded structure, extract variable info.
                    if (element.isStruct()) {
                        xhtml.startElement("ul");
                        // Should this actually be a recursive call?
                        xhtml.element("li", element.contentToString());
                        xhtml.endElement("ul");
                    }
                    xhtml.endElement("li");
                }
                xhtml.endElement("ul");
            }
        }
        xhtml.endDocument();
    } catch (IOException e) {
        throw new TikaException("Error parsing Matlab file with MatParser", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}

/**
 * Returns the trimmed text following the last occurrence of {@code marker}
 * in {@code parts[index]}, or {@code null} when that part does not exist
 * or does not contain the marker.
 */
private static String headerValue(String[] parts, int index, String marker) {
    if (index >= parts.length) {
        return null;
    }
    // Search for the full marker (including the colon) so the offset used
    // for substring() is always valid
    int markerStart = parts[index].lastIndexOf(marker);
    if (markerStart < 0) {
        return null;
    }
    return parts[index].substring(markerStart + marker.length()).trim();
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class POIFSContainerDetector method detect.
/**
 * Detects the type of an OLE2 (POIFS) container from its top level entry names.
 *
 * @param input    the document stream, or null if there is none
 * @param metadata document metadata (not used by this method)
 * @return MediaType.OCTET_STREAM when there is no input or the stream does
 *         not start with the OLE header; otherwise the result of the
 *         name-based detection
 * @throws IOException if reading from or resetting the stream fails
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Without a document there is nothing to look at
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }

    // A TikaInputStream may already wrap a parsed NPOIFileSystem or
    // DirectoryNode; if so the names come straight from its root
    TikaInputStream tis = TikaInputStream.cast(input);
    Set<String> names = null;
    if (tis != null) {
        Object alreadyOpen = tis.getOpenContainer();
        if (alreadyOpen instanceof NPOIFSFileSystem) {
            names = getTopLevelNames(((NPOIFSFileSystem) alreadyOpen).getRoot());
        } else if (alreadyOpen instanceof DirectoryNode) {
            names = getTopLevelNames((DirectoryNode) alreadyOpen);
        }
    }

    if (names == null) {
        // Compare the first bytes against the OLE header, stopping at the
        // first mismatch; the stream is always rewound afterwards
        int[] oleHeader = {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1};
        input.mark(oleHeader.length);
        try {
            for (int expected : oleHeader) {
                if (input.read() != expected) {
                    return MediaType.OCTET_STREAM;
                }
            }
        } finally {
            input.reset();
        }
        // The exact type can only be detected when given a TikaInputStream:
        // look for known top level entry names
        if (tis != null) {
            names = getTopLevelNames(tis);
        }
    }

    // Detect based on the names (as available). The open container is
    // queried again here rather than reusing the earlier lookup.
    Object container = (tis == null) ? null : tis.getOpenContainer();
    if (container instanceof NPOIFSFileSystem) {
        return detect(names, ((NPOIFSFileSystem) container).getRoot());
    }
    return detect(names, null);
}
use of org.apache.tika.io.TikaInputStream in project tika by apache.
the class HSLFExtractor method handleSlideEmbeddedPictures.
/**
 * Passes every picture stored in the slide show to handleEmbeddedResource.
 * A picture whose data cannot be read is recorded against the parent
 * metadata and skipped; the remaining pictures are still processed.
 *
 * @param slideshow the slide show whose picture data is iterated
 * @param xhtml     handler passed through to handleEmbeddedResource
 * @throws TikaException if handling an embedded picture fails
 * @throws SAXException  if handling an embedded picture fails
 * @throws IOException   if handling an embedded picture or closing its stream fails
 */
private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
    for (HSLFPictureData picture : slideshow.getPictureData()) {
        // EMF, WMF and DIB get a fixed media type; every other picture
        // type reports its own content type
        String contentType;
        switch (picture.getType()) {
            case EMF:
                contentType = "image/emf";
                break;
            case WMF:
                contentType = "image/wmf";
                break;
            case DIB:
                contentType = "image/bmp";
                break;
            default:
                contentType = picture.getContentType();
                break;
        }

        byte[] bytes;
        try {
            bytes = picture.getData();
        } catch (Exception readFailure) {
            // Best effort: note the failure and move on to the next picture
            EmbeddedDocumentUtil.recordEmbeddedStreamException(readFailure, parentMetadata);
            continue;
        }

        try (TikaInputStream pictureStream = TikaInputStream.get(bytes)) {
            handleEmbeddedResource(pictureStream, null, null, contentType, xhtml, false);
        }
    }
}
Aggregations